You are viewing a plain text version of this content. The canonical link for it is here.

Posted to commits@tika.apache.org by ta...@apache.org on 2020/06/22 16:48:21 UTC

[tika] branch branch_1x updated (d7480a9 -> 5fad37e)

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a change to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git.


    from d7480a9  TIKA-3120 -- remove whitelist/blacklist
     new e61ead6  TIKA-3122 -- extract some image metadata without rendering images
     new 5fad37e  TIKA-3122 -- merge conflict

The 2 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 .../tika/exception/ZeroByteFileException.java      | 15 ++++++++++
 .../apache/tika/parser/RecursiveParserWrapper.java |  6 +++-
 .../src/test/java/org/apache/tika/TikaTest.java    |  5 ++++
 .../tika/parser/pdf/ImageGraphicsEngine.java       | 34 ++++++++++++++++++++++
 .../java/org/apache/tika/parser/pdf/PDF2XHTML.java |  3 +-
 .../java/org/apache/tika/parser/pdf/PDFParser.java |  5 ++++
 .../apache/tika/parser/pdf/PDFParserConfig.java    | 31 ++++++++++++++++++++
 .../org/apache/tika/parser/pdf/PDFParserTest.java  | 19 ++++++++++--
 8 files changed, 114 insertions(+), 4 deletions(-)

[tika] 01/02: TIKA-3122 -- extract some image metadata without rendering images

Posted by ta...@apache.org.

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git

commit e61ead66bac80722c18ba70f9a2e1b2d229080f4
Author: tallison <ta...@apache.org>
AuthorDate: Mon Jun 22 12:17:33 2020 -0400

    TIKA-3122 -- extract some image metadata without rendering images
---
 .../tika/exception/ZeroByteFileException.java      | 15 ++++++++++
 .../apache/tika/parser/RecursiveParserWrapper.java |  6 +++-
 .../src/test/java/org/apache/tika/TikaTest.java    |  5 ++++
 .../tika/parser/pdf/ImageGraphicsEngine.java       | 34 ++++++++++++++++++++++
 .../java/org/apache/tika/parser/pdf/PDF2XHTML.java |  3 +-
 .../java/org/apache/tika/parser/pdf/PDFParser.java |  5 ++++
 .../apache/tika/parser/pdf/PDFParserConfig.java    | 31 ++++++++++++++++++++
 .../org/apache/tika/parser/pdf/PDFParserTest.java  | 19 ++++++++++--
 8 files changed, 114 insertions(+), 4 deletions(-)

diff --git a/tika-core/src/main/java/org/apache/tika/exception/ZeroByteFileException.java b/tika-core/src/main/java/org/apache/tika/exception/ZeroByteFileException.java
index 65e57e8..9232461 100644
--- a/tika-core/src/main/java/org/apache/tika/exception/ZeroByteFileException.java
+++ b/tika-core/src/main/java/org/apache/tika/exception/ZeroByteFileException.java
@@ -22,6 +22,21 @@ package org.apache.tika.exception;
  */
 public class ZeroByteFileException extends TikaException {
 
+
+    public static class IgnoreZeroByteFileException {}
+
+    //If this is in the parse context, the AutoDetectParser and the
+    //RecursiveParserWrapper should ignore zero byte files
+    //and not throw a ZeroByteFileException
+    /**
+     * If this is in the {@link org.apache.tika.parser.ParseContext}, the
+     * {@link org.apache.tika.parser.AutoDetectParser} and the
+     * {@link org.apache.tika.parser.RecursiveParserWrapper} will
+     * ignore embedded files with zero-byte length inputstreams
+     */
+    public static IgnoreZeroByteFileException IGNORE_ZERO_BYTE_FILE_EXCEPTION
+            = new IgnoreZeroByteFileException();
+
     public ZeroByteFileException(String msg) {
         super(msg);
     }
diff --git a/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java b/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
index 3f38e32..e9de9ba 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
@@ -19,6 +19,7 @@ package org.apache.tika.parser;
 
 import org.apache.tika.exception.CorruptedFileException;
 import org.apache.tika.exception.TikaException;
+import org.apache.tika.exception.ZeroByteFileException;
 import org.apache.tika.io.FilenameUtils;
 import org.apache.tika.io.TemporaryResources;
 import org.apache.tika.io.TikaInputStream;
@@ -399,7 +400,10 @@ public class RecursiveParserWrapper extends ParserDecorator {
             } catch(CorruptedFileException e) {
                 throw e;
             } catch (TikaException e) {
-                if (catchEmbeddedExceptions) {
+                if (context.get(ZeroByteFileException.IgnoreZeroByteFileException.class) != null
+                        && e instanceof ZeroByteFileException) {
+                    //do nothing
+                } else if (catchEmbeddedExceptions) {
                     ParserUtils.recordParserFailure(this, e, metadata);
                 } else {
                     throw e;
diff --git a/tika-core/src/test/java/org/apache/tika/TikaTest.java b/tika-core/src/test/java/org/apache/tika/TikaTest.java
index 5c50ea3..e21f752 100644
--- a/tika-core/src/test/java/org/apache/tika/TikaTest.java
+++ b/tika-core/src/test/java/org/apache/tika/TikaTest.java
@@ -246,6 +246,11 @@ public abstract class TikaTest {
         }
     }
 
+    protected List<Metadata> getRecursiveMetadata(Path path, ParseContext context, boolean suppressException) throws Exception {
+        try (TikaInputStream tis = TikaInputStream.get(path)) {
+            return getRecursiveMetadata(tis, AUTO_DETECT_PARSER, context, new Metadata(), suppressException);
+        }
+    }
     protected List<Metadata> getRecursiveMetadata(Path path, Parser parser, boolean suppressException) throws Exception {
         try (TikaInputStream tis = TikaInputStream.get(path)) {
             return getRecursiveMetadata(tis, parser, new ParseContext(), new Metadata(), suppressException);
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/ImageGraphicsEngine.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/ImageGraphicsEngine.java
index 95af12d..2e942f0 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/ImageGraphicsEngine.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/ImageGraphicsEngine.java
@@ -41,14 +41,19 @@ import org.apache.pdfbox.util.Matrix;
 import org.apache.pdfbox.util.Vector;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.exception.TikaMemoryLimitException;
+import org.apache.tika.exception.ZeroByteFileException;
 import org.apache.tika.extractor.EmbeddedDocumentExtractor;
 import org.apache.tika.extractor.EmbeddedDocumentUtil;
+import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
 import org.apache.tika.io.BoundedInputStream;
 import org.apache.tika.io.IOExceptionWithCause;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.EmptyParser;
 import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.RecursiveParserWrapper;
 import org.apache.tika.sax.EmbeddedContentHandler;
 import org.apache.tika.sax.XHTMLContentHandler;
 import org.xml.sax.SAXException;
@@ -97,6 +102,7 @@ class ImageGraphicsEngine extends PDFGraphicsStreamEngine {
     private final Metadata parentMetadata;
     private final XHTMLContentHandler xhtml;
     private final ParseContext parseContext;
+    private final boolean extractInlineImageMetadataOnly;
 
     //TODO: this is an embarrassment of an initializer...fix
     protected ImageGraphicsEngine(PDPage page, EmbeddedDocumentExtractor embeddedDocumentExtractor,
@@ -111,6 +117,7 @@ class ImageGraphicsEngine extends PDFGraphicsStreamEngine {
         this.xhtml = xhtml;
         this.parentMetadata = parentMetadata;
         this.parseContext = parseContext;
+        this.extractInlineImageMetadataOnly = pdfParserConfig.getExtractInlineImageMetadataOnly();
     }
 
     void run() throws IOException {
@@ -289,6 +296,11 @@ class ImageGraphicsEngine extends PDFGraphicsStreamEngine {
         metadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
                 TikaCoreProperties.EmbeddedResourceType.INLINE.toString());
 
+        if (extractInlineImageMetadataOnly) {
+            extractInlineImageMetadataOnly(pdImage, metadata);
+            return;
+        }
+
         if (embeddedDocumentExtractor.shouldParseEmbedded(metadata)) {
             ByteArrayOutputStream buffer = new ByteArrayOutputStream();
             if (pdImage instanceof PDImageXObject) {
@@ -315,6 +327,28 @@ class ImageGraphicsEngine extends PDFGraphicsStreamEngine {
 
     }
 
+    private void extractInlineImageMetadataOnly(PDImage pdImage, Metadata metadata) throws IOException, SAXException {
+        if (pdImage instanceof PDImageXObject) {
+            PDMetadataExtractor.extract(((PDImageXObject) pdImage).getMetadata(),
+                    metadata, parseContext);
+        }
+        metadata.set(Metadata.IMAGE_WIDTH, pdImage.getWidth());
+        metadata.set(Metadata.IMAGE_LENGTH, pdImage.getHeight());
+        //TODO: what else can we extract from the PDImage without rendering?
+        ZeroByteFileException.IgnoreZeroByteFileException before =
+                parseContext.get(ZeroByteFileException.IgnoreZeroByteFileException.class);
+        try {
+            parseContext.set(ZeroByteFileException.IgnoreZeroByteFileException.class,
+                    ZeroByteFileException.IGNORE_ZERO_BYTE_FILE_EXCEPTION);
+            embeddedDocumentExtractor.parseEmbedded(TikaInputStream.get(new byte[0]),
+                    new EmbeddedContentHandler(xhtml), metadata, false);
+        } finally {
+            //replace whatever was there before
+            parseContext.set(ZeroByteFileException.IgnoreZeroByteFileException.class,
+                    before);
+        }
+    }
+
     private String getSuffix(PDImage pdImage, Metadata metadata) throws IOException {
         String suffix = pdImage.getSuffix();
 
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
index 8c2f3f2..572087d 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
@@ -147,7 +147,8 @@ class PDF2XHTML extends AbstractPDF2XHTML {
     }
 
     void extractImages(PDPage page) throws SAXException, IOException {
-        if (config.getExtractInlineImages() == false) {
+        if (config.getExtractInlineImages() == false
+                && config.getExtractInlineImageMetadataOnly() == false) {
             return;
         }
 
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
index 6d8b5b1..3b36c99 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
@@ -524,6 +524,11 @@ public class PDFParser extends AbstractParser implements Initializable {
     }
 
     @Field
+    void setExtractInlineImageMetadataOnly(boolean extractInlineImageMetadataOnly) {
+        defaultConfig.setExtractInlineImageMetadataOnly(extractInlineImageMetadataOnly);
+    }
+
+    @Field
     void setAverageCharTolerance(float averageCharTolerance) {
         defaultConfig.setAverageCharTolerance(averageCharTolerance);
     }
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
index da8b309..9613781 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
@@ -105,6 +105,10 @@ public class PDFParserConfig implements Serializable {
     //True if inline PDXImage objects should be extracted
     private boolean extractInlineImages = false;
 
+    //True if inline images should only have their metadata
+    //extracted.
+    private boolean extractInlineImageMetadataOnly = false;
+
     //True if inline images (as identified by their object id within
     //a pdf file) should only be extracted once.
     private boolean extractUniqueInlineImagesOnly = true;
@@ -215,6 +219,10 @@ public class PDFParserConfig implements Serializable {
         setExtractUniqueInlineImagesOnly(
                 getBooleanProp(props.getProperty("extractUniqueInlineImagesOnly"),
                         getExtractUniqueInlineImagesOnly()));
+        setExtractInlineImageMetadataOnly(
+                getBooleanProp(props.getProperty("extractInlineImageMetadataOnly"),
+                        getExtractInlineImageMetadataOnly())
+        );
         setExtractFontNames(
                 getBooleanProp(props.getProperty("extractFontNames"),
                         getExtractFontNames()));
@@ -264,6 +272,29 @@ public class PDFParserConfig implements Serializable {
     }
 
     /**
+     * Use this when you want to know how many images of what formats are in a PDF
+     * but you don't need to render the images (e.g. for OCR).  This is far
+     * faster than {@link #extractInlineImages} because it doesn't have to render the
+     * images, which can be very slow.  This does not extract metadata from
+     * within each image; rather, it extracts the XMP that may be stored
+     * external to an image in PDImageXObjects.
+     *
+     * @param extractInlineImageMetadataOnly whether to extract only inline image metadata and not render the images
+     * @since 1.25
+     */
+    void setExtractInlineImageMetadataOnly(boolean extractInlineImageMetadataOnly) {
+        this.extractInlineImageMetadataOnly = extractInlineImageMetadataOnly;
+    }
+
+    /**
+     *
+     * @return whether or not to extract only inline image metadata and not render the images
+     */
+    boolean getExtractInlineImageMetadataOnly() {
+        return extractInlineImageMetadataOnly;
+    }
+
+    /**
      * If the PDF contains marked content, try to extract text and its marked structure.
      * If the PDF does not contain marked content, backoff to the regular PDF2XHTML for
      * text extraction.  As of 1.24, this is an "alpha" version.
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index f9cbffd..9e267dd 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -26,7 +26,6 @@ import static org.junit.Assert.fail;
 import static org.junit.Assume.assumeTrue;
 
 import java.io.InputStream;
-import java.nio.file.Paths;
 import java.util.Arrays;
 import java.util.HashMap;
 import java.util.HashSet;
@@ -47,6 +46,7 @@ import org.apache.tika.config.TikaConfig;
 import org.apache.tika.exception.AccessPermissionException;
 import org.apache.tika.exception.EncryptedDocumentException;
 import org.apache.tika.exception.TikaException;
+import org.apache.tika.exception.ZeroByteFileException;
 import org.apache.tika.extractor.ContainerExtractor;
 import org.apache.tika.extractor.DocumentSelector;
 import org.apache.tika.extractor.ParserContainerExtractor;
@@ -1596,6 +1596,22 @@ public class PDFParserTest extends TikaTest {
         assertEquals("Hewlett-Packard MFP", m.get(XMP.CREATOR_TOOL));
         assertEquals("1998-08-29T13:53:15Z", m.get(XMP.CREATE_DATE));
     }
+
+    @Test
+    public void testExtractInlineImageMetadata() throws Exception {
+        ParseContext context = new ParseContext();
+        PDFParserConfig config = new PDFParserConfig();
+        config.setExtractInlineImageMetadataOnly(true);
+        context.set(PDFParserConfig.class, config);
+        List<Metadata> metadataList = getRecursiveMetadata("testOCR.pdf", context);
+        assertNull(context.get(ZeroByteFileException.IgnoreZeroByteFileException.class));
+        assertEquals(2, metadataList.size());
+        assertEquals("image/png", metadataList.get(1).get(Metadata.CONTENT_TYPE));
+        assertEquals("/image0.png", metadataList.get(1).get(RecursiveParserWrapperHandler.EMBEDDED_RESOURCE_PATH));
+        assertEquals(261, (int)metadataList.get(1).getInt(Metadata.IMAGE_LENGTH));
+        assertEquals(934, (int)metadataList.get(1).getInt(Metadata.IMAGE_WIDTH));
+        assertEquals("image0.png", metadataList.get(1).get(TikaCoreProperties.RESOURCE_NAME_KEY));
+    }
     /**
      * Simple class to count end of document events.  If functionality is useful,
      * move to org.apache.tika in src/test
@@ -1625,5 +1641,4 @@ public class PDFParserTest extends TikaTest {
         }
     }
 
-
 }

[tika] 02/02: TIKA-3122 -- merge conflict

Posted by ta...@apache.org.

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 5fad37ed673af36e90e303b2dfa78ec6f692cb39
Author: tallison <ta...@apache.org>
AuthorDate: Mon Jun 22 12:48:05 2020 -0400

    TIKA-3122 -- merge conflict
---
 .../src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java         | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index 9e267dd..57b6eb2 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -1610,7 +1610,7 @@ public class PDFParserTest extends TikaTest {
         assertEquals("/image0.png", metadataList.get(1).get(RecursiveParserWrapperHandler.EMBEDDED_RESOURCE_PATH));
         assertEquals(261, (int)metadataList.get(1).getInt(Metadata.IMAGE_LENGTH));
         assertEquals(934, (int)metadataList.get(1).getInt(Metadata.IMAGE_WIDTH));
-        assertEquals("image0.png", metadataList.get(1).get(TikaCoreProperties.RESOURCE_NAME_KEY));
+        assertEquals("image0.png", metadataList.get(1).get(Metadata.RESOURCE_NAME_KEY));
     }
     /**
      * Simple class to count end of document events.  If functionality is useful,