You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2022/05/09 19:59:56 UTC

[tika] branch main updated: TIKA-3754 -- allow customization of ImageGraphicsEngine

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new 07a59ecd6 TIKA-3754 -- allow customization of ImageGraphicsEngine
07a59ecd6 is described below

commit 07a59ecd682c8584e0b8298e2140ada065e092ce
Author: tallison <ta...@apache.org>
AuthorDate: Mon May 9 15:59:43 2022 -0400

    TIKA-3754 -- allow customization of ImageGraphicsEngine
---
 .../java/org/apache/tika/parser/pdf/PDF2XHTML.java |  4 +-
 .../java/org/apache/tika/parser/pdf/PDFParser.java |  6 ++
 .../apache/tika/parser/pdf/PDFParserConfig.java    | 20 ++++++-
 .../tika/parser/pdf/PDMetadataExtractor.java       |  4 +-
 .../pdf/{ => image}/ImageGraphicsEngine.java       | 10 ++--
 .../pdf/image/ImageGraphicsEngineFactory.java      | 42 ++++++++++++++
 .../pdf/MyCustomImageGraphicsEngineFactory.java    | 40 +++++++++++++
 .../org/apache/tika/parser/pdf/PDFParserTest.java  | 67 ++++++++++++++--------
 .../pdf/tika-config-custom-graphics-engine.xml     | 28 +++++++++
 9 files changed, 190 insertions(+), 31 deletions(-)

diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
index 93d1b7e81..514a7f9dd 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
@@ -40,6 +40,7 @@ import org.xml.sax.SAXException;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.pdf.image.ImageGraphicsEngine;
 
 /**
  * Utility class that overrides the {@link PDFTextStripper} functionality
@@ -154,7 +155,8 @@ class PDF2XHTML extends AbstractPDF2XHTML {
         }
 
         ImageGraphicsEngine engine =
-                new ImageGraphicsEngine(page, embeddedDocumentExtractor, config,
+                config.getImageGraphicsEngineFactory().newEngine(
+                        page, embeddedDocumentExtractor, config,
                         processedInlineImages, inlineImageCounter, xhtml, metadata, context);
         engine.run();
         List<IOException> engineExceptions = engine.getExceptions();
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
index 1cd99b927..5ad30b37e 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
@@ -66,6 +66,7 @@ import org.apache.tika.parser.AbstractParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.PasswordProvider;
 import org.apache.tika.parser.RenderingParser;
+import org.apache.tika.parser.pdf.image.ImageGraphicsEngineFactory;
 import org.apache.tika.renderer.PageRangeRequest;
 import org.apache.tika.renderer.RenderResult;
 import org.apache.tika.renderer.RenderResults;
@@ -736,6 +737,11 @@ public class PDFParser extends AbstractParser implements RenderingParser, Initia
         defaultConfig.setRenderer(renderer);
     }
 
+    @Field
+    public void setImageGraphicsEngineFactory(ImageGraphicsEngineFactory imageGraphicsEngineFactory) {
+        defaultConfig.setImageGraphicsEngineFactory(imageGraphicsEngineFactory);
+    }
+
     public void setImageStrategy(String imageStrategy) {
         defaultConfig.setImageStrategy(imageStrategy);
     }
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
index 12b10e6c1..d17f7ed88 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
@@ -30,6 +30,7 @@ import org.apache.pdfbox.rendering.ImageType;
 import org.apache.pdfbox.text.PDFTextStripper;
 
 import org.apache.tika.exception.TikaException;
+import org.apache.tika.parser.pdf.image.ImageGraphicsEngineFactory;
 import org.apache.tika.renderer.Renderer;
 
 /**
@@ -74,6 +75,9 @@ public class PDFParserConfig implements Serializable {
     //extracted.
     private boolean extractInlineImageMetadataOnly = false;
 
+    private ImageGraphicsEngineFactory imageGraphicsEngineFactory =
+            new ImageGraphicsEngineFactory();
+
     //True if inline images (as identified by their object id within
     //a pdf file) should only be extracted once.
     private boolean extractUniqueInlineImagesOnly = true;
@@ -141,7 +145,7 @@ public class PDFParserConfig implements Serializable {
     /**
      * @return whether or not to extract only inline image metadata and not render the images
      */
-    boolean isExtractInlineImageMetadataOnly() {
+    public boolean isExtractInlineImageMetadataOnly() {
         return extractInlineImageMetadataOnly;
     }
 
@@ -863,6 +867,20 @@ public class PDFParserConfig implements Serializable {
         userConfigured.add("imageStrategy");
     }
 
+    /**
+     * EXPERT: Customize the class that handles inline images within a PDF page.
+     *
+     * @param imageGraphicsEngineFactory factory used to create the ImageGraphicsEngine for each page
+     */
+    public void setImageGraphicsEngineFactory(ImageGraphicsEngineFactory imageGraphicsEngineFactory) {
+        this.imageGraphicsEngineFactory = imageGraphicsEngineFactory;
+        userConfigured.add("imageGraphicsEngineFactory");
+    }
+
+    public ImageGraphicsEngineFactory getImageGraphicsEngineFactory() {
+        return imageGraphicsEngineFactory;
+    }
+
     public IMAGE_STRATEGY getImageStrategy() {
         return imageStrategy;
     }
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDMetadataExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDMetadataExtractor.java
index 5a3f85c57..573a701cb 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDMetadataExtractor.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDMetadataExtractor.java
@@ -52,12 +52,12 @@ import org.apache.tika.parser.xmp.JempboxExtractor;
 import org.apache.tika.utils.StringUtils;
 import org.apache.tika.utils.XMLReaderUtils;
 
-class PDMetadataExtractor {
+public class PDMetadataExtractor {
 
     private static final MediaType MEDIA_TYPE = MediaType.application("pdf");
 
 
-    static void extract(PDMetadata pdMetadata, Metadata metadata, ParseContext context) {
+    public static void extract(PDMetadata pdMetadata, Metadata metadata, ParseContext context) {
         if (pdMetadata == null) {
             metadata.set(PDF.HAS_XMP, "false");
             return;
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/ImageGraphicsEngine.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/image/ImageGraphicsEngine.java
similarity index 98%
rename from tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/ImageGraphicsEngine.java
rename to tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/image/ImageGraphicsEngine.java
index c600c81ec..a304fe614 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/ImageGraphicsEngine.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/image/ImageGraphicsEngine.java
@@ -14,7 +14,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package org.apache.tika.parser.pdf;
+package org.apache.tika.parser.pdf.image;
 
 import java.awt.geom.Point2D;
 import java.awt.image.BufferedImage;
@@ -65,13 +65,15 @@ import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.pdf.PDFParserConfig;
+import org.apache.tika.parser.pdf.PDMetadataExtractor;
 import org.apache.tika.sax.EmbeddedContentHandler;
 import org.apache.tika.sax.XHTMLContentHandler;
 
 /**
  * Copied nearly verbatim from PDFBox
  */
-class ImageGraphicsEngine extends PDFGraphicsStreamEngine {
+public class ImageGraphicsEngine extends PDFGraphicsStreamEngine {
 
     //We're currently copying images to byte[].  We should
     //limit the length to avoid OOM on crafted files.
@@ -219,7 +221,7 @@ class ImageGraphicsEngine extends PDFGraphicsStreamEngine {
         return false;
     }
 
-    void run() throws IOException {
+    public void run() throws IOException {
         PDPage page = getPage();
 
         //TODO: is there a better way to do this rather than reprocessing the page
@@ -498,7 +500,7 @@ class ImageGraphicsEngine extends PDFGraphicsStreamEngine {
         }
     }
 
-    List<IOException> getExceptions() {
+    public List<IOException> getExceptions() {
         return exceptions;
     }
 }
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/image/ImageGraphicsEngineFactory.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/image/ImageGraphicsEngineFactory.java
new file mode 100644
index 000000000..5cc033fa9
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/image/ImageGraphicsEngineFactory.java
@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pdf.image;
+
+import java.io.Serializable;
+import java.util.Map;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import org.apache.pdfbox.cos.COSStream;
+import org.apache.pdfbox.pdmodel.PDPage;
+
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.pdf.PDFParserConfig;
+import org.apache.tika.sax.XHTMLContentHandler;
+
+public class ImageGraphicsEngineFactory implements Serializable {
+
+    public ImageGraphicsEngine newEngine(PDPage page, EmbeddedDocumentExtractor embeddedDocumentExtractor,
+                                         PDFParserConfig pdfParserConfig,
+                                         Map<COSStream, Integer> processedInlineImages,
+                                         AtomicInteger imageCounter, XHTMLContentHandler xhtml,
+                                         Metadata parentMetadata, ParseContext parseContext) {
+        return new ImageGraphicsEngine(page, embeddedDocumentExtractor, pdfParserConfig,
+                processedInlineImages, imageCounter, xhtml, parentMetadata, parseContext);
+    }
+}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/MyCustomImageGraphicsEngineFactory.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/MyCustomImageGraphicsEngineFactory.java
new file mode 100644
index 000000000..e7498d5b7
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/MyCustomImageGraphicsEngineFactory.java
@@ -0,0 +1,40 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pdf;
+
+import java.util.Map;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import org.apache.pdfbox.cos.COSStream;
+import org.apache.pdfbox.pdmodel.PDPage;
+
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.pdf.image.ImageGraphicsEngine;
+import org.apache.tika.parser.pdf.image.ImageGraphicsEngineFactory;
+import org.apache.tika.sax.XHTMLContentHandler;
+
+public class MyCustomImageGraphicsEngineFactory extends ImageGraphicsEngineFactory {
+    public ImageGraphicsEngine newEngine(PDPage page, EmbeddedDocumentExtractor embeddedDocumentExtractor,
+                                         PDFParserConfig pdfParserConfig,
+                                         Map<COSStream, Integer> processedInlineImages,
+                                         AtomicInteger imageCounter, XHTMLContentHandler xhtml,
+                                         Metadata parentMetadata, ParseContext parseContext) {
+        throw new RuntimeException("testing123");
+    }
+}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index 88caa8175..2ce76bec8 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -22,6 +22,7 @@ import static org.junit.jupiter.api.Assertions.assertFalse;
 import static org.junit.jupiter.api.Assertions.assertNotNull;
 import static org.junit.jupiter.api.Assertions.assertNull;
 import static org.junit.jupiter.api.Assertions.assertTrue;
+import static org.junit.jupiter.api.Assertions.fail;
 
 import java.io.InputStream;
 import java.util.Arrays;
@@ -65,6 +66,7 @@ import org.apache.tika.parser.Parser;
 import org.apache.tika.parser.PasswordProvider;
 import org.apache.tika.sax.BodyContentHandler;
 import org.apache.tika.sax.ContentHandlerDecorator;
+import org.apache.tika.utils.ExceptionUtils;
 
 /**
  * Test case for parsing pdf files.
@@ -1368,32 +1370,51 @@ public class PDFParserTest extends TikaTest {
         assertEquals("RM1", metadata.get(0).getValues(PDF.ANNOTATION_TYPES)[0]);
     }
 
-    /**
+
     @Test
-    public void testWriteLimit() throws Exception {
-        for (int i = 0; i < 10000; i += 13) {
-            Metadata metadata = testWriteLimit("testPDF_childAttachments.pdf", i);
-            assertEquals("true", metadata.get(TikaCoreProperties.WRITE_LIMIT_REACHED));
-            int len = metadata.get(TikaCoreProperties.TIKA_CONTENT).length();
-            System.out.println(len + " : " + i);
-            assertTrue(len <= i);
+    public void testCustomGraphicsEngineFactory() throws Exception {
+        try (InputStream is =
+                     getResourceAsStream(
+                             "tika-config-custom-graphics-engine.xml")) {
+            assertNotNull(is);
+            TikaConfig tikaConfig = new TikaConfig(is);
+            Parser p = new AutoDetectParser(tikaConfig);
+            try {
+                List<Metadata> metadataList = getRecursiveMetadata("testPDF_JBIG2.pdf", p);
+                fail("should have thrown a runtime exception");
+            } catch (TikaException e) {
+                String stack = ExceptionUtils.getStackTrace(e);
+                assertContains("testing123", stack);
+            }
         }
     }
 
-    private Metadata testWriteLimit(String fileName, int limit) throws Exception {
-        BasicContentHandlerFactory factory = new BasicContentHandlerFactory(
-                BasicContentHandlerFactory.HANDLER_TYPE.TEXT, limit
-        );
-        ContentHandler contentHandler = factory.getNewContentHandler();
-        Metadata metadata = new Metadata();
-        ParseContext parseContext = new ParseContext();
-        try (InputStream is = getResourceAsStream("/test-documents/" + fileName)) {
-            AUTO_DETECT_PARSER.parse(is, contentHandler, metadata, parseContext);
-        } catch (WriteLimitReachedException e) {
-            //e.printStackTrace();
-        }
-        metadata.set(TikaCoreProperties.TIKA_CONTENT, contentHandler.toString());
-        return metadata;
-    }*/
+            /**
+            @Test
+            public void testWriteLimit() throws Exception {
+                for (int i = 0; i < 10000; i += 13) {
+                    Metadata metadata = testWriteLimit("testPDF_childAttachments.pdf", i);
+                    assertEquals("true", metadata.get(TikaCoreProperties.WRITE_LIMIT_REACHED));
+                    int len = metadata.get(TikaCoreProperties.TIKA_CONTENT).length();
+                    System.out.println(len + " : " + i);
+                    assertTrue(len <= i);
+                }
+            }
+
+            private Metadata testWriteLimit(String fileName, int limit) throws Exception {
+                BasicContentHandlerFactory factory = new BasicContentHandlerFactory(
+                        BasicContentHandlerFactory.HANDLER_TYPE.TEXT, limit
+                );
+                ContentHandler contentHandler = factory.getNewContentHandler();
+                Metadata metadata = new Metadata();
+                ParseContext parseContext = new ParseContext();
+                try (InputStream is = getResourceAsStream("/test-documents/" + fileName)) {
+                    AUTO_DETECT_PARSER.parse(is, contentHandler, metadata, parseContext);
+                } catch (WriteLimitReachedException e) {
+                    //e.printStackTrace();
+                }
+                metadata.set(TikaCoreProperties.TIKA_CONTENT, contentHandler.toString());
+                return metadata;
+            }*/
 
 }
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-config-custom-graphics-engine.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-config-custom-graphics-engine.xml
new file mode 100644
index 000000000..5aa259fee
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-config-custom-graphics-engine.xml
@@ -0,0 +1,28 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<properties>
+    <parsers>
+        <parser class="org.apache.tika.parser.pdf.PDFParser">
+            <params>
+                <param name="sortByPosition" type="bool">true</param>
+                <param name="extractInlineImages" type="bool">true</param>
+                <param name="imageGraphicsEngineFactory" class="org.apache.tika.parser.pdf.MyCustomImageGraphicsEngineFactory"/>
+            </params>
+        </parser>
+    </parsers>
+</properties>