You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2022/05/09 19:59:56 UTC
[tika] branch main updated: TIKA-3754 -- allow customization of ImageGraphicsEngine
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 07a59ecd6 TIKA-3754 -- allow customization of ImageGraphicsEngine
07a59ecd6 is described below
commit 07a59ecd682c8584e0b8298e2140ada065e092ce
Author: tallison <ta...@apache.org>
AuthorDate: Mon May 9 15:59:43 2022 -0400
TIKA-3754 -- allow customization of ImageGraphicsEngine
---
.../java/org/apache/tika/parser/pdf/PDF2XHTML.java | 4 +-
.../java/org/apache/tika/parser/pdf/PDFParser.java | 6 ++
.../apache/tika/parser/pdf/PDFParserConfig.java | 20 ++++++-
.../tika/parser/pdf/PDMetadataExtractor.java | 4 +-
.../pdf/{ => image}/ImageGraphicsEngine.java | 10 ++--
.../pdf/image/ImageGraphicsEngineFactory.java | 42 ++++++++++++++
.../pdf/MyCustomImageGraphicsEngineFactory.java | 40 +++++++++++++
.../org/apache/tika/parser/pdf/PDFParserTest.java | 67 ++++++++++++++--------
.../pdf/tika-config-custom-graphics-engine.xml | 28 +++++++++
9 files changed, 190 insertions(+), 31 deletions(-)
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
index 93d1b7e81..514a7f9dd 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
@@ -40,6 +40,7 @@ import org.xml.sax.SAXException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.pdf.image.ImageGraphicsEngine;
/**
* Utility class that overrides the {@link PDFTextStripper} functionality
@@ -154,7 +155,8 @@ class PDF2XHTML extends AbstractPDF2XHTML {
}
ImageGraphicsEngine engine =
- new ImageGraphicsEngine(page, embeddedDocumentExtractor, config,
+ config.getImageGraphicsEngineFactory().newEngine(
+ page, embeddedDocumentExtractor, config,
processedInlineImages, inlineImageCounter, xhtml, metadata, context);
engine.run();
List<IOException> engineExceptions = engine.getExceptions();
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
index 1cd99b927..5ad30b37e 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
@@ -66,6 +66,7 @@ import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.PasswordProvider;
import org.apache.tika.parser.RenderingParser;
+import org.apache.tika.parser.pdf.image.ImageGraphicsEngineFactory;
import org.apache.tika.renderer.PageRangeRequest;
import org.apache.tika.renderer.RenderResult;
import org.apache.tika.renderer.RenderResults;
@@ -736,6 +737,11 @@ public class PDFParser extends AbstractParser implements RenderingParser, Initia
defaultConfig.setRenderer(renderer);
}
+ @Field
+ public void setImageGraphicsEngineFactory(ImageGraphicsEngineFactory imageGraphicsEngineFactory) {
+ defaultConfig.setImageGraphicsEngineFactory(imageGraphicsEngineFactory);
+ }
+
public void setImageStrategy(String imageStrategy) {
defaultConfig.setImageStrategy(imageStrategy);
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
index 12b10e6c1..d17f7ed88 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
@@ -30,6 +30,7 @@ import org.apache.pdfbox.rendering.ImageType;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.tika.exception.TikaException;
+import org.apache.tika.parser.pdf.image.ImageGraphicsEngineFactory;
import org.apache.tika.renderer.Renderer;
/**
@@ -74,6 +75,9 @@ public class PDFParserConfig implements Serializable {
//extracted.
private boolean extractInlineImageMetadataOnly = false;
+ private ImageGraphicsEngineFactory imageGraphicsEngineFactory =
+ new ImageGraphicsEngineFactory();
+
//True if inline images (as identified by their object id within
//a pdf file) should only be extracted once.
private boolean extractUniqueInlineImagesOnly = true;
@@ -141,7 +145,7 @@ public class PDFParserConfig implements Serializable {
/**
* @return whether or not to extract only inline image metadata and not render the images
*/
- boolean isExtractInlineImageMetadataOnly() {
+ public boolean isExtractInlineImageMetadataOnly() {
return extractInlineImageMetadataOnly;
}
@@ -863,6 +867,20 @@ public class PDFParserConfig implements Serializable {
userConfigured.add("imageStrategy");
}
+ /**
+ * EXPERT: Customize the class that handles inline images within a PDF page.
+ *
+ * @param imageGraphicsEngineFactory factory used to create the engine that processes inline images on each page
+ */
+ public void setImageGraphicsEngineFactory(ImageGraphicsEngineFactory imageGraphicsEngineFactory) {
+ this.imageGraphicsEngineFactory = imageGraphicsEngineFactory;
+ userConfigured.add("imageGraphicsEngineFactory");
+ }
+
+ public ImageGraphicsEngineFactory getImageGraphicsEngineFactory() {
+ return imageGraphicsEngineFactory;
+ }
+
public IMAGE_STRATEGY getImageStrategy() {
return imageStrategy;
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDMetadataExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDMetadataExtractor.java
index 5a3f85c57..573a701cb 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDMetadataExtractor.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDMetadataExtractor.java
@@ -52,12 +52,12 @@ import org.apache.tika.parser.xmp.JempboxExtractor;
import org.apache.tika.utils.StringUtils;
import org.apache.tika.utils.XMLReaderUtils;
-class PDMetadataExtractor {
+public class PDMetadataExtractor {
private static final MediaType MEDIA_TYPE = MediaType.application("pdf");
- static void extract(PDMetadata pdMetadata, Metadata metadata, ParseContext context) {
+ public static void extract(PDMetadata pdMetadata, Metadata metadata, ParseContext context) {
if (pdMetadata == null) {
metadata.set(PDF.HAS_XMP, "false");
return;
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/ImageGraphicsEngine.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/image/ImageGraphicsEngine.java
similarity index 98%
rename from tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/ImageGraphicsEngine.java
rename to tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/image/ImageGraphicsEngine.java
index c600c81ec..a304fe614 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/ImageGraphicsEngine.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/image/ImageGraphicsEngine.java
@@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package org.apache.tika.parser.pdf;
+package org.apache.tika.parser.pdf.image;
import java.awt.geom.Point2D;
import java.awt.image.BufferedImage;
@@ -65,13 +65,15 @@ import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.pdf.PDFParserConfig;
+import org.apache.tika.parser.pdf.PDMetadataExtractor;
import org.apache.tika.sax.EmbeddedContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
/**
* Copied nearly verbatim from PDFBox
*/
-class ImageGraphicsEngine extends PDFGraphicsStreamEngine {
+public class ImageGraphicsEngine extends PDFGraphicsStreamEngine {
//We're currently copying images to byte[]. We should
//limit the length to avoid OOM on crafted files.
@@ -219,7 +221,7 @@ class ImageGraphicsEngine extends PDFGraphicsStreamEngine {
return false;
}
- void run() throws IOException {
+ public void run() throws IOException {
PDPage page = getPage();
//TODO: is there a better way to do this rather than reprocessing the page
@@ -498,7 +500,7 @@ class ImageGraphicsEngine extends PDFGraphicsStreamEngine {
}
}
- List<IOException> getExceptions() {
+ public List<IOException> getExceptions() {
return exceptions;
}
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/image/ImageGraphicsEngineFactory.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/image/ImageGraphicsEngineFactory.java
new file mode 100644
index 000000000..5cc033fa9
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/image/ImageGraphicsEngineFactory.java
@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pdf.image;
+
+import java.io.Serializable;
+import java.util.Map;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import org.apache.pdfbox.cos.COSStream;
+import org.apache.pdfbox.pdmodel.PDPage;
+
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.pdf.PDFParserConfig;
+import org.apache.tika.sax.XHTMLContentHandler;
+
+public class ImageGraphicsEngineFactory implements Serializable {
+
+ public ImageGraphicsEngine newEngine(PDPage page, EmbeddedDocumentExtractor embeddedDocumentExtractor,
+ PDFParserConfig pdfParserConfig,
+ Map<COSStream, Integer> processedInlineImages,
+ AtomicInteger imageCounter, XHTMLContentHandler xhtml,
+ Metadata parentMetadata, ParseContext parseContext) {
+ return new ImageGraphicsEngine(page, embeddedDocumentExtractor, pdfParserConfig,
+ processedInlineImages, imageCounter, xhtml, parentMetadata, parseContext);
+ }
+}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/MyCustomImageGraphicsEngineFactory.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/MyCustomImageGraphicsEngineFactory.java
new file mode 100644
index 000000000..e7498d5b7
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/MyCustomImageGraphicsEngineFactory.java
@@ -0,0 +1,40 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pdf;
+
+import java.util.Map;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import org.apache.pdfbox.cos.COSStream;
+import org.apache.pdfbox.pdmodel.PDPage;
+
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.pdf.image.ImageGraphicsEngine;
+import org.apache.tika.parser.pdf.image.ImageGraphicsEngineFactory;
+import org.apache.tika.sax.XHTMLContentHandler;
+
+public class MyCustomImageGraphicsEngineFactory extends ImageGraphicsEngineFactory {
+ public ImageGraphicsEngine newEngine(PDPage page, EmbeddedDocumentExtractor embeddedDocumentExtractor,
+ PDFParserConfig pdfParserConfig,
+ Map<COSStream, Integer> processedInlineImages,
+ AtomicInteger imageCounter, XHTMLContentHandler xhtml,
+ Metadata parentMetadata, ParseContext parseContext) {
+ throw new RuntimeException("testing123");
+ }
+}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index 88caa8175..2ce76bec8 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -22,6 +22,7 @@ import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertNull;
import static org.junit.jupiter.api.Assertions.assertTrue;
+import static org.junit.jupiter.api.Assertions.fail;
import java.io.InputStream;
import java.util.Arrays;
@@ -65,6 +66,7 @@ import org.apache.tika.parser.Parser;
import org.apache.tika.parser.PasswordProvider;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.ContentHandlerDecorator;
+import org.apache.tika.utils.ExceptionUtils;
/**
* Test case for parsing pdf files.
@@ -1368,32 +1370,51 @@ public class PDFParserTest extends TikaTest {
assertEquals("RM1", metadata.get(0).getValues(PDF.ANNOTATION_TYPES)[0]);
}
- /**
+
@Test
- public void testWriteLimit() throws Exception {
- for (int i = 0; i < 10000; i += 13) {
- Metadata metadata = testWriteLimit("testPDF_childAttachments.pdf", i);
- assertEquals("true", metadata.get(TikaCoreProperties.WRITE_LIMIT_REACHED));
- int len = metadata.get(TikaCoreProperties.TIKA_CONTENT).length();
- System.out.println(len + " : " + i);
- assertTrue(len <= i);
+ public void testCustomGraphicsEngineFactory() throws Exception {
+ try (InputStream is =
+ getResourceAsStream(
+ "tika-config-custom-graphics-engine.xml")) {
+ assertNotNull(is);
+ TikaConfig tikaConfig = new TikaConfig(is);
+ Parser p = new AutoDetectParser(tikaConfig);
+ try {
+ List<Metadata> metadataList = getRecursiveMetadata("testPDF_JBIG2.pdf", p);
+ fail("should have thrown a runtime exception");
+ } catch (TikaException e) {
+ String stack = ExceptionUtils.getStackTrace(e);
+ assertContains("testing123", stack);
+ }
}
}
- private Metadata testWriteLimit(String fileName, int limit) throws Exception {
- BasicContentHandlerFactory factory = new BasicContentHandlerFactory(
- BasicContentHandlerFactory.HANDLER_TYPE.TEXT, limit
- );
- ContentHandler contentHandler = factory.getNewContentHandler();
- Metadata metadata = new Metadata();
- ParseContext parseContext = new ParseContext();
- try (InputStream is = getResourceAsStream("/test-documents/" + fileName)) {
- AUTO_DETECT_PARSER.parse(is, contentHandler, metadata, parseContext);
- } catch (WriteLimitReachedException e) {
- //e.printStackTrace();
- }
- metadata.set(TikaCoreProperties.TIKA_CONTENT, contentHandler.toString());
- return metadata;
- }*/
+ /**
+ @Test
+ public void testWriteLimit() throws Exception {
+ for (int i = 0; i < 10000; i += 13) {
+ Metadata metadata = testWriteLimit("testPDF_childAttachments.pdf", i);
+ assertEquals("true", metadata.get(TikaCoreProperties.WRITE_LIMIT_REACHED));
+ int len = metadata.get(TikaCoreProperties.TIKA_CONTENT).length();
+ System.out.println(len + " : " + i);
+ assertTrue(len <= i);
+ }
+ }
+
+ private Metadata testWriteLimit(String fileName, int limit) throws Exception {
+ BasicContentHandlerFactory factory = new BasicContentHandlerFactory(
+ BasicContentHandlerFactory.HANDLER_TYPE.TEXT, limit
+ );
+ ContentHandler contentHandler = factory.getNewContentHandler();
+ Metadata metadata = new Metadata();
+ ParseContext parseContext = new ParseContext();
+ try (InputStream is = getResourceAsStream("/test-documents/" + fileName)) {
+ AUTO_DETECT_PARSER.parse(is, contentHandler, metadata, parseContext);
+ } catch (WriteLimitReachedException e) {
+ //e.printStackTrace();
+ }
+ metadata.set(TikaCoreProperties.TIKA_CONTENT, contentHandler.toString());
+ return metadata;
+ }*/
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-config-custom-graphics-engine.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-config-custom-graphics-engine.xml
new file mode 100644
index 000000000..5aa259fee
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-config-custom-graphics-engine.xml
@@ -0,0 +1,28 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<properties>
+ <parsers>
+ <parser class="org.apache.tika.parser.pdf.PDFParser">
+ <params>
+ <param name="sortByPosition" type="bool">true</param>
+ <param name="extractInlineImages" type="bool">true</param>
+ <param name="imageGraphicsEngineFactory" class="org.apache.tika.parser.pdf.MyCustomImageGraphicsEngineFactory"/>
+ </params>
+ </parser>
+ </parsers>
+</properties>