You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2022/05/11 13:39:30 UTC

[tika] branch main updated: TIKA-3754 -- allow easier customization of ImageGraphicsEngine, add page number

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new 974506272 TIKA-3754 -- allow easier customization of ImageGraphicsEngine, add page number
974506272 is described below

commit 974506272f12686afdd92366c596babd1b279d97
Author: tallison <ta...@apache.org>
AuthorDate: Wed May 11 09:39:18 2022 -0400

    TIKA-3754 -- allow easier customization of ImageGraphicsEngine, add page number
---
 .../src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java    |  2 +-
 .../org/apache/tika/parser/pdf/image/ImageGraphicsEngine.java  | 10 +++++++++-
 .../tika/parser/pdf/image/ImageGraphicsEngineFactory.java      |  6 ++++--
 .../test/java/org/apache/tika/parser/pdf/PDFParserTest.java    |  4 +++-
 4 files changed, 17 insertions(+), 5 deletions(-)

diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
index 514a7f9dd..fb2de2bc2 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
@@ -156,7 +156,7 @@ class PDF2XHTML extends AbstractPDF2XHTML {
 
         ImageGraphicsEngine engine =
                 config.getImageGraphicsEngineFactory().newEngine(
-                        page, embeddedDocumentExtractor, config,
+                        page, getCurrentPageNo(), embeddedDocumentExtractor, config,
                         processedInlineImages, inlineImageCounter, xhtml, metadata, context);
         engine.run();
         List<IOException> engineExceptions = engine.getExceptions();
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/image/ImageGraphicsEngine.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/image/ImageGraphicsEngine.java
index 8a9de293c..769218a6a 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/image/ImageGraphicsEngine.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/image/ImageGraphicsEngine.java
@@ -64,6 +64,7 @@ import org.apache.tika.io.BoundedInputStream;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.metadata.TikaPagedText;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.pdf.PDFParserConfig;
 import org.apache.tika.parser.pdf.PDMetadataExtractor;
@@ -89,6 +90,7 @@ public class ImageGraphicsEngine extends PDFGraphicsStreamEngine {
     protected static final List<String> JB2 =
             Collections.singletonList(COSName.JBIG2_DECODE.getName());
     final List<IOException> exceptions = new ArrayList<>();
+    protected final int pageNumber;
     protected final EmbeddedDocumentExtractor embeddedDocumentExtractor;
     protected final PDFParserConfig pdfParserConfig;
     protected final Map<COSStream, Integer> processedInlineImages;
@@ -101,12 +103,15 @@ public class ImageGraphicsEngine extends PDFGraphicsStreamEngine {
     protected boolean useDirectJPEG = false;
 
     //TODO: this is an embarrassment of an initializer...fix
-    protected ImageGraphicsEngine(PDPage page, EmbeddedDocumentExtractor embeddedDocumentExtractor,
+    protected ImageGraphicsEngine(PDPage page,
+                                  int pageNumber,
+                                  EmbeddedDocumentExtractor embeddedDocumentExtractor,
                                   PDFParserConfig pdfParserConfig,
                                   Map<COSStream, Integer> processedInlineImages,
                                   AtomicInteger imageCounter, XHTMLContentHandler xhtml,
                                   Metadata parentMetadata, ParseContext parseContext) {
         super(page);
+        this.pageNumber = pageNumber;
         this.embeddedDocumentExtractor = embeddedDocumentExtractor;
         this.pdfParserConfig = pdfParserConfig;
         this.processedInlineImages = processedInlineImages;
@@ -395,6 +400,9 @@ public class ImageGraphicsEngine extends PDFGraphicsStreamEngine {
         metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, fileName);
         metadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
                 TikaCoreProperties.EmbeddedResourceType.INLINE.toString());
+        metadata.set(TikaPagedText.PAGE_NUMBER, pageNumber);
+
+        //TODO -- should we look for image rotation metadata in the PDImage or elsewhere?
 
         if (extractInlineImageMetadataOnly) {
             extractInlineImageMetadataOnly(pdImage, metadata);
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/image/ImageGraphicsEngineFactory.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/image/ImageGraphicsEngineFactory.java
index 5cc033fa9..db30f3b4b 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/image/ImageGraphicsEngineFactory.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/image/ImageGraphicsEngineFactory.java
@@ -31,12 +31,14 @@ import org.apache.tika.sax.XHTMLContentHandler;
 
 public class ImageGraphicsEngineFactory implements Serializable {
 
-    public ImageGraphicsEngine newEngine(PDPage page, EmbeddedDocumentExtractor embeddedDocumentExtractor,
+    public ImageGraphicsEngine newEngine(PDPage page,
+                                         int pageNumber,
+                                         EmbeddedDocumentExtractor embeddedDocumentExtractor,
                                          PDFParserConfig pdfParserConfig,
                                          Map<COSStream, Integer> processedInlineImages,
                                          AtomicInteger imageCounter, XHTMLContentHandler xhtml,
                                          Metadata parentMetadata, ParseContext parseContext) {
-        return new ImageGraphicsEngine(page, embeddedDocumentExtractor, pdfParserConfig,
+        return new ImageGraphicsEngine(page, pageNumber, embeddedDocumentExtractor, pdfParserConfig,
                 processedInlineImages, imageCounter, xhtml, parentMetadata, parseContext);
     }
 }
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index 2ce76bec8..645aab611 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -56,6 +56,7 @@ import org.apache.tika.metadata.Font;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.PDF;
 import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.metadata.TikaPagedText;
 import org.apache.tika.metadata.XMP;
 import org.apache.tika.metadata.XMPMM;
 import org.apache.tika.mime.MediaType;
@@ -715,7 +716,8 @@ public class PDFParserTest extends TikaTest {
         }
         assertEquals(2, inline);
         assertEquals(2, attach);
-
+        assertEquals(1, metadatas.get(1).getInt(TikaPagedText.PAGE_NUMBER));
+        assertEquals(66, metadatas.get(2).getInt(TikaPagedText.PAGE_NUMBER));
         //now try turning off inline
 
         context.set(org.apache.tika.extractor.DocumentSelector.class, new AvoidInlineSelector());