You are viewing a plain text version of this content. The canonical link for it was not preserved in this plain-text copy.
Posted to commits@tika.apache.org by ta...@apache.org on 2022/05/11 13:39:30 UTC
[tika] branch main updated: TIKA-3754 -- allow easier customization of ImageGraphicsEngine, add page number
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 974506272 TIKA-3754 -- allow easier customization of ImageGraphicsEngine, add page number
974506272 is described below
commit 974506272f12686afdd92366c596babd1b279d97
Author: tallison <ta...@apache.org>
AuthorDate: Wed May 11 09:39:18 2022 -0400
TIKA-3754 -- allow easier customization of ImageGraphicsEngine, add page number
---
.../src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java | 2 +-
.../org/apache/tika/parser/pdf/image/ImageGraphicsEngine.java | 10 +++++++++-
.../tika/parser/pdf/image/ImageGraphicsEngineFactory.java | 6 ++++--
.../test/java/org/apache/tika/parser/pdf/PDFParserTest.java | 4 +++-
4 files changed, 17 insertions(+), 5 deletions(-)
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
index 514a7f9dd..fb2de2bc2 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
@@ -156,7 +156,7 @@ class PDF2XHTML extends AbstractPDF2XHTML {
ImageGraphicsEngine engine =
config.getImageGraphicsEngineFactory().newEngine(
- page, embeddedDocumentExtractor, config,
+ page, getCurrentPageNo(), embeddedDocumentExtractor, config,
processedInlineImages, inlineImageCounter, xhtml, metadata, context);
engine.run();
List<IOException> engineExceptions = engine.getExceptions();
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/image/ImageGraphicsEngine.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/image/ImageGraphicsEngine.java
index 8a9de293c..769218a6a 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/image/ImageGraphicsEngine.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/image/ImageGraphicsEngine.java
@@ -64,6 +64,7 @@ import org.apache.tika.io.BoundedInputStream;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.metadata.TikaPagedText;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.pdf.PDFParserConfig;
import org.apache.tika.parser.pdf.PDMetadataExtractor;
@@ -89,6 +90,7 @@ public class ImageGraphicsEngine extends PDFGraphicsStreamEngine {
protected static final List<String> JB2 =
Collections.singletonList(COSName.JBIG2_DECODE.getName());
final List<IOException> exceptions = new ArrayList<>();
+ protected final int pageNumber;
protected final EmbeddedDocumentExtractor embeddedDocumentExtractor;
protected final PDFParserConfig pdfParserConfig;
protected final Map<COSStream, Integer> processedInlineImages;
@@ -101,12 +103,15 @@ public class ImageGraphicsEngine extends PDFGraphicsStreamEngine {
protected boolean useDirectJPEG = false;
//TODO: this is an embarrassment of an initializer...fix
- protected ImageGraphicsEngine(PDPage page, EmbeddedDocumentExtractor embeddedDocumentExtractor,
+ protected ImageGraphicsEngine(PDPage page,
+ int pageNumber,
+ EmbeddedDocumentExtractor embeddedDocumentExtractor,
PDFParserConfig pdfParserConfig,
Map<COSStream, Integer> processedInlineImages,
AtomicInteger imageCounter, XHTMLContentHandler xhtml,
Metadata parentMetadata, ParseContext parseContext) {
super(page);
+ this.pageNumber = pageNumber;
this.embeddedDocumentExtractor = embeddedDocumentExtractor;
this.pdfParserConfig = pdfParserConfig;
this.processedInlineImages = processedInlineImages;
@@ -395,6 +400,9 @@ public class ImageGraphicsEngine extends PDFGraphicsStreamEngine {
metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, fileName);
metadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
TikaCoreProperties.EmbeddedResourceType.INLINE.toString());
+ metadata.set(TikaPagedText.PAGE_NUMBER, pageNumber);
+
+ //TODO -- should we look for image rotation metadata in the PDImage or elsewhere?
if (extractInlineImageMetadataOnly) {
extractInlineImageMetadataOnly(pdImage, metadata);
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/image/ImageGraphicsEngineFactory.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/image/ImageGraphicsEngineFactory.java
index 5cc033fa9..db30f3b4b 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/image/ImageGraphicsEngineFactory.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/image/ImageGraphicsEngineFactory.java
@@ -31,12 +31,14 @@ import org.apache.tika.sax.XHTMLContentHandler;
public class ImageGraphicsEngineFactory implements Serializable {
- public ImageGraphicsEngine newEngine(PDPage page, EmbeddedDocumentExtractor embeddedDocumentExtractor,
+ public ImageGraphicsEngine newEngine(PDPage page,
+ int pageNumber,
+ EmbeddedDocumentExtractor embeddedDocumentExtractor,
PDFParserConfig pdfParserConfig,
Map<COSStream, Integer> processedInlineImages,
AtomicInteger imageCounter, XHTMLContentHandler xhtml,
Metadata parentMetadata, ParseContext parseContext) {
- return new ImageGraphicsEngine(page, embeddedDocumentExtractor, pdfParserConfig,
+ return new ImageGraphicsEngine(page, pageNumber, embeddedDocumentExtractor, pdfParserConfig,
processedInlineImages, imageCounter, xhtml, parentMetadata, parseContext);
}
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index 2ce76bec8..645aab611 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -56,6 +56,7 @@ import org.apache.tika.metadata.Font;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.PDF;
import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.metadata.TikaPagedText;
import org.apache.tika.metadata.XMP;
import org.apache.tika.metadata.XMPMM;
import org.apache.tika.mime.MediaType;
@@ -715,7 +716,8 @@ public class PDFParserTest extends TikaTest {
}
assertEquals(2, inline);
assertEquals(2, attach);
-
+ assertEquals(1, metadatas.get(1).getInt(TikaPagedText.PAGE_NUMBER));
+ assertEquals(66, metadatas.get(2).getInt(TikaPagedText.PAGE_NUMBER));
//now try turning off inline
context.set(org.apache.tika.extractor.DocumentSelector.class, new AvoidInlineSelector());