You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2024/02/23 18:30:38 UTC

(tika) 01/01: TIKA-4202 -- add ocr page count to PDFs

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-4202
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 83cc605a5c8f2fef735ff4f1a8f9aa676821273b
Author: tallison <ta...@apache.org>
AuthorDate: Fri Feb 23 13:30:23 2024 -0500

    TIKA-4202 -- add ocr page count to PDFs
---
 tika-core/src/main/java/org/apache/tika/metadata/PDF.java  |  2 ++
 .../java/org/apache/tika/parser/pdf/OCRPageCounter.java    | 14 ++++++++++++++
 .../main/java/org/apache/tika/parser/pdf/PDFParser.java    |  6 ++++++
 .../java/org/apache/tika/parser/pdf/PDFParserTest.java     |  2 +-
 4 files changed, 23 insertions(+), 1 deletion(-)

diff --git a/tika-core/src/main/java/org/apache/tika/metadata/PDF.java b/tika-core/src/main/java/org/apache/tika/metadata/PDF.java
index c2baca0e8..a6c753fcd 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/PDF.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/PDF.java
@@ -209,4 +209,6 @@ public interface PDF {
             Property.composite( Property.externalInteger(PDF_PREFIX + "incrementalUpdateCount"),
                     new Property[]{ TikaCoreProperties.VERSION_COUNT });
 
+    Property OCR_PAGE_COUNT = Property.externalInteger(PDF_PREFIX + "ocrPageCount");
+
 }
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/OCRPageCounter.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/OCRPageCounter.java
new file mode 100644
index 000000000..d3dcc9155
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/OCRPageCounter.java
@@ -0,0 +1,14 @@
+package org.apache.tika.parser.pdf;
+
+public class OCRPageCounter {
+
+    private int count;
+
+    public void increment() {
+        count++;
+    }
+
+    public int getCount() {
+        return count;
+    }
+}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
index c93571daf..f21b65d4e 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
@@ -16,6 +16,8 @@
  */
 package org.apache.tika.parser.pdf;
 
+import static org.apache.tika.metadata.PDF.OCR_PAGE_COUNT;
+
 import java.io.IOException;
 import java.io.InputStream;
 import java.nio.file.Path;
@@ -158,6 +160,8 @@ public class PDFParser implements Parser, RenderingParser, Initializable {
         PDFRenderingState incomingRenderingState = context.get(PDFRenderingState.class);
         TikaInputStream tstream = null;
         boolean shouldClose = false;
+        OCRPageCounter prevOCRCounter = context.get(OCRPageCounter.class);
+        context.set(OCRPageCounter.class, new OCRPageCounter());
         try {
             if (shouldSpool(localConfig)) {
                 if (stream instanceof TikaInputStream) {
@@ -220,6 +224,8 @@ public class PDFParser implements Parser, RenderingParser, Initializable {
             metadata.set(PDF.IS_ENCRYPTED, "true");
             throw new EncryptedDocumentException(e);
         } finally {
+            metadata.set(OCR_PAGE_COUNT, context.get(OCRPageCounter.class).getCount());
+            context.set(OCRPageCounter.class, prevOCRCounter);
             //reset the incrementalUpdateRecord even if null
             context.set(IncrementalUpdateRecord.class, incomingIncrementalUpdateRecord);
             PDFRenderingState currState = context.get(PDFRenderingState.class);
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index 6e9167f37..0269a58ef 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -238,7 +238,7 @@ public class PDFParserTest extends TikaTest {
         assertEquals("application/pdf", metadata.get(Metadata.CONTENT_TYPE));
         assertEquals("true", metadata.get("pdf:encrypted"));
         //pdf:encrypted, X-Parsed-By and Content-Type
-        assertEquals(4, metadata.names().length, "very little metadata should be parsed");
+        assertEquals(5, metadata.names().length, "very little metadata should be parsed");
         assertEquals(0, handler.toString().length());
     }