You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2024/02/23 18:30:37 UTC

(tika) branch TIKA-4202 created (now 83cc605a5)

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a change to branch TIKA-4202
in repository https://gitbox.apache.org/repos/asf/tika.git


      at 83cc605a5 TIKA-4202 -- add ocr page count to PDFs

This branch includes the following new commits:

     new 83cc605a5 TIKA-4202 -- add ocr page count to PDFs

The 1 revision listed above as "new" is entirely new to this
repository and will be described in a separate email.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.



(tika) 01/01: TIKA-4202 -- add ocr page count to PDFs

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-4202
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 83cc605a5c8f2fef735ff4f1a8f9aa676821273b
Author: tallison <ta...@apache.org>
AuthorDate: Fri Feb 23 13:30:23 2024 -0500

    TIKA-4202 -- add ocr page count to PDFs
---
 tika-core/src/main/java/org/apache/tika/metadata/PDF.java  |  2 ++
 .../java/org/apache/tika/parser/pdf/OCRPageCounter.java    | 14 ++++++++++++++
 .../main/java/org/apache/tika/parser/pdf/PDFParser.java    |  6 ++++++
 .../java/org/apache/tika/parser/pdf/PDFParserTest.java     |  2 +-
 4 files changed, 23 insertions(+), 1 deletion(-)

diff --git a/tika-core/src/main/java/org/apache/tika/metadata/PDF.java b/tika-core/src/main/java/org/apache/tika/metadata/PDF.java
index c2baca0e8..a6c753fcd 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/PDF.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/PDF.java
@@ -209,4 +209,6 @@ public interface PDF {
             Property.composite( Property.externalInteger(PDF_PREFIX + "incrementalUpdateCount"),
                     new Property[]{ TikaCoreProperties.VERSION_COUNT });
 
+    Property OCR_PAGE_COUNT = Property.externalInteger(PDF_PREFIX + "ocrPageCount");
+
 }
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/OCRPageCounter.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/OCRPageCounter.java
new file mode 100644
index 000000000..d3dcc9155
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/OCRPageCounter.java
@@ -0,0 +1,14 @@
+package org.apache.tika.parser.pdf;
+
+public class OCRPageCounter {
+
+    private int count;
+
+    public void increment() {
+        count++;
+    }
+
+    public int getCount() {
+        return count;
+    }
+}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
index c93571daf..f21b65d4e 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
@@ -16,6 +16,8 @@
  */
 package org.apache.tika.parser.pdf;
 
+import static org.apache.tika.metadata.PDF.OCR_PAGE_COUNT;
+
 import java.io.IOException;
 import java.io.InputStream;
 import java.nio.file.Path;
@@ -158,6 +160,8 @@ public class PDFParser implements Parser, RenderingParser, Initializable {
         PDFRenderingState incomingRenderingState = context.get(PDFRenderingState.class);
         TikaInputStream tstream = null;
         boolean shouldClose = false;
+        OCRPageCounter prevOCRCounter = context.get(OCRPageCounter.class);
+        context.set(OCRPageCounter.class, new OCRPageCounter());
         try {
             if (shouldSpool(localConfig)) {
                 if (stream instanceof TikaInputStream) {
@@ -220,6 +224,8 @@ public class PDFParser implements Parser, RenderingParser, Initializable {
             metadata.set(PDF.IS_ENCRYPTED, "true");
             throw new EncryptedDocumentException(e);
         } finally {
+            metadata.set(OCR_PAGE_COUNT, context.get(OCRPageCounter.class).getCount());
+            context.set(OCRPageCounter.class, prevOCRCounter);
             //reset the incrementalUpdateRecord even if null
             context.set(IncrementalUpdateRecord.class, incomingIncrementalUpdateRecord);
             PDFRenderingState currState = context.get(PDFRenderingState.class);
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index 6e9167f37..0269a58ef 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -238,7 +238,7 @@ public class PDFParserTest extends TikaTest {
         assertEquals("application/pdf", metadata.get(Metadata.CONTENT_TYPE));
         assertEquals("true", metadata.get("pdf:encrypted"));
         //pdf:encrypted, X-Parsed-By and Content-Type
-        assertEquals(4, metadata.names().length, "very little metadata should be parsed");
+        assertEquals(5, metadata.names().length, "very little metadata should be parsed");
         assertEquals(0, handler.toString().length());
     }