You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2024/02/28 15:04:20 UTC

(tika) branch main updated: TIKA-4202 -- add ocr page count to PDFs -- actually increment counter and move the location of the counter to before OCR is invoked (#1630)

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new 2bc0f9bdc TIKA-4202 -- add ocr page count to PDFs -- actually increment counter and move the location of the counter to before OCR is invoked (#1630)
2bc0f9bdc is described below

commit 2bc0f9bdce21559f592ef71919d242974be027fb
Author: Tim Allison <ta...@apache.org>
AuthorDate: Wed Feb 28 10:04:14 2024 -0500

    TIKA-4202 -- add ocr page count to PDFs -- actually increment counter and move the location of the counter to before OCR is invoked (#1630)
---
 tika-core/src/main/java/org/apache/tika/metadata/PDF.java        | 4 ++++
 .../main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java  | 6 ++++++
 .../src/main/java/org/apache/tika/parser/pdf/OCRPageCounter.java | 4 ++++
 .../src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java  | 8 ++++++++
 .../java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java  | 9 +++++++++
 5 files changed, 31 insertions(+)

diff --git a/tika-core/src/main/java/org/apache/tika/metadata/PDF.java b/tika-core/src/main/java/org/apache/tika/metadata/PDF.java
index a6c753fcd..b15c10383 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/PDF.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/PDF.java
@@ -209,6 +209,10 @@ public interface PDF {
             Property.composite( Property.externalInteger(PDF_PREFIX + "incrementalUpdateCount"),
                     new Property[]{ TikaCoreProperties.VERSION_COUNT });
 
+    /**
+     * This counts the number of pages that would have been OCR'd or were OCR'd depending
+     * on the OCR settings. If NO_OCR is selected, no pages are counted.
+     */
     Property OCR_PAGE_COUNT = Property.externalInteger(PDF_PREFIX + "ocrPageCount");
 
 }
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
index e03e14a4f..4d0a08226 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
@@ -533,8 +533,14 @@ class AbstractPDF2XHTML extends PDFTextStripper {
     void doOCROnCurrentPage(PDPage pdPage, PDFParserConfig.OCR_STRATEGY ocrStrategy)
             throws IOException, TikaException, SAXException {
         if (ocrStrategy.equals(NO_OCR)) {
+            //I don't think this is reachable?
             return;
         }
+        //count the number of times that OCR would have been called
+        OCRPageCounter c = context.get(OCRPageCounter.class);
+        if (c != null) {
+            c.increment();
+        }
         MediaType ocrImageMediaType = MediaType.image("ocr-" + config.getOcrImageFormatName());
         if (!ocrParser.getSupportedTypes(context).contains(ocrImageMediaType)) {
             if (ocrStrategy == OCR_ONLY || ocrStrategy == OCR_AND_TEXT_EXTRACTION) {
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/OCRPageCounter.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/OCRPageCounter.java
index 3b382099b..418419eee 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/OCRPageCounter.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/OCRPageCounter.java
@@ -16,6 +16,10 @@
  */
 package org.apache.tika.parser.pdf;
 
+/**
+ * This counts the number of pages on which OCR was run or
+ * would have been run, depending on the settings.
+ */
 public class OCRPageCounter {
 
     private int count;
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index 0269a58ef..9406cac53 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -1432,6 +1432,14 @@ public class PDFParserTest extends TikaTest {
                 metadataList.get(1).get(TikaCoreProperties.EMBEDDED_EXCEPTION));
 
     }
+    @Test
+    public void testDefaultPDFOCR() throws Exception {
+        //test that even without OCR -- there is no tesseract ocr parser in this module --
+        // AUTO mode still counts the one page that would have been OCR'd had OCR been available.
+        List<Metadata> metadataList = getRecursiveMetadata("testOCR.pdf");
+        assertEquals(1, metadataList.size());
+        assertEquals(1, metadataList.get(0).getInt(PDF.OCR_PAGE_COUNT));
+    }
     /**
      * TODO -- need to test signature extraction
      */
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
index d57d0e9aa..6ce19e3dd 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
@@ -30,6 +30,7 @@ import org.apache.tika.TikaTest;
 import org.apache.tika.config.TikaConfig;
 import org.apache.tika.exception.TikaConfigException;
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.PDF;
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.AutoDetectParser;
@@ -73,8 +74,16 @@ public class TesseractOCRParserTest extends TikaTest {
         assertEquals(ImageParser.class, defaultParser.getParsers(parseContext).get(png).getClass());
     }
 
+    @Test
+    public void testDefaultPDFOCR() throws Exception {
+        List<Metadata> metadataList = getRecursiveMetadata("testOCR.pdf");
+        assertEquals(1, metadataList.size());
+        assertEquals(1, metadataList.get(0).getInt(PDF.OCR_PAGE_COUNT));
+    }
+
     @Test
     public void testPDFOCR() throws Exception {
+        assumeTrue(canRun(), "can run OCR");
         String resource = "testOCR.pdf";
         String[] nonOCRContains = new String[0];
         testBasicOCR(resource, nonOCRContains, 2);