You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2024/02/28 15:04:20 UTC
(tika) branch main updated: TIKA-4202 -- add ocr page count to PDFs -- actually increment counter and move the location of the counter to before OCR is invoked (#1630)
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 2bc0f9bdc TIKA-4202 -- add ocr page count to PDFs -- actually increment counter and move the location of the counter to before OCR is invoked (#1630)
2bc0f9bdc is described below
commit 2bc0f9bdce21559f592ef71919d242974be027fb
Author: Tim Allison <ta...@apache.org>
AuthorDate: Wed Feb 28 10:04:14 2024 -0500
TIKA-4202 -- add ocr page count to PDFs -- actually increment counter and move the location of the counter to before OCR is invoked (#1630)
---
tika-core/src/main/java/org/apache/tika/metadata/PDF.java | 4 ++++
.../main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java | 6 ++++++
.../src/main/java/org/apache/tika/parser/pdf/OCRPageCounter.java | 4 ++++
.../src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java | 8 ++++++++
.../java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java | 9 +++++++++
5 files changed, 31 insertions(+)
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/PDF.java b/tika-core/src/main/java/org/apache/tika/metadata/PDF.java
index a6c753fcd..b15c10383 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/PDF.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/PDF.java
@@ -209,6 +209,10 @@ public interface PDF {
Property.composite( Property.externalInteger(PDF_PREFIX + "incrementalUpdateCount"),
new Property[]{ TikaCoreProperties.VERSION_COUNT });
+ /**
+ * This counts the number of pages that would have been OCR'd or were OCR'd depending
+ * on the OCR settings. If NO_OCR is selected, this will not be incremented.
+ */
Property OCR_PAGE_COUNT = Property.externalInteger(PDF_PREFIX + "ocrPageCount");
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
index e03e14a4f..4d0a08226 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
@@ -533,8 +533,14 @@ class AbstractPDF2XHTML extends PDFTextStripper {
void doOCROnCurrentPage(PDPage pdPage, PDFParserConfig.OCR_STRATEGY ocrStrategy)
throws IOException, TikaException, SAXException {
if (ocrStrategy.equals(NO_OCR)) {
+ //I don't think this is reachable?
return;
}
+ //count the number of times that OCR would have been called
+ OCRPageCounter c = context.get(OCRPageCounter.class);
+ if (c != null) {
+ c.increment();
+ }
MediaType ocrImageMediaType = MediaType.image("ocr-" + config.getOcrImageFormatName());
if (!ocrParser.getSupportedTypes(context).contains(ocrImageMediaType)) {
if (ocrStrategy == OCR_ONLY || ocrStrategy == OCR_AND_TEXT_EXTRACTION) {
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/OCRPageCounter.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/OCRPageCounter.java
index 3b382099b..418419eee 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/OCRPageCounter.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/OCRPageCounter.java
@@ -16,6 +16,10 @@
*/
package org.apache.tika.parser.pdf;
+/**
+ * This counts the number of pages on which OCR would have been
+ * run or was run, depending on the settings.
+ */
public class OCRPageCounter {
private int count;
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index 0269a58ef..9406cac53 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -1432,6 +1432,14 @@ public class PDFParserTest extends TikaTest {
metadataList.get(1).get(TikaCoreProperties.EMBEDDED_EXCEPTION));
}
+ @Test
+ public void testDefaultPDFOCR() throws Exception {
+ //test that even with no ocr -- there is no tesseract ocr parser in this module --
+ // AUTO mode would have returned one page that would have been OCR'd had there been OCR.
+ List<Metadata> metadataList = getRecursiveMetadata("testOCR.pdf");
+ assertEquals(1, metadataList.size());
+ assertEquals(1, metadataList.get(0).getInt(PDF.OCR_PAGE_COUNT));
+ }
/**
* TODO -- need to test signature extraction
*/
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
index d57d0e9aa..6ce19e3dd 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
@@ -30,6 +30,7 @@ import org.apache.tika.TikaTest;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.exception.TikaConfigException;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.PDF;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AutoDetectParser;
@@ -73,8 +74,16 @@ public class TesseractOCRParserTest extends TikaTest {
assertEquals(ImageParser.class, defaultParser.getParsers(parseContext).get(png).getClass());
}
+ @Test
+ public void testDefaultPDFOCR() throws Exception {
+ List<Metadata> metadataList = getRecursiveMetadata("testOCR.pdf");
+ assertEquals(1, metadataList.size());
+ assertEquals(1, metadataList.get(0).getInt(PDF.OCR_PAGE_COUNT));
+ }
+
@Test
public void testPDFOCR() throws Exception {
+ assumeTrue(canRun(), "can run OCR");
String resource = "testOCR.pdf";
String[] nonOCRContains = new String[0];
testBasicOCR(resource, nonOCRContains, 2);