You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2024/02/23 20:54:30 UTC
(tika) branch main updated: TIKA-4202 -- add ocr page count to metadata for PDFs (#1621)
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 72927ec17 TIKA-4202 -- add ocr page count to metadata for PDFs (#1621)
72927ec17 is described below
commit 72927ec17681490655923dd83924215439b664b4
Author: Tim Allison <ta...@apache.org>
AuthorDate: Fri Feb 23 15:54:24 2024 -0500
TIKA-4202 -- add ocr page count to metadata for PDFs (#1621)
* TIKA-4202 -- add ocr page count to PDFs
---
.../main/java/org/apache/tika/metadata/PDF.java | 2 ++
.../org/apache/tika/parser/pdf/OCRPageCounter.java | 30 ++++++++++++++++++++++
.../java/org/apache/tika/parser/pdf/PDFParser.java | 6 +++++
.../org/apache/tika/parser/pdf/PDFParserTest.java | 2 +-
4 files changed, 39 insertions(+), 1 deletion(-)
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/PDF.java b/tika-core/src/main/java/org/apache/tika/metadata/PDF.java
index c2baca0e8..a6c753fcd 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/PDF.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/PDF.java
@@ -209,4 +209,6 @@ public interface PDF {
Property.composite( Property.externalInteger(PDF_PREFIX + "incrementalUpdateCount"),
new Property[]{ TikaCoreProperties.VERSION_COUNT });
+ Property OCR_PAGE_COUNT = Property.externalInteger(PDF_PREFIX + "ocrPageCount");
+
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/OCRPageCounter.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/OCRPageCounter.java
new file mode 100644
index 000000000..3b382099b
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/OCRPageCounter.java
@@ -0,0 +1,30 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pdf;
+
+public class OCRPageCounter { // mutable tally; PDFParser puts a fresh instance in the parse context for each parse
+
+ private int count; // running total; an int field starts at 0 by Java default
+
+ public void increment() { // adds one; presumably called once per OCR'd page (caller not shown here -- confirm)
+ count++; // plain unsynchronized increment
+ }
+
+ public int getCount() { // returns the current total; PDFParser writes it to the OCR_PAGE_COUNT metadata property
+ return count;
+ }
+}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
index c93571daf..f21b65d4e 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
@@ -16,6 +16,8 @@
*/
package org.apache.tika.parser.pdf;
+import static org.apache.tika.metadata.PDF.OCR_PAGE_COUNT;
+
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Path;
@@ -158,6 +160,8 @@ public class PDFParser implements Parser, RenderingParser, Initializable {
PDFRenderingState incomingRenderingState = context.get(PDFRenderingState.class);
TikaInputStream tstream = null;
boolean shouldClose = false;
+ OCRPageCounter prevOCRCounter = context.get(OCRPageCounter.class);
+ context.set(OCRPageCounter.class, new OCRPageCounter());
try {
if (shouldSpool(localConfig)) {
if (stream instanceof TikaInputStream) {
@@ -220,6 +224,8 @@ public class PDFParser implements Parser, RenderingParser, Initializable {
metadata.set(PDF.IS_ENCRYPTED, "true");
throw new EncryptedDocumentException(e);
} finally {
+ metadata.set(OCR_PAGE_COUNT, context.get(OCRPageCounter.class).getCount());
+ context.set(OCRPageCounter.class, prevOCRCounter);
//reset the incrementalUpdateRecord even if null
context.set(IncrementalUpdateRecord.class, incomingIncrementalUpdateRecord);
PDFRenderingState currState = context.get(PDFRenderingState.class);
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index 6e9167f37..0269a58ef 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -238,7 +238,7 @@ public class PDFParserTest extends TikaTest {
assertEquals("application/pdf", metadata.get(Metadata.CONTENT_TYPE));
assertEquals("true", metadata.get("pdf:encrypted"));
//pdf:encrypted, X-Parsed-By and Content-Type
- assertEquals(4, metadata.names().length, "very little metadata should be parsed");
+ assertEquals(5, metadata.names().length, "very little metadata should be parsed");
assertEquals(0, handler.toString().length());
}