You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2019/12/02 19:38:03 UTC
[tika] branch branch_1x updated: TIKA-3002 -- fix bug in OCR AUTO mode
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/branch_1x by this push:
new f67a834 TIKA-3002 -- fix bug in OCR AUTO mode
f67a834 is described below
commit f67a83444036d4fb5b23e9000f06434bfb58eefc
Author: tallison <ta...@apache.org>
AuthorDate: Mon Dec 2 11:03:02 2019 -0500
TIKA-3002 -- fix bug in OCR AUTO mode
---
.../org/apache/tika/parser/pdf/AbstractPDF2XHTML.java | 5 +++--
.../java/org/apache/tika/parser/pdf/PDFParserTest.java | 15 +++++++++++++++
2 files changed, 18 insertions(+), 2 deletions(-)
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
index 2f41eec..ea3b173 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
@@ -370,8 +370,6 @@ class AbstractPDF2XHTML extends PDFTextStripper {
metadata.add(PDF.CHARACTERS_PER_PAGE, totalCharsPerPage);
metadata.add(PDF.UNMAPPED_UNICODE_CHARS_PER_PAGE,
unmappedUnicodeCharsPerPage);
- totalCharsPerPage = 0;
- unmappedUnicodeCharsPerPage = 0;
try {
for (PDAnnotation annotation : page.getAnnotations()) {
@@ -458,6 +456,9 @@ class AbstractPDF2XHTML extends PDFTextStripper {
throw new IOExceptionWithCause("Unable to end a page", e);
} catch (IOException e) {
handleCatchableIOE(e);
+ } finally {
+ totalCharsPerPage = 0;
+ unmappedUnicodeCharsPerPage = 0;
}
if (config.getExtractFontNames()) {
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index 31dc2a6..8709451 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -1301,6 +1301,21 @@ public class PDFParserTest extends TikaTest {
}
@Test
+ public void testOCRAutoMode() throws Exception {
+ assumeTrue("can run OCR", canRunOCR());
+ PDFParserConfig config = new PDFParserConfig();
+ config.setOcrStrategy(PDFParserConfig.OCR_STRATEGY.AUTO);
+ ParseContext context = new ParseContext();
+ context.set(PDFParserConfig.class, config);
+ XMLResult xmlResult = getXML("testOCR.pdf", context);
+ assertContains("Happy New Year", xmlResult.xml);
+
+ config.setOcrStrategy(PDFParserConfig.OCR_STRATEGY.NO_OCR);
+ String txt = getText("testOCR.pdf", new Metadata(), context);
+ assertEquals("", txt.trim());
+ }
+
+ @Test
public void testTesseractInitializationWorks() throws Exception {
//TIKA-2970 -- make sure that configurations set on the TesseractOCRParser
//make it through to when the TesseractOCRParser is called via