You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2019/12/02 19:38:03 UTC

[tika] branch branch_1x updated: TIKA-3002 -- fix bug in OCR AUTO mode

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/branch_1x by this push:
     new f67a834  TIKA-3002 -- fix bug in OCR AUTO mode
f67a834 is described below

commit f67a83444036d4fb5b23e9000f06434bfb58eefc
Author: tallison <ta...@apache.org>
AuthorDate: Mon Dec 2 11:03:02 2019 -0500

    TIKA-3002 -- fix bug in OCR AUTO mode
---
 .../org/apache/tika/parser/pdf/AbstractPDF2XHTML.java     |  5 +++--
 .../java/org/apache/tika/parser/pdf/PDFParserTest.java    | 15 +++++++++++++++
 2 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
index 2f41eec..ea3b173 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
@@ -370,8 +370,6 @@ class AbstractPDF2XHTML extends PDFTextStripper {
         metadata.add(PDF.CHARACTERS_PER_PAGE, totalCharsPerPage);
         metadata.add(PDF.UNMAPPED_UNICODE_CHARS_PER_PAGE,
                 unmappedUnicodeCharsPerPage);
-        totalCharsPerPage = 0;
-        unmappedUnicodeCharsPerPage = 0;
 
         try {
             for (PDAnnotation annotation : page.getAnnotations()) {
@@ -458,6 +456,9 @@ class AbstractPDF2XHTML extends PDFTextStripper {
             throw new IOExceptionWithCause("Unable to end a page", e);
         } catch (IOException e) {
             handleCatchableIOE(e);
+        } finally {
+            totalCharsPerPage = 0;
+            unmappedUnicodeCharsPerPage = 0;
         }
 
         if (config.getExtractFontNames()) {
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index 31dc2a6..8709451 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -1301,6 +1301,21 @@ public class PDFParserTest extends TikaTest {
     }
 
     @Test
+    public void testOCRAutoMode() throws Exception {
+        assumeTrue("can run OCR", canRunOCR());
+        PDFParserConfig config = new PDFParserConfig();
+        config.setOcrStrategy(PDFParserConfig.OCR_STRATEGY.AUTO);
+        ParseContext context = new ParseContext();
+        context.set(PDFParserConfig.class, config);
+        XMLResult xmlResult = getXML("testOCR.pdf", context);
+        assertContains("Happy New Year", xmlResult.xml);
+
+        config.setOcrStrategy(PDFParserConfig.OCR_STRATEGY.NO_OCR);
+        String txt = getText("testOCR.pdf", new Metadata(), context);
+        assertEquals("", txt.trim());
+    }
+
+    @Test
     public void testTesseractInitializationWorks() throws Exception {
         //TIKA-2970 -- make sure that configurations set on the TesseractOCRParser
         //make it through to when the TesseractOCRParser is called via