You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2018/03/29 19:15:21 UTC

[tika] 01/04: Fix for TIKA-2582 contributed by ewanmellor.

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git

commit d1526d053f91497ac7bcd4509f1555f4347377d6
Author: Ewan Mellor <co...@ewanmellor.org>
AuthorDate: Wed Feb 21 13:09:53 2018 -0800

    Fix for TIKA-2582 contributed by ewanmellor.
    
    Tesseract 4.0 includes a change to use form feed characters to separate
    pages by default in its text output. Previous versions used no separator
    unless you specified the include_page_breaks option.
    
    This confuses any parser that is not expecting the FF.
    ODFParserTest.testOO2Metadata fails, because it is expecting the output of
    a blank image to be the empty string, but now the FF is there.
    
    I haven't seen any other failures, but I expect that user code will now see
    either FF or U+FFFD where it is not expecting it (SafeContentHandler
    replaces the FF with U+FFFD when converting text to XML).
    
    Fix this by setting Tesseract's page_separator option to the empty string.
    This will preserve the no-page-breaks behavior with both Tesseract 3.x and
    4.0.
    
    Also, add an option TesseractOCRConfig.pageSeparator so that user code can
    request the FF or any other separator, if they want it.
---
 .../apache/tika/parser/ocr/TesseractOCRConfig.java | 22 ++++++++++++++++++++++
 .../apache/tika/parser/ocr/TesseractOCRParser.java |  1 +
 2 files changed, 23 insertions(+)

diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
index afe0a21..4139cd2 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
@@ -91,6 +91,9 @@ public class TesseractOCRConfig implements Serializable {
     // factor by which image is to be scaled.
     private int resize = 900;
 
+    // See setPageSeparator.
+    private String pageSeparator = "";
+
     // whether or not to preserve interword spacing
     private boolean preserveInterwordSpacing = false;
 
@@ -256,6 +259,25 @@ public class TesseractOCRConfig implements Serializable {
     }
 
     /**
+     * @see #setPageSeparator(String pageSeparator)
+     */
+    public String getPageSeparator() {
+        return pageSeparator;
+    }
+
+    /**
+     * The page separator to use in plain text output.  This corresponds to Tesseract's page_separator config option.
+     * The default here is the empty string (i.e. no page separators).  Note that this is also the default in
+     * Tesseract 3.x, but in Tesseract 4.0 the default is to use the form feed control character.  We are overriding
+     * Tesseract 4.0's default here.
+     *
+     * @param pageSeparator
+     */
+    public void setPageSeparator(String pageSeparator) {
+        this.pageSeparator = pageSeparator;
+    }
+
+    /**
      * Whether or not to maintain interword spacing.  Default is <code>false</code>.
      *
      * @param preserveInterwordSpacing
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
index 08847fd..3e15c44 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
@@ -468,6 +468,7 @@ public class TesseractOCRParser extends AbstractParser implements Initializable
         String[] cmd = { config.getTesseractPath() + getTesseractProg(), input.getPath(), output.getPath(), "-l",
                 config.getLanguage(), "-psm", config.getPageSegMode(),
                 config.getOutputType().name().toLowerCase(Locale.US),
+                "-c", "page_separator=" + config.getPageSeparator(),
                 "-c",
                 (config.getPreserveInterwordSpacing())? "preserve_interword_spaces=1" : "preserve_interword_spaces=0"};
         ProcessBuilder pb = new ProcessBuilder(cmd);

-- 
To stop receiving notification emails like this one, please contact
tallison@apache.org.