You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2018/03/29 19:15:23 UTC

[tika] 03/04: Fix for TIKA-2613 contributed by ewanmellor.

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 2efe3f97a7df0ac8863b225beb2deb41e99c1e90
Author: Ewan Mellor <co...@ewanmellor.org>
AuthorDate: Mon Mar 26 16:25:31 2018 -0700

    Fix for TIKA-2613 contributed by ewanmellor.
    
    Change -psm on the Tesseract command line to --psm, with two dashes.
    This matches a change in Tesseract 4.0 that removes the one-dash version,
    which has been deprecated since Nov 2016.
    
    The corresponding upstream Tesseract changeset is ee201e1f4.
    
    Also, move the config file (i.e. getOutputType in Tika's terms) so that it
    is the last parameter on the command line.  Tesseract logs an error
    message (though otherwise doesn't fail) if the config file is not the
    last thing on the command line.
---
 .../main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java    | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
index 6bf2ab4..f274ce1 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
@@ -468,8 +468,7 @@ public class TesseractOCRParser extends AbstractParser implements Initializable
     private void doOCR(File input, File output, TesseractOCRConfig config) throws IOException, TikaException {
         ArrayList<String> cmd = new ArrayList<>(Arrays.asList(
                 config.getTesseractPath() + getTesseractProg(), input.getPath(),  output.getPath(), "-l",
-                config.getLanguage(), "-psm", config.getPageSegMode(),
-                config.getOutputType().name().toLowerCase(Locale.US)
+                config.getLanguage(), "--psm", config.getPageSegMode()
         ));
         for (Map.Entry<String, String> entry : config.getOtherTesseractConfig().entrySet()) {
             cmd.add("-c");
@@ -478,7 +477,8 @@ public class TesseractOCRParser extends AbstractParser implements Initializable
         cmd.addAll(Arrays.asList(
                 "-c", "page_separator=" + config.getPageSeparator(),
                 "-c",
-                (config.getPreserveInterwordSpacing())? "preserve_interword_spaces=1" : "preserve_interword_spaces=0"
+                (config.getPreserveInterwordSpacing())? "preserve_interword_spaces=1" : "preserve_interword_spaces=0",
+                config.getOutputType().name().toLowerCase(Locale.US)
         ));
         ProcessBuilder pb = new ProcessBuilder(cmd);
         setEnv(config, pb);

-- 
To stop receiving notification emails like this one, please contact
tallison@apache.org.