You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2021/02/16 20:08:22 UTC

[tika] branch main updated: TIKA-3298 -- allow empty string

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new 0fac4f8  TIKA-3298 -- allow empty string
0fac4f8 is described below

commit 0fac4f8efdc9964db3780b6fcfc1010483554cb7
Author: tballison <ta...@apache.org>
AuthorDate: Tue Feb 16 15:08:11 2021 -0500

    TIKA-3298 -- allow empty string
---
 .../main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java | 9 ++++++---
 .../java/org/apache/tika/parser/ocr/TesseractOCRConfigTest.java  | 3 ++-
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
index f49254d..3705f99 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
@@ -324,11 +324,14 @@ public class TesseractOCRParser extends AbstractParser implements Initializable
      */
     private void doOCR(File input, File output, TesseractOCRConfig config) throws IOException, TikaException {
 
-
         ArrayList<String> cmd = new ArrayList<>(Arrays.asList(
-                getTesseractPath().toString() + getTesseractProg(), input.getPath(), output.getPath(), "-l",
-                config.getLanguage(), "--psm", config.getPageSegMode()
+                getTesseractPath() + getTesseractProg(), input.getPath(), output.getPath(),
+                "--psm", config.getPageSegMode()
         ));
+        if (! StringUtils.isBlank(config.getLanguage())) {
+            cmd.add("-l");
+            cmd.add(config.getLanguage());
+        }
         for (Map.Entry<String, String> entry : config.getOtherTesseractConfig().entrySet()) {
             cmd.add("-c");
             cmd.add(entry.getKey() + "=" + entry.getValue());
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRConfigTest.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRConfigTest.java
index 02fc149..3aa87d2 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRConfigTest.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRConfigTest.java
@@ -105,7 +105,8 @@ public class TesseractOCRConfigTest extends TikaTest {
     @Test
     public void testValidateInvalidLanguage() {
         List<String> invalidLanguages = Arrays.asList(
-                "", "+", "en", "en+", "eng+fra+", "Arabic", "/script/Arabic", "rm -rf *");
+                //"", allow empty string
+                "+", "en", "en+", "eng+fra+", "Arabic", "/script/Arabic", "rm -rf *");
 
         TesseractOCRConfig config = new TesseractOCRConfig();