You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2021/02/16 18:14:52 UTC

[tika] branch main updated: TIKA-3298 -- parse the language string before checking validity or existence of supported langs in tesseract

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new 4d5075a  TIKA-3298 -- parse the language string before checking validity or existence of supported langs in tesseract
4d5075a is described below

commit 4d5075a7a8c9c3a7144918ced20c2da581946e2f
Author: tballison <ta...@apache.org>
AuthorDate: Tue Feb 16 13:14:37 2021 -0500

    TIKA-3298 -- parse the language string before checking validity or existence of supported langs in tesseract
---
 .../apache/tika/parser/ocr/TesseractOCRConfig.java | 52 +++++++++++++++-------
 .../apache/tika/parser/ocr/TesseractOCRParser.java | 29 +++++++++---
 2 files changed, 59 insertions(+), 22 deletions(-)

diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
index 79b05d6..67485f6 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
@@ -17,6 +17,7 @@
 package org.apache.tika.parser.ocr;
 
 import org.apache.tika.exception.TikaException;
+import org.apache.tika.utils.StringUtils;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -136,27 +137,15 @@ public class TesseractOCRConfig implements Serializable {
      * Multiple languages may be specified, separated by plus characters.
      * e.g. "chi_tra+chi_sim+script/Arabic"
      */
-    public void setLanguage(String language) {
-        // Get rid of embedded spaces
-        language = language.replaceAll("\\s", "");
-        // Test for leading or trailing +
-        if (language.matches("\\+.*|.*\\+")) {
-            throw new IllegalArgumentException("Invalid syntax - Can't start or end with +" + language);
-        }
-        // Split on the + sign
-        final String[] langs = language.split("\\+");
-        List<String> invalidCodes = new ArrayList<>();
-        for (String lang : langs) {
-            // First, make sure it conforms to the correct syntax
-            if (!lang.matches("([a-zA-Z]{3}(_[a-zA-Z]{3,4}){0,2})|script(/|\\\\)[A-Z][a-zA-Z_]+")) {
-                invalidCodes.add(lang + " (invalid syntax)");
-            }
-        }
+    public void setLanguage(String languageString) {
+        Set<String> invalidCodes = new HashSet<>();
+        Set<String> validCodes = new HashSet<>();
+        getLangs(languageString, validCodes, invalidCodes);
         if (!invalidCodes.isEmpty()) {
             throw new IllegalArgumentException(
                     "Invalid language code(s): " + invalidCodes);
         }
-        this.language = language;
+        this.language = languageString;
         userConfigured.add("language");
     }
 
@@ -546,4 +535,33 @@ public class TesseractOCRConfig implements Serializable {
         }
         return updated;
     }
+
+    /**
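+     * Parses a '+'-delimited language string and bins each language code into
+     * validLangs or invalidLangs, depending on whether it matches the language-code regex.
+     * @param language language string to parse, e.g. "chi_tra+chi_sim"; blank input adds nothing
+     * @param validLangs set that receives each code that matches the expected syntax
+     * @param invalidLangs set that receives each non-matching code, suffixed with " (invalid syntax)"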
+     */
+    public static void getLangs(String language, Set<String> validLangs, Set<String> invalidLangs) {
+        if (StringUtils.isBlank(language)) {
+            return;
+        }
+        // Strip all whitespace, including embedded spaces
+        language = language.replaceAll("\\s", "");
+        // Test for leading or trailing +
+        if (language.matches("\\+.*|.*\\+")) {
+            throw new IllegalArgumentException("Invalid syntax - Can't start or end with +" + language);
+        }
+        // Split on the + sign
+        final String[] langs = language.split("\\+");
+        for (String lang : langs) {
+            // First, make sure it conforms to the correct syntax
+            if (!lang.matches("([a-zA-Z]{3}(_[a-zA-Z]{3,4}){0,2})|script(/|\\\\)[A-Z][a-zA-Z_]+")) {
+                invalidLangs.add(lang + " (invalid syntax)");
+            } else {
+                validLangs.add(lang);
+            }
+        }
+    }
 }
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
index 5e70a99..f49254d 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
@@ -256,6 +256,8 @@ public class TesseractOCRParser extends AbstractParser implements Initializable
                        TesseractOCRConfig config)
             throws IOException, SAXException, TikaException {
         warnOnFirstParse();
+        validateLangString(config.getLanguage());
+
         File tmpTxtOutput = null;
         try {
             Path input = tikaInputStream.getPath();
@@ -321,11 +323,8 @@ public class TesseractOCRParser extends AbstractParser implements Initializable
      * @throws IOException   if an input error occurred
      */
     private void doOCR(File input, File output, TesseractOCRConfig config) throws IOException, TikaException {
-        if (langs.size() > 0 && !langs.contains(config.getLanguage())) {
-            throw new IllegalArgumentException("Couldn't find language " +
-                    config.getLanguage() + " upon initialization. I did find: "
-                    + langs);
-        }
+
+
         ArrayList<String> cmd = new ArrayList<>(Arrays.asList(
                 getTesseractPath().toString() + getTesseractProg(), input.getPath(), output.getPath(), "-l",
                 config.getLanguage(), "--psm", config.getPageSegMode()
@@ -468,11 +467,31 @@ public class TesseractOCRParser extends AbstractParser implements Initializable
         hasImageMagick = hasImageMagick();
         if (preloadLangs) {
             preloadLangs();
+            if (! StringUtils.isBlank(defaultConfig.getLanguage())) {
+                validateLangString(defaultConfig.getLanguage());
+            }
         }
         imagePreprocessor = new ImagePreprocessor(
                 getImageMagickPath() + getImageMagickProg());
     }
 
+    private void validateLangString(String language) throws TikaConfigException {
+        Set<String> invalidlangs = new HashSet<>();
+        Set<String> validLangs = new HashSet<>();
+        TesseractOCRConfig.getLangs(language, validLangs, invalidlangs);
+        if (invalidlangs.size() > 0) {
+            throw new TikaConfigException( "Invalid language code(s): " + invalidlangs);
+        }
+        if (langs.size() > 0) {
+            for (String lang : validLangs) {
+                if (!langs.contains(lang)) {
+                    throw new TikaConfigException("tesseract does not have "
+                            + lang + " available. I see only: " + langs);
+                }
+            }
+        }
+    }
+
     @Override
     public void checkInitialization(InitializableProblemHandler problemHandler)
             throws TikaConfigException {