You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2021/02/16 18:14:52 UTC
[tika] branch main updated: TIKA-3298 -- parse the language string
before checking validity or existence of supported langs in tesseract
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 4d5075a TIKA-3298 -- parse the language string before checking validity or existence of supported langs in tesseract
4d5075a is described below
commit 4d5075a7a8c9c3a7144918ced20c2da581946e2f
Author: tballison <ta...@apache.org>
AuthorDate: Tue Feb 16 13:14:37 2021 -0500
TIKA-3298 -- parse the language string before checking validity or existence of supported langs in tesseract
---
.../apache/tika/parser/ocr/TesseractOCRConfig.java | 52 +++++++++++++++-------
.../apache/tika/parser/ocr/TesseractOCRParser.java | 29 +++++++++---
2 files changed, 59 insertions(+), 22 deletions(-)
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
index 79b05d6..67485f6 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
@@ -17,6 +17,7 @@
package org.apache.tika.parser.ocr;
import org.apache.tika.exception.TikaException;
+import org.apache.tika.utils.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -136,27 +137,15 @@ public class TesseractOCRConfig implements Serializable {
* Multiple languages may be specified, separated by plus characters.
* e.g. "chi_tra+chi_sim+script/Arabic"
*/
- public void setLanguage(String language) {
- // Get rid of embedded spaces
- language = language.replaceAll("\\s", "");
- // Test for leading or trailing +
- if (language.matches("\\+.*|.*\\+")) {
- throw new IllegalArgumentException("Invalid syntax - Can't start or end with +" + language);
- }
- // Split on the + sign
- final String[] langs = language.split("\\+");
- List<String> invalidCodes = new ArrayList<>();
- for (String lang : langs) {
- // First, make sure it conforms to the correct syntax
- if (!lang.matches("([a-zA-Z]{3}(_[a-zA-Z]{3,4}){0,2})|script(/|\\\\)[A-Z][a-zA-Z_]+")) {
- invalidCodes.add(lang + " (invalid syntax)");
- }
- }
+ public void setLanguage(String languageString) {
+ Set<String> invalidCodes = new HashSet<>();
+ Set<String> validCodes = new HashSet<>();
+ getLangs(languageString, validCodes, invalidCodes);
if (!invalidCodes.isEmpty()) {
throw new IllegalArgumentException(
"Invalid language code(s): " + invalidCodes);
}
- this.language = language;
+ this.language = languageString;
userConfigured.add("language");
}
@@ -546,4 +535,33 @@ public class TesseractOCRConfig implements Serializable {
}
return updated;
}
+
+ /**
+ * This takes a language string, parses it and then bins individual langs into
+ * valid or invalid based on regexes against the language codes
+ * @param language
+ * @param validLangs
+ * @param invalidLangs
+ */
+ public static void getLangs(String language, Set<String> validLangs, Set<String> invalidLangs) {
+ if (StringUtils.isBlank(language)) {
+ return;
+ }
+ // Get rid of embedded spaces
+ language = language.replaceAll("\\s", "");
+ // Test for leading or trailing +
+ if (language.matches("\\+.*|.*\\+")) {
+ throw new IllegalArgumentException("Invalid syntax - Can't start or end with +" + language);
+ }
+ // Split on the + sign
+ final String[] langs = language.split("\\+");
+ for (String lang : langs) {
+ // First, make sure it conforms to the correct syntax
+ if (!lang.matches("([a-zA-Z]{3}(_[a-zA-Z]{3,4}){0,2})|script(/|\\\\)[A-Z][a-zA-Z_]+")) {
+ invalidLangs.add(lang + " (invalid syntax)");
+ } else {
+ validLangs.add(lang);
+ }
+ }
+ }
}
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
index 5e70a99..f49254d 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
@@ -256,6 +256,8 @@ public class TesseractOCRParser extends AbstractParser implements Initializable
TesseractOCRConfig config)
throws IOException, SAXException, TikaException {
warnOnFirstParse();
+ validateLangString(config.getLanguage());
+
File tmpTxtOutput = null;
try {
Path input = tikaInputStream.getPath();
@@ -321,11 +323,8 @@ public class TesseractOCRParser extends AbstractParser implements Initializable
* @throws IOException if an input error occurred
*/
private void doOCR(File input, File output, TesseractOCRConfig config) throws IOException, TikaException {
- if (langs.size() > 0 && !langs.contains(config.getLanguage())) {
- throw new IllegalArgumentException("Couldn't find language " +
- config.getLanguage() + " upon initialization. I did find: "
- + langs);
- }
+
+
ArrayList<String> cmd = new ArrayList<>(Arrays.asList(
getTesseractPath().toString() + getTesseractProg(), input.getPath(), output.getPath(), "-l",
config.getLanguage(), "--psm", config.getPageSegMode()
@@ -468,11 +467,31 @@ public class TesseractOCRParser extends AbstractParser implements Initializable
hasImageMagick = hasImageMagick();
if (preloadLangs) {
preloadLangs();
+ if (! StringUtils.isBlank(defaultConfig.getLanguage())) {
+ validateLangString(defaultConfig.getLanguage());
+ }
}
imagePreprocessor = new ImagePreprocessor(
getImageMagickPath() + getImageMagickProg());
}
+ private void validateLangString(String language) throws TikaConfigException {
+ Set<String> invalidlangs = new HashSet<>();
+ Set<String> validLangs = new HashSet<>();
+ TesseractOCRConfig.getLangs(language, validLangs, invalidlangs);
+ if (invalidlangs.size() > 0) {
+ throw new TikaConfigException( "Invalid language code(s): " + invalidlangs);
+ }
+ if (langs.size() > 0) {
+ for (String lang : validLangs) {
+ if (!langs.contains(lang)) {
+ throw new TikaConfigException("tesseract does not have "
+ + lang + " available. I see only: " + langs);
+ }
+ }
+ }
+ }
+
@Override
public void checkInitialization(InitializableProblemHandler problemHandler)
throws TikaConfigException {