You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2021/02/04 18:19:37 UTC
[tika] 02/02: revert code that checks if language files actually exist
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
commit a6f87f244f434098d0ee1e622b288ee2af6ba4a5
Author: tballison <ta...@apache.org>
AuthorDate: Thu Feb 4 13:19:23 2021 -0500
revert code that checks if language files actually exist
---
.../apache/tika/parser/ocr/TesseractOCRConfig.java | 31 ----------------------
.../tika/parser/ocr/TesseractOCRParserTest.java | 2 +-
2 files changed, 1 insertion(+), 32 deletions(-)
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
index d1fe016..faa5ec3 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
@@ -70,9 +70,6 @@ public class TesseractOCRConfig implements Serializable {
// Path to the 'tessdata' folder, which contains language files and config files.
private String tessdataPath = "";
- // Actual path to tessdata, if not specified by user and we have to find it ourselves
- private static File windowsActualTessdataDir;
-
// Language dictionary to be used.
private String language = "eng";
@@ -277,8 +274,6 @@ public class TesseractOCRConfig implements Serializable {
// First, make sure it conforms to the correct syntax
if (!lang.matches("([a-zA-Z]{3}(_[a-zA-Z]{3,4}){0,2})|script(/|\\\\)[A-Z][a-zA-Z_]+")) {
invalidCodes.add(lang + " (invalid syntax)");
- } else if (!langExists(lang)) {
- invalidCodes.add(lang + " (not found)");
}
}
if (!invalidCodes.isEmpty()) {
@@ -287,32 +282,6 @@ public class TesseractOCRConfig implements Serializable {
this.language = language;
}
-
- /**
- * Check if tessdata language model exists
- */
- private boolean langExists(String lang) {
- if (windowsActualTessdataDir == null) {
- // Use the same logic used in TesseractOCRParser.setEnv(). If tessdataPath is not specified then use tesseractPath, if specified
- if (!tessdataPath.isEmpty()) {
- windowsActualTessdataDir = new File(tessdataPath);
- } else if (!tesseractPath.isEmpty()) {
- windowsActualTessdataDir = new File(tesseractPath, "tessdata");
- } else {
- // Neither path was specified, so we'll just assume
- // the language is good and rely on Tesseract to tell us if there's a problem
- return true;
- }
- }
-
- if (!windowsActualTessdataDir.isDirectory()) {
- throw new RuntimeException(windowsActualTessdataDir + " is not a directory");
- }
- String trainedDataName = lang + ".traineddata";
- return new File(windowsActualTessdataDir, trainedDataName).exists();
- }
-
-
/**
* @see #setPageSegMode(String pageSegMode)
*/
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
index 64c8453..466db73 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
@@ -98,7 +98,7 @@ public class TesseractOCRParserTest extends TikaTest {
assumeTrue("can run OCR", canRun());
TesseractOCRConfig tesseractOCRConfigconfig = new TesseractOCRConfig();
- tesseractOCRConfigconfig.setLanguage("kerplekistanese");
+ tesseractOCRConfigconfig.setLanguage("zzz");
ParseContext parseContext = new ParseContext();
parseContext.set(TesseractOCRConfig.class, tesseractOCRConfigconfig);