You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2021/02/04 18:19:37 UTC

[tika] 02/02: revert code that checks if language files actually exist

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git

commit a6f87f244f434098d0ee1e622b288ee2af6ba4a5
Author: tballison <ta...@apache.org>
AuthorDate: Thu Feb 4 13:19:23 2021 -0500

    revert code that checks if language files actually exist
---
 .../apache/tika/parser/ocr/TesseractOCRConfig.java | 31 ----------------------
 .../tika/parser/ocr/TesseractOCRParserTest.java    |  2 +-
 2 files changed, 1 insertion(+), 32 deletions(-)

diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
index d1fe016..faa5ec3 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
@@ -70,9 +70,6 @@ public class TesseractOCRConfig implements Serializable {
     // Path to the 'tessdata' folder, which contains language files and config files.
     private String tessdataPath = "";
 
-    // Actual path to tessdata, if not specified by user and we have to find it ourselves
-    private static File windowsActualTessdataDir;
-
     // Language dictionary to be used.
     private String language = "eng";
 
@@ -277,8 +274,6 @@ public class TesseractOCRConfig implements Serializable {
             // First, make sure it conforms to the correct syntax
             if (!lang.matches("([a-zA-Z]{3}(_[a-zA-Z]{3,4}){0,2})|script(/|\\\\)[A-Z][a-zA-Z_]+")) {
                 invalidCodes.add(lang + " (invalid syntax)");
-            } else if (!langExists(lang)) {
-                invalidCodes.add(lang + " (not found)");
             }
         }
         if (!invalidCodes.isEmpty()) {
@@ -287,32 +282,6 @@ public class TesseractOCRConfig implements Serializable {
         this.language = language;
     }
 
-
-    /**
-     * Check if tessdata language model exists
-     */
-    private boolean langExists(String lang) {
-        if (windowsActualTessdataDir == null) {
-            // Use the same logic used in TesseractOCRParser.setEnv().  If tessdataPath is not specified then use tesseractPath, if specified
-            if (!tessdataPath.isEmpty()) {
-                windowsActualTessdataDir = new File(tessdataPath);
-            } else if (!tesseractPath.isEmpty()) {
-                windowsActualTessdataDir = new File(tesseractPath, "tessdata");
-            } else {
-                // Neither path was specified, so we'll just assume
-                // the language is good and rely on Tesseract to tell us if there's a problem
-                return true;
-            }
-        }
-
-        if (!windowsActualTessdataDir.isDirectory()) {
-            throw new RuntimeException(windowsActualTessdataDir + " is not a directory");
-        }
-        String trainedDataName = lang + ".traineddata";
-        return new File(windowsActualTessdataDir, trainedDataName).exists();
-    }
-
-
     /**
      * @see #setPageSegMode(String pageSegMode)
      */
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
index 64c8453..466db73 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
@@ -98,7 +98,7 @@ public class TesseractOCRParserTest extends TikaTest {
         assumeTrue("can run OCR", canRun());
 
         TesseractOCRConfig tesseractOCRConfigconfig = new TesseractOCRConfig();
-        tesseractOCRConfigconfig.setLanguage("kerplekistanese");
+        tesseractOCRConfigconfig.setLanguage("zzz");
         ParseContext parseContext = new ParseContext();
         parseContext.set(TesseractOCRConfig.class, tesseractOCRConfigconfig);