You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2021/02/11 18:46:32 UTC

[tika] branch main updated: TIKA-3297 further updates

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new 85eb3ba  TIKA-3297 further updates
85eb3ba is described below

commit 85eb3ba370abcb9b21d7f9c294ebf0742f6a318b
Author: tballison <ta...@apache.org>
AuthorDate: Thu Feb 11 13:46:09 2021 -0500

    TIKA-3297 further updates
---
 .../java/org/apache/tika/config/ServiceLoader.java |  1 +
 .../apache/tika/parser/ocr/TesseractOCRParser.java | 16 ++++++-------
 .../tika/parser/ocr/TesseractOCRParserTest.java    | 25 +++++++++++++++++++-
 .../apache/tika/parser/strings/StringsParser.java  |  9 ++++----
 .../tika/parser/strings/StringsParserTest.java     |  3 ++-
 .../apache/tika/parser/sqlite3/SQLite3Parser.java  | 27 +---------------------
 6 files changed, 40 insertions(+), 41 deletions(-)

diff --git a/tika-core/src/main/java/org/apache/tika/config/ServiceLoader.java b/tika-core/src/main/java/org/apache/tika/config/ServiceLoader.java
index 3056e76..6debe2e 100644
--- a/tika-core/src/main/java/org/apache/tika/config/ServiceLoader.java
+++ b/tika-core/src/main/java/org/apache/tika/config/ServiceLoader.java
@@ -374,6 +374,7 @@ public class ServiceLoader {
                         if (! shouldExclude) {
                             T instance = (T) klass.getConstructor().newInstance();
                             if (instance instanceof Initializable) {
+                                ((Initializable) instance).initialize(Collections.EMPTY_MAP);
                                 ((Initializable) instance).checkInitialization(initializableProblemHandler);
                             }
                             providers.add(instance);
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
index d349d33..93a8a70 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
@@ -469,17 +469,19 @@ public class TesseractOCRParser extends AbstractParser implements Initializable
 
     @Override
     public void initialize(Map<String, Param> params) throws TikaConfigException {
-        //no-op
-    }
-
-    @Override
-    public void checkInitialization(InitializableProblemHandler problemHandler)
-            throws TikaConfigException {
         hasTesseract = hasTesseract();
         hasImageMagick = hasImageMagick();
         if (preloadLangs) {
             preloadLangs();
         }
+        imagePreprocessor = new ImagePreprocessor(
+                getImageMagickPath() + getImageMagickProg());
+    }
+
+    @Override
+    public void checkInitialization(InitializableProblemHandler problemHandler)
+            throws TikaConfigException {
+
         if (langs.size() > 0 &&
                 ! StringUtils.isBlank(defaultConfig.getLanguage())) {
             if (! langs.contains(defaultConfig.getLanguage())) {
@@ -490,8 +492,6 @@ public class TesseractOCRParser extends AbstractParser implements Initializable
                 );
             }
         }
-        imagePreprocessor = new ImagePreprocessor(
-                getImageMagickPath() + getImageMagickProg());
     }
 
     public Set<String> getLangs() {
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
index bfea24f..6372964 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
@@ -17,6 +17,7 @@
 package org.apache.tika.parser.ocr;
 
 import org.apache.tika.TikaTest;
+import org.apache.tika.config.InitializableProblemHandler;
 import org.apache.tika.config.TikaConfig;
 import org.apache.tika.exception.TikaConfigException;
 import org.apache.tika.exception.TikaException;
@@ -39,6 +40,8 @@ import org.junit.Test;
 
 import java.io.File;
 import java.io.InputStream;
+import java.util.Collection;
+import java.util.Collections;
 import java.util.HashSet;
 import java.util.Set;
 import java.util.regex.Matcher;
@@ -54,6 +57,7 @@ public class TesseractOCRParserTest extends TikaTest {
 
     public static boolean canRun() throws TikaConfigException {
         TesseractOCRParser p = new TesseractOCRParser();
+        p.initialize(Collections.EMPTY_MAP);
         return p.hasTesseract();
     }
 
@@ -123,8 +127,8 @@ public class TesseractOCRParserTest extends TikaTest {
 
     @Test
     public void testPositiveRotateOCR() throws Exception {
-        TesseractOCRParser p = new TesseractOCRParser();
         assumeTrue(canRun());
+        TesseractOCRParser p = new TesseractOCRParser();
         assumeTrue(p.hasImageMagick());
         TesseractOCRConfig config = new TesseractOCRConfig();
         config.setApplyRotation(true);
@@ -300,4 +304,23 @@ public class TesseractOCRParserTest extends TikaTest {
         parser.setTesseractPath("blahdeblahblah");
         assertEquals("blahdeblahblah" + File.separator, parser.getTesseractPath());
     }
+
+    @Test
+    public void testThreadJoinInLoadingLangs() throws Exception {
+        assumeTrue(canRun());
+        //make sure that the stream is fully read and
+        //we're getting the same answers on several iterations
+        Set<String> langs = getLangs();
+        assumeTrue(langs.size() > 0);
+        for (int i = 0; i < 20; i++) {
+            assertEquals(langs, getLangs());
+        }
+    }
+
+    private Set<String> getLangs() throws Exception {
+        TesseractOCRParser p = new TesseractOCRParser();
+        p.setPreloadLangs(true);
+        p.initialize(Collections.EMPTY_MAP);
+        return p.getLangs();
+    }
 }
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/strings/StringsParser.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/strings/StringsParser.java
index ae5a69b..f60472f 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/strings/StringsParser.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/strings/StringsParser.java
@@ -289,15 +289,14 @@ public class StringsParser extends AbstractParser implements Initializable {
 
 	@Override
 	public void initialize(Map<String, Param> params) throws TikaConfigException {
-
+		checkForStrings();
+		fileCommandDetector = new FileCommandDetector();
+		fileCommandDetector.setFilePath(filePath);
+		fileCommandDetector.setTimeoutMs(defaultStringsConfig.getTimeoutSeconds()*1000);
 	}
 
 	@Override
 	public void checkInitialization(
 			InitializableProblemHandler problemHandler) throws TikaConfigException {
-		checkForStrings();
-		fileCommandDetector = new FileCommandDetector();
-		fileCommandDetector.setFilePath(filePath);
-		fileCommandDetector.setTimeoutMs(defaultStringsConfig.getTimeoutSeconds()*1000);
 	}
 }
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/strings/StringsParserTest.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/strings/StringsParserTest.java
index b977d89..5893e1a 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/strings/StringsParserTest.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/strings/StringsParserTest.java
@@ -19,6 +19,7 @@ import static org.junit.Assume.assumeTrue;
 
 import java.io.InputStream;
 import java.util.Arrays;
+import java.util.Collections;
 
 import org.apache.tika.config.Initializable;
 import org.apache.tika.config.InitializableProblemHandler;
@@ -50,7 +51,7 @@ public class StringsParserTest {
 		StringsConfig stringsConfig = new StringsConfig();
 
 		Parser parser = new StringsParser();
-		((Initializable)parser).checkInitialization(InitializableProblemHandler.IGNORE);
+		((Initializable)parser).initialize(Collections.emptyMap());
 		ContentHandler handler = new BodyContentHandler();
 		Metadata metadata = new Metadata();
 
diff --git a/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/sqlite3/SQLite3Parser.java b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/sqlite3/SQLite3Parser.java
index da35f26..83000ba 100644
--- a/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/sqlite3/SQLite3Parser.java
+++ b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/sqlite3/SQLite3Parser.java
@@ -51,8 +51,6 @@ import org.xml.sax.SAXException;
  * that has to be created.
  */
 public class SQLite3Parser extends AbstractParser implements Initializable {
-    private static volatile boolean HAS_WARNED = false;
-    private static final Object[] LOCK = new Object[0];
 
     /**
      * Serial version UID
@@ -63,14 +61,7 @@ public class SQLite3Parser extends AbstractParser implements Initializable {
 
     private static final Set<MediaType> SUPPORTED_TYPES;
     static {
-        Set<MediaType> tmp;
-        try {
-            Class.forName(SQLite3DBParser.SQLITE_CLASS_NAME);
-            tmp = Collections.singleton(MEDIA_TYPE);
-        } catch (ClassNotFoundException e) {
-            tmp = Collections.EMPTY_SET;
-        }
-        SUPPORTED_TYPES = Collections.unmodifiableSet(tmp);
+        SUPPORTED_TYPES = Collections.singleton(MEDIA_TYPE);
     }
     /**
      * Checks to see if class is available for org.sqlite.JDBC.
@@ -104,21 +95,5 @@ public class SQLite3Parser extends AbstractParser implements Initializable {
 
     @Override
     public void checkInitialization(InitializableProblemHandler problemHandler) throws TikaConfigException {
-        if (SUPPORTED_TYPES.size() == 0) {
-            if (HAS_WARNED) {
-                return;
-            }
-            synchronized (LOCK) {
-                //check again while under the lock
-                if (HAS_WARNED) {
-                    return;
-                }
-                problemHandler.handleInitializableProblem("org.apache.tika.parser.SQLite3Parser",
-                        "org.xerial's sqlite-jdbc is not loaded.\n" +
-                                "Please provide the jar on your classpath to parse sqlite files.\n" +
-                                "See tika-parsers/pom.xml for the correct version.");
-                HAS_WARNED = true;
-            }
-        }
     }
 }