You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2021/02/11 18:46:32 UTC
[tika] branch main updated: TIKA-3297 further updates
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 85eb3ba TIKA-3297 further updates
85eb3ba is described below
commit 85eb3ba370abcb9b21d7f9c294ebf0742f6a318b
Author: tballison <ta...@apache.org>
AuthorDate: Thu Feb 11 13:46:09 2021 -0500
TIKA-3297 further updates
---
.../java/org/apache/tika/config/ServiceLoader.java | 1 +
.../apache/tika/parser/ocr/TesseractOCRParser.java | 16 ++++++-------
.../tika/parser/ocr/TesseractOCRParserTest.java | 25 +++++++++++++++++++-
.../apache/tika/parser/strings/StringsParser.java | 9 ++++----
.../tika/parser/strings/StringsParserTest.java | 3 ++-
.../apache/tika/parser/sqlite3/SQLite3Parser.java | 27 +---------------------
6 files changed, 40 insertions(+), 41 deletions(-)
diff --git a/tika-core/src/main/java/org/apache/tika/config/ServiceLoader.java b/tika-core/src/main/java/org/apache/tika/config/ServiceLoader.java
index 3056e76..6debe2e 100644
--- a/tika-core/src/main/java/org/apache/tika/config/ServiceLoader.java
+++ b/tika-core/src/main/java/org/apache/tika/config/ServiceLoader.java
@@ -374,6 +374,7 @@ public class ServiceLoader {
if (! shouldExclude) {
T instance = (T) klass.getConstructor().newInstance();
if (instance instanceof Initializable) {
+ ((Initializable) instance).initialize(Collections.EMPTY_MAP);
((Initializable) instance).checkInitialization(initializableProblemHandler);
}
providers.add(instance);
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
index d349d33..93a8a70 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
@@ -469,17 +469,19 @@ public class TesseractOCRParser extends AbstractParser implements Initializable
@Override
public void initialize(Map<String, Param> params) throws TikaConfigException {
- //no-op
- }
-
- @Override
- public void checkInitialization(InitializableProblemHandler problemHandler)
- throws TikaConfigException {
hasTesseract = hasTesseract();
hasImageMagick = hasImageMagick();
if (preloadLangs) {
preloadLangs();
}
+ imagePreprocessor = new ImagePreprocessor(
+ getImageMagickPath() + getImageMagickProg());
+ }
+
+ @Override
+ public void checkInitialization(InitializableProblemHandler problemHandler)
+ throws TikaConfigException {
+
if (langs.size() > 0 &&
! StringUtils.isBlank(defaultConfig.getLanguage())) {
if (! langs.contains(defaultConfig.getLanguage())) {
@@ -490,8 +492,6 @@ public class TesseractOCRParser extends AbstractParser implements Initializable
);
}
}
- imagePreprocessor = new ImagePreprocessor(
- getImageMagickPath() + getImageMagickProg());
}
public Set<String> getLangs() {
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
index bfea24f..6372964 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
@@ -17,6 +17,7 @@
package org.apache.tika.parser.ocr;
import org.apache.tika.TikaTest;
+import org.apache.tika.config.InitializableProblemHandler;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.exception.TikaConfigException;
import org.apache.tika.exception.TikaException;
@@ -39,6 +40,8 @@ import org.junit.Test;
import java.io.File;
import java.io.InputStream;
+import java.util.Collection;
+import java.util.Collections;
import java.util.HashSet;
import java.util.Set;
import java.util.regex.Matcher;
@@ -54,6 +57,7 @@ public class TesseractOCRParserTest extends TikaTest {
public static boolean canRun() throws TikaConfigException {
TesseractOCRParser p = new TesseractOCRParser();
+ p.initialize(Collections.EMPTY_MAP);
return p.hasTesseract();
}
@@ -123,8 +127,8 @@ public class TesseractOCRParserTest extends TikaTest {
@Test
public void testPositiveRotateOCR() throws Exception {
- TesseractOCRParser p = new TesseractOCRParser();
assumeTrue(canRun());
+ TesseractOCRParser p = new TesseractOCRParser();
assumeTrue(p.hasImageMagick());
TesseractOCRConfig config = new TesseractOCRConfig();
config.setApplyRotation(true);
@@ -300,4 +304,23 @@ public class TesseractOCRParserTest extends TikaTest {
parser.setTesseractPath("blahdeblahblah");
assertEquals("blahdeblahblah" + File.separator, parser.getTesseractPath());
}
+
+ @Test
+ public void testThreadJoinInLoadingLangs() throws Exception {
+ assumeTrue(canRun());
+ //make sure that the stream is fully read and
+ //we're getting the same answers on several iterations
+ Set<String> langs = getLangs();
+ assumeTrue(langs.size() > 0);
+ for (int i = 0; i < 20; i++) {
+ assertEquals(langs, getLangs());
+ }
+ }
+
+ private Set<String> getLangs() throws Exception {
+ TesseractOCRParser p = new TesseractOCRParser();
+ p.setPreloadLangs(true);
+ p.initialize(Collections.EMPTY_MAP);
+ return p.getLangs();
+ }
}
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/strings/StringsParser.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/strings/StringsParser.java
index ae5a69b..f60472f 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/strings/StringsParser.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/strings/StringsParser.java
@@ -289,15 +289,14 @@ public class StringsParser extends AbstractParser implements Initializable {
@Override
public void initialize(Map<String, Param> params) throws TikaConfigException {
-
+ checkForStrings();
+ fileCommandDetector = new FileCommandDetector();
+ fileCommandDetector.setFilePath(filePath);
+ fileCommandDetector.setTimeoutMs(defaultStringsConfig.getTimeoutSeconds()*1000);
}
@Override
public void checkInitialization(
InitializableProblemHandler problemHandler) throws TikaConfigException {
- checkForStrings();
- fileCommandDetector = new FileCommandDetector();
- fileCommandDetector.setFilePath(filePath);
- fileCommandDetector.setTimeoutMs(defaultStringsConfig.getTimeoutSeconds()*1000);
}
}
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/strings/StringsParserTest.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/strings/StringsParserTest.java
index b977d89..5893e1a 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/strings/StringsParserTest.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/strings/StringsParserTest.java
@@ -19,6 +19,7 @@ import static org.junit.Assume.assumeTrue;
import java.io.InputStream;
import java.util.Arrays;
+import java.util.Collections;
import org.apache.tika.config.Initializable;
import org.apache.tika.config.InitializableProblemHandler;
@@ -50,7 +51,7 @@ public class StringsParserTest {
StringsConfig stringsConfig = new StringsConfig();
Parser parser = new StringsParser();
- ((Initializable)parser).checkInitialization(InitializableProblemHandler.IGNORE);
+ ((Initializable)parser).initialize(Collections.emptyMap());
ContentHandler handler = new BodyContentHandler();
Metadata metadata = new Metadata();
diff --git a/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/sqlite3/SQLite3Parser.java b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/sqlite3/SQLite3Parser.java
index da35f26..83000ba 100644
--- a/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/sqlite3/SQLite3Parser.java
+++ b/tika-parsers/tika-parsers-extended/tika-parser-sqlite3-module/src/main/java/org/apache/tika/parser/sqlite3/SQLite3Parser.java
@@ -51,8 +51,6 @@ import org.xml.sax.SAXException;
* that has to be created.
*/
public class SQLite3Parser extends AbstractParser implements Initializable {
- private static volatile boolean HAS_WARNED = false;
- private static final Object[] LOCK = new Object[0];
/**
* Serial version UID
@@ -63,14 +61,7 @@ public class SQLite3Parser extends AbstractParser implements Initializable {
private static final Set<MediaType> SUPPORTED_TYPES;
static {
- Set<MediaType> tmp;
- try {
- Class.forName(SQLite3DBParser.SQLITE_CLASS_NAME);
- tmp = Collections.singleton(MEDIA_TYPE);
- } catch (ClassNotFoundException e) {
- tmp = Collections.EMPTY_SET;
- }
- SUPPORTED_TYPES = Collections.unmodifiableSet(tmp);
+ SUPPORTED_TYPES = Collections.singleton(MEDIA_TYPE);
}
/**
* Checks to see if class is available for org.sqlite.JDBC.
@@ -104,21 +95,5 @@ public class SQLite3Parser extends AbstractParser implements Initializable {
@Override
public void checkInitialization(InitializableProblemHandler problemHandler) throws TikaConfigException {
- if (SUPPORTED_TYPES.size() == 0) {
- if (HAS_WARNED) {
- return;
- }
- synchronized (LOCK) {
- //check again while under the lock
- if (HAS_WARNED) {
- return;
- }
- problemHandler.handleInitializableProblem("org.apache.tika.parser.SQLite3Parser",
- "org.xerial's sqlite-jdbc is not loaded.\n" +
- "Please provide the jar on your classpath to parse sqlite files.\n" +
- "See tika-parsers/pom.xml for the correct version.");
- HAS_WARNED = true;
- }
- }
}
}