You are viewing a plain text version of this content. The canonical (hyperlinked) version is available in the Apache mailing list archives.
Posted to commits@tika.apache.org by ta...@apache.org on 2020/09/04 19:40:36 UTC
[tika] branch main updated: TIKA-3190 -- move tika-eval's language
detector into its own standalone module
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new a9f8271 TIKA-3190 -- move tika-eval's language detector into its own standalone module
a9f8271 is described below
commit a9f8271659c2137c1210eff4e5daf04d0aafd68a
Author: tallison <ta...@apache.org>
AuthorDate: Fri Sep 4 15:40:14 2020 -0400
TIKA-3190 -- move tika-eval's language detector into its own standalone module
---
pom.xml | 1 -
tika-eval/.gitignore | 1 -
tika-eval/pom.xml | 11 +-
.../org/apache/tika/eval/AbstractProfiler.java | 8 +-
.../tika/eval/batch/EvalConsumersBuilder.java | 12 --
.../java/org/apache/tika/eval/langid/Language.java | 35 ----
.../apache/tika/eval/langid/LanguageIDWrapper.java | 115 ++---------
.../tika/eval/metadata/TikaEvalMetadataFilter.java | 6 +-
.../apache/tika/eval/textstats/CommonTokens.java | 7 +-
.../eval/textstats/CommonTokensBhattacharyya.java | 4 +-
.../tika/eval/textstats/CommonTokensCosine.java | 4 +-
.../tika/eval/textstats/CommonTokensHellinger.java | 4 +-
.../tika/eval/textstats/CommonTokensKLDNormed.java | 4 +-
.../eval/textstats/CommonTokensKLDivergence.java | 4 +-
.../textstats/CompositeTextStatsCalculator.java | 9 +-
.../textstats/LanguageAwareTokenCountStats.java | 4 +-
.../main/resources/tika-eval-comparison-config.xml | 3 +-
.../org/apache/tika/eval/SimpleComparerTest.java | 1 -
.../org/apache/tika/eval/langid/LangIdTest.java | 5 +-
.../apache/tika/eval/textstats/TextStatsTest.java | 10 +-
.../org/apache/tika/eval/util/LanguageIdTest.java | 5 +-
tika-langdetect/overview.html | 24 +++
tika-langdetect/pom.xml | 6 +
tika-langdetect/tika-langdetect-opennlp/.gitignore | 1 +
tika-langdetect/tika-langdetect-opennlp/pom.xml | 36 ++++
.../tika/langdetect/opennlp/OpenNLPDetector.java | 223 +++++++++++++++++++++
.../opennlp}/ProbingLanguageDetector.java | 24 ++-
.../opennlp_langdetect_model_20190626.bin | Bin
.../langdetect/opennlp/OpenNLPDetectorTest.java | 68 +++++++
29 files changed, 427 insertions(+), 208 deletions(-)
diff --git a/pom.xml b/pom.xml
index 6cf7070..bf0229a 100644
--- a/pom.xml
+++ b/pom.xml
@@ -40,7 +40,6 @@
<module>tika-parser-modules</module>
<module>tika-parsers</module>
<module>tika-parsers-extended</module>
-
<!-- <module>tika-bundle</module> turn this off for now -->
<module>tika-xmp</module>
<module>tika-serialization</module>
diff --git a/tika-eval/.gitignore b/tika-eval/.gitignore
deleted file mode 100644
index 749cae7..0000000
--- a/tika-eval/.gitignore
+++ /dev/null
@@ -1 +0,0 @@
-!model_20190626.bin
\ No newline at end of file
diff --git a/tika-eval/pom.xml b/tika-eval/pom.xml
index cbb356c..c15e0da 100644
--- a/tika-eval/pom.xml
+++ b/tika-eval/pom.xml
@@ -50,7 +50,11 @@
<artifactId>tika-serialization</artifactId>
<version>${project.version}</version>
</dependency>
-
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-langdetect-opennlp</artifactId>
+ <version>${project.version}</version>
+ </dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-math3</artifactId>
@@ -96,11 +100,6 @@
<version>${lucene.version}</version>
</dependency>
<dependency>
- <groupId>org.apache.opennlp</groupId>
- <artifactId>opennlp-tools</artifactId>
- <version>${opennlp.version}</version>
- </dependency>
- <dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId>
<version>${commons.lang3.version}</version>
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java b/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java
index b54b242..ed13e45 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java
@@ -48,7 +48,6 @@ import org.apache.tika.eval.db.Cols;
import org.apache.tika.eval.db.TableInfo;
import org.apache.tika.eval.io.ExtractReaderException;
import org.apache.tika.eval.io.IDBWriter;
-import org.apache.tika.eval.langid.Language;
import org.apache.tika.eval.langid.LanguageIDWrapper;
import org.apache.tika.eval.textstats.BasicTokenCountStatsCalculator;
import org.apache.tika.eval.textstats.CommonTokens;
@@ -68,6 +67,7 @@ import org.apache.tika.eval.util.ContentTagParser;
import org.apache.tika.eval.util.ContentTags;
import org.apache.tika.eval.util.EvalExceptionUtils;
import org.apache.tika.exception.TikaException;
+import org.apache.tika.language.detect.LanguageResult;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.PagedText;
import org.apache.tika.metadata.TikaCoreProperties;
@@ -586,17 +586,17 @@ public abstract class AbstractProfiler extends FileResourceConsumer {
}
void langid(Map<Class, Object> stats, Map<Cols, String> data) {
- List<Language> probabilities = (List<Language>) stats.get(LanguageIDWrapper.class);
+ List<LanguageResult> probabilities = (List<LanguageResult>) stats.get(LanguageIDWrapper.class);
if (probabilities.size() > 0) {
data.put(Cols.LANG_ID_1, probabilities.get(0).getLanguage());
data.put(Cols.LANG_ID_PROB_1,
- Double.toString(probabilities.get(0).getConfidence()));
+ Double.toString(probabilities.get(0).getRawScore()));
}
if (probabilities.size() > 1) {
data.put(Cols.LANG_ID_2, probabilities.get(1).getLanguage());
data.put(Cols.LANG_ID_PROB_2,
- Double.toString(probabilities.get(1).getConfidence()));
+ Double.toString(probabilities.get(1).getRawScore()));
}
}
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/batch/EvalConsumersBuilder.java b/tika-eval/src/main/java/org/apache/tika/eval/batch/EvalConsumersBuilder.java
index 61977fa..62cf348 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/batch/EvalConsumersBuilder.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/batch/EvalConsumersBuilder.java
@@ -57,18 +57,6 @@ public class EvalConsumersBuilder extends AbstractConsumersBuilder {
String jdbcConnectionString = localAttrs.get("jdbc");
- Path langModelDir = getPath(localAttrs, "langModelDir");
-
- try {
- if (langModelDir == null) {
- LanguageIDWrapper.loadBuiltInModels();
- } else {
- LanguageIDWrapper.loadModels(langModelDir);
- }
- } catch (IOException e) {
- throw new RuntimeException(e);
- }
-
Path commonTokens = getPath(localAttrs, "commonTokens");
String defaultLangCode = localAttrs.get("defaultLangCode");
if (defaultLangCode == null) {
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/langid/Language.java b/tika-eval/src/main/java/org/apache/tika/eval/langid/Language.java
deleted file mode 100644
index 6f0cc85..0000000
--- a/tika-eval/src/main/java/org/apache/tika/eval/langid/Language.java
+++ /dev/null
@@ -1,35 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.eval.langid;
-
-public class Language {
- private final String language;
- private final double confidence;
-
- public Language(String language, double confidence) {
- this.language = language;
- this.confidence = confidence;
- }
-
- public String getLanguage() {
- return language;
- }
-
- public double getConfidence() {
- return confidence;
- }
-}
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/langid/LanguageIDWrapper.java b/tika-eval/src/main/java/org/apache/tika/eval/langid/LanguageIDWrapper.java
index 50f928e..4538a05 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/langid/LanguageIDWrapper.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/langid/LanguageIDWrapper.java
@@ -16,124 +16,33 @@
*/
package org.apache.tika.eval.langid;
-import java.io.IOException;
-import java.io.InputStream;
-import java.nio.file.Path;
-import java.util.ArrayList;
import java.util.List;
-import java.util.regex.Pattern;
-import opennlp.tools.langdetect.LanguageDetectorModel;
-import opennlp.tools.util.normalizer.CharSequenceNormalizer;
-import opennlp.tools.util.normalizer.EmojiCharSequenceNormalizer;
-import opennlp.tools.util.normalizer.NumberCharSequenceNormalizer;
-import opennlp.tools.util.normalizer.ShrinkCharSequenceNormalizer;
-import opennlp.tools.util.normalizer.TwitterCharSequenceNormalizer;
import org.apache.tika.eval.textstats.StringStatsCalculator;
+import org.apache.tika.langdetect.opennlp.OpenNLPDetector;
+import org.apache.tika.language.detect.LanguageResult;
-/**
- * The most efficient way to call this in a multithreaded environment
- * is to call {@link LanguageIDWrapper#loadBuiltInModels()} before
- * instantiating the
- */
-public class LanguageIDWrapper implements StringStatsCalculator<List<Language>> {
-
- static LanguageDetectorModel LANG_MODEL;
+public class LanguageIDWrapper implements StringStatsCalculator<List<LanguageResult>> {
static int MAX_TEXT_LENGTH = 50000;
- public static synchronized void loadBuiltInModels() throws IOException {
- try (InputStream is = LanguageIDWrapper.class.getResourceAsStream(
- "/opennlp/model_20190626.bin"
- )) {
- LANG_MODEL = new LanguageDetectorModel(is);
- }
- }
-
- public static void loadModels(Path path) throws IOException {
- LANG_MODEL = new LanguageDetectorModel(path.toFile());
- }
-
- private static CharSequenceNormalizer[] getNormalizers() {
- return new CharSequenceNormalizer[]{
- TikaUrlCharSequenceNormalizer.getInstance(),
- AlphaIdeographSequenceNormalizer.getInstance(),
- EmojiCharSequenceNormalizer.getInstance(),
- TwitterCharSequenceNormalizer.getInstance(),
- NumberCharSequenceNormalizer.getInstance(),
- ShrinkCharSequenceNormalizer.getInstance()
- };
- }
-
- private final opennlp.tools.langdetect.LanguageDetector detector;
public LanguageIDWrapper() {
- if (LANG_MODEL == null) {
- try {
- loadBuiltInModels();
- } catch (IOException e) {
- throw new RuntimeException("couldn't load built in lang models", e);
- }
- }
- detector = new ProbingLanguageDetector(LANG_MODEL, getNormalizers());
}
- public List<Language> getProbabilities(String s) {
- opennlp.tools.langdetect.Language[] detected = detector.predictLanguages(s);
- List<Language> ret = new ArrayList<>();
- for (int i = 0; i < detected.length; i++) {
- ret.add(new Language(detected[i].getLang(), detected[i].getConfidence()));
- }
- return ret;
- }
-
- public String[] getSupportedLanguages() {
- return detector.getSupportedLanguages();
- }
-
- public static void setMaxTextLength(int maxTextLength) {
- MAX_TEXT_LENGTH = maxTextLength;
+ public static void setMaxTextLength(int maxContentLengthForLangId) {
+ MAX_TEXT_LENGTH = maxContentLengthForLangId;
}
@Override
- public List<Language> calculate(String txt) {
- return getProbabilities(txt);
- }
-
- private static class TikaUrlCharSequenceNormalizer implements CharSequenceNormalizer {
- //use this custom copy/paste of opennlo to avoid long, long hang with mail_regex
- //TIKA-2777
- private static final Pattern URL_REGEX = Pattern.compile("https?://[-_.?&~;+=/#0-9A-Za-z]{10,10000}");
- private static final Pattern MAIL_REGEX = Pattern.compile("[-_.0-9A-Za-z]{1,100}@[-_0-9A-Za-z]{1,100}[-_.0-9A-Za-z]{1,100}");
- private static final TikaUrlCharSequenceNormalizer INSTANCE = new TikaUrlCharSequenceNormalizer();
-
- public static TikaUrlCharSequenceNormalizer getInstance() {
- return INSTANCE;
- }
-
- private TikaUrlCharSequenceNormalizer() {
- }
-
- @Override
- public CharSequence normalize(CharSequence charSequence) {
- String modified = URL_REGEX.matcher(charSequence).replaceAll(" ");
- return MAIL_REGEX.matcher(modified).replaceAll(" ");
- }
+ public List<LanguageResult> calculate(String txt) {
+ OpenNLPDetector detector = new OpenNLPDetector();
+ detector.setMaxLength(MAX_TEXT_LENGTH);
+ detector.addText(txt);
+ return detector.detectAll();
}
- private static class AlphaIdeographSequenceNormalizer implements CharSequenceNormalizer {
- private static final Pattern REGEX = Pattern.compile("[^\\p{IsAlphabetic}\\p{IsIdeographic}]+");
- private static final AlphaIdeographSequenceNormalizer INSTANCE = new AlphaIdeographSequenceNormalizer();
-
- public static AlphaIdeographSequenceNormalizer getInstance() {
- return INSTANCE;
- }
- private AlphaIdeographSequenceNormalizer() {
- }
-
- @Override
- public CharSequence normalize(CharSequence charSequence) {
- return REGEX.matcher(charSequence).replaceAll(" ");
- }
+ public String[] getSupportedLanguages() {
+ return new OpenNLPDetector().getSupportedLanguages();
}
}
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/metadata/TikaEvalMetadataFilter.java b/tika-eval/src/main/java/org/apache/tika/eval/metadata/TikaEvalMetadataFilter.java
index 2c69801..017340f 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/metadata/TikaEvalMetadataFilter.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/metadata/TikaEvalMetadataFilter.java
@@ -17,7 +17,6 @@
package org.apache.tika.eval.metadata;
import org.apache.commons.lang3.StringUtils;
-import org.apache.tika.eval.langid.Language;
import org.apache.tika.eval.langid.LanguageIDWrapper;
import org.apache.tika.eval.textstats.BasicTokenCountStatsCalculator;
import org.apache.tika.eval.textstats.CommonTokens;
@@ -26,6 +25,7 @@ import org.apache.tika.eval.textstats.TextStatsCalculator;
import org.apache.tika.eval.tokens.CommonTokenResult;
import org.apache.tika.eval.tokens.TokenCounts;
import org.apache.tika.exception.TikaException;
+import org.apache.tika.language.detect.LanguageResult;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Property;
import org.apache.tika.metadata.TikaCoreProperties;
@@ -94,10 +94,10 @@ public class TikaEvalMetadataFilter implements MetadataFilter {
}
//languages
- List<Language> probabilities = (List<Language>) results.get(LanguageIDWrapper.class);
+ List<LanguageResult> probabilities = (List<LanguageResult>) results.get(LanguageIDWrapper.class);
if (probabilities.size() > 0) {
metadata.set(LANGUAGE, probabilities.get(0).getLanguage());
- metadata.set(LANGUAGE_CONFIDENCE, probabilities.get(0).getConfidence());
+ metadata.set(LANGUAGE_CONFIDENCE, probabilities.get(0).getRawScore());
}
}
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/textstats/CommonTokens.java b/tika-eval/src/main/java/org/apache/tika/eval/textstats/CommonTokens.java
index cb1dee1..73bffae 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/textstats/CommonTokens.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/textstats/CommonTokens.java
@@ -22,12 +22,12 @@ import java.util.Set;
import org.apache.commons.lang3.mutable.MutableInt;
import org.apache.commons.lang3.tuple.Pair;
-import org.apache.tika.eval.langid.Language;
import org.apache.tika.eval.tokens.AlphaIdeographFilterFactory;
import org.apache.tika.eval.tokens.CommonTokenCountManager;
import org.apache.tika.eval.tokens.CommonTokenResult;
import org.apache.tika.eval.tokens.LangModel;
import org.apache.tika.eval.tokens.TokenCounts;
+import org.apache.tika.language.detect.LanguageResult;
public class CommonTokens implements LanguageAwareTokenCountStats<CommonTokenResult> {
@@ -41,8 +41,9 @@ public class CommonTokens implements LanguageAwareTokenCountStats<CommonTokenRes
}
@Override
- public CommonTokenResult calculate(List<Language> languages, TokenCounts tokenCounts) {
- Pair<String, LangModel> pair = commonTokenCountManager.getLangTokens(languages.get(0).getLanguage());
+ public CommonTokenResult calculate(List<LanguageResult> languages, TokenCounts tokenCounts) {
+ Pair<String, LangModel> pair = commonTokenCountManager.getLangTokens(languages.get(0)
+ .getLanguage());
String actualLangCode = pair.getKey();
Set<String> commonTokens = pair.getValue().getTokens();
int numUniqueCommonTokens = 0;
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/textstats/CommonTokensBhattacharyya.java b/tika-eval/src/main/java/org/apache/tika/eval/textstats/CommonTokensBhattacharyya.java
index e64374f..4f169c5 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/textstats/CommonTokensBhattacharyya.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/textstats/CommonTokensBhattacharyya.java
@@ -22,10 +22,10 @@ import java.util.Map;
import org.apache.commons.lang3.mutable.MutableInt;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.commons.math3.util.FastMath;
-import org.apache.tika.eval.langid.Language;
import org.apache.tika.eval.tokens.CommonTokenCountManager;
import org.apache.tika.eval.tokens.LangModel;
import org.apache.tika.eval.tokens.TokenCounts;
+import org.apache.tika.language.detect.LanguageResult;
public class CommonTokensBhattacharyya implements LanguageAwareTokenCountStats<Double> {
@@ -36,7 +36,7 @@ public class CommonTokensBhattacharyya implements LanguageAwareTokenCountStats<D
}
@Override
- public Double calculate(List<Language> languages, TokenCounts tokenCounts) {
+ public Double calculate(List<LanguageResult> languages, TokenCounts tokenCounts) {
Pair<String, LangModel> pair = commonTokenCountManager.getLangTokens(languages.get(0).getLanguage());
LangModel model = pair.getValue();
double sum = 0.0;
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/textstats/CommonTokensCosine.java b/tika-eval/src/main/java/org/apache/tika/eval/textstats/CommonTokensCosine.java
index daa59b5..fb9a32a 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/textstats/CommonTokensCosine.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/textstats/CommonTokensCosine.java
@@ -23,10 +23,10 @@ import java.util.Map;
import org.apache.commons.lang3.mutable.MutableInt;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.commons.math3.util.FastMath;
-import org.apache.tika.eval.langid.Language;
import org.apache.tika.eval.tokens.CommonTokenCountManager;
import org.apache.tika.eval.tokens.LangModel;
import org.apache.tika.eval.tokens.TokenCounts;
+import org.apache.tika.language.detect.LanguageResult;
public class CommonTokensCosine implements LanguageAwareTokenCountStats<Double> {
@@ -37,7 +37,7 @@ public class CommonTokensCosine implements LanguageAwareTokenCountStats<Double>
}
@Override
- public Double calculate(List<Language> languages, TokenCounts tokenCounts) {
+ public Double calculate(List<LanguageResult> languages, TokenCounts tokenCounts) {
Pair<String, LangModel> pair = commonTokenCountManager.getLangTokens(languages.get(0).getLanguage());
LangModel model = pair.getValue();
double kl = 0.0;
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/textstats/CommonTokensHellinger.java b/tika-eval/src/main/java/org/apache/tika/eval/textstats/CommonTokensHellinger.java
index 0b11f8a..61fdff8 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/textstats/CommonTokensHellinger.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/textstats/CommonTokensHellinger.java
@@ -22,10 +22,10 @@ import java.util.Map;
import org.apache.commons.lang3.mutable.MutableInt;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.commons.math3.util.FastMath;
-import org.apache.tika.eval.langid.Language;
import org.apache.tika.eval.tokens.CommonTokenCountManager;
import org.apache.tika.eval.tokens.LangModel;
import org.apache.tika.eval.tokens.TokenCounts;
+import org.apache.tika.language.detect.LanguageResult;
public class CommonTokensHellinger implements LanguageAwareTokenCountStats<Double> {
@@ -36,7 +36,7 @@ public class CommonTokensHellinger implements LanguageAwareTokenCountStats<Doubl
}
@Override
- public Double calculate(List<Language> languages, TokenCounts tokenCounts) {
+ public Double calculate(List<LanguageResult> languages, TokenCounts tokenCounts) {
Pair<String, LangModel> pair = commonTokenCountManager.getLangTokens(languages.get(0).getLanguage());
LangModel model = pair.getValue();
double sum = 0.0;
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/textstats/CommonTokensKLDNormed.java b/tika-eval/src/main/java/org/apache/tika/eval/textstats/CommonTokensKLDNormed.java
index 2bd8974..1abaa52 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/textstats/CommonTokensKLDNormed.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/textstats/CommonTokensKLDNormed.java
@@ -22,10 +22,10 @@ import java.util.Map;
import org.apache.commons.lang3.mutable.MutableInt;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.commons.math3.util.FastMath;
-import org.apache.tika.eval.langid.Language;
import org.apache.tika.eval.tokens.CommonTokenCountManager;
import org.apache.tika.eval.tokens.LangModel;
import org.apache.tika.eval.tokens.TokenCounts;
+import org.apache.tika.language.detect.LanguageResult;
public class CommonTokensKLDNormed implements LanguageAwareTokenCountStats<Double> {
@@ -36,7 +36,7 @@ public class CommonTokensKLDNormed implements LanguageAwareTokenCountStats<Doubl
}
@Override
- public Double calculate(List<Language> languages, TokenCounts tokenCounts) {
+ public Double calculate(List<LanguageResult> languages, TokenCounts tokenCounts) {
Pair<String, LangModel> pair = commonTokenCountManager.getLangTokens(languages.get(0).getLanguage());
LangModel model = pair.getValue();
double kl = 0.0;
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/textstats/CommonTokensKLDivergence.java b/tika-eval/src/main/java/org/apache/tika/eval/textstats/CommonTokensKLDivergence.java
index eb6b271..9331e6d 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/textstats/CommonTokensKLDivergence.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/textstats/CommonTokensKLDivergence.java
@@ -22,10 +22,10 @@ import java.util.Map;
import org.apache.commons.lang3.mutable.MutableInt;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.commons.math3.util.FastMath;
-import org.apache.tika.eval.langid.Language;
import org.apache.tika.eval.tokens.CommonTokenCountManager;
import org.apache.tika.eval.tokens.LangModel;
import org.apache.tika.eval.tokens.TokenCounts;
+import org.apache.tika.language.detect.LanguageResult;
public class CommonTokensKLDivergence implements LanguageAwareTokenCountStats<Double> {
@@ -36,7 +36,7 @@ public class CommonTokensKLDivergence implements LanguageAwareTokenCountStats<Do
}
@Override
- public Double calculate(List<Language> languages, TokenCounts tokenCounts) {
+ public Double calculate(List<LanguageResult> languages, TokenCounts tokenCounts) {
Pair<String, LangModel> pair = commonTokenCountManager.getLangTokens(languages.get(0).getLanguage());
LangModel model = pair.getValue();
double kl = 0.0;
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/textstats/CompositeTextStatsCalculator.java b/tika-eval/src/main/java/org/apache/tika/eval/textstats/CompositeTextStatsCalculator.java
index a16c767..60c63a6 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/textstats/CompositeTextStatsCalculator.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/textstats/CompositeTextStatsCalculator.java
@@ -17,25 +17,20 @@
package org.apache.tika.eval.textstats;
import java.io.IOException;
-import java.security.MessageDigest;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
-import org.apache.commons.codec.binary.Base64;
-import org.apache.commons.codec.digest.DigestUtils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.BytesTermAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.util.BytesRef;
-import org.apache.tika.eval.langid.Language;
import org.apache.tika.eval.langid.LanguageIDWrapper;
import org.apache.tika.eval.tokens.AnalyzerManager;
import org.apache.tika.eval.tokens.TokenCounts;
-import org.apache.tika.metadata.Message;
+import org.apache.tika.language.detect.LanguageResult;
public class CompositeTextStatsCalculator {
@@ -111,7 +106,7 @@ public class CompositeTextStatsCalculator {
}
if (languageAwareTokenCountStats.size() > 0) {
- List<Language> langs = results.containsKey(LanguageIDWrapper.class) ?
+ List<LanguageResult> langs = results.containsKey(LanguageIDWrapper.class) ?
(List)results.get(LanguageIDWrapper.class) : languageIDWrapper.calculate(txt);
results.put(LanguageIDWrapper.class, langs);
for (LanguageAwareTokenCountStats calc : languageAwareTokenCountStats) {
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/textstats/LanguageAwareTokenCountStats.java b/tika-eval/src/main/java/org/apache/tika/eval/textstats/LanguageAwareTokenCountStats.java
index 12cb19a..476ce8a 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/textstats/LanguageAwareTokenCountStats.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/textstats/LanguageAwareTokenCountStats.java
@@ -18,13 +18,13 @@ package org.apache.tika.eval.textstats;
import java.util.List;
-import org.apache.tika.eval.langid.Language;
import org.apache.tika.eval.tokens.TokenCounts;
+import org.apache.tika.language.detect.LanguageResult;
/**
* Interface for calculators that require language probabilities and token stats
* @param <T>
*/
public interface LanguageAwareTokenCountStats<T> extends TextStatsCalculator {
- T calculate(List<Language> languages, TokenCounts tokenCounts);
+ T calculate(List<LanguageResult> languages, TokenCounts tokenCounts);
}
diff --git a/tika-eval/src/main/resources/tika-eval-comparison-config.xml b/tika-eval/src/main/resources/tika-eval-comparison-config.xml
index e9c0cf7..0ed1be8 100644
--- a/tika-eval/src/main/resources/tika-eval-comparison-config.xml
+++ b/tika-eval/src/main/resources/tika-eval-comparison-config.xml
@@ -93,7 +93,8 @@
commonTokens="resources/common_tokens"
/>
- <!-- langModelDir="resources/langmodels" -->
+ <!-- this is no longer implemented
+ langModelDir="resources/langmodels" -->
<!-- reporter and interrupter are optional -->
<reporter builderClass="org.apache.tika.batch.builders.SimpleLogReporterBuilder" sleepMillis="1000"
diff --git a/tika-eval/src/test/java/org/apache/tika/eval/SimpleComparerTest.java b/tika-eval/src/test/java/org/apache/tika/eval/SimpleComparerTest.java
index bae5792..9dd3c39 100644
--- a/tika-eval/src/test/java/org/apache/tika/eval/SimpleComparerTest.java
+++ b/tika-eval/src/test/java/org/apache/tika/eval/SimpleComparerTest.java
@@ -60,7 +60,6 @@ public class SimpleComparerTest extends TikaTest {
WRITER = new MockDBWriter();
AbstractProfiler.loadCommonTokens(
Paths.get(SimpleComparerTest.class.getResource("/common_tokens").toURI()), "en");
- LanguageIDWrapper.loadBuiltInModels();
}
@Before
diff --git a/tika-eval/src/test/java/org/apache/tika/eval/langid/LangIdTest.java b/tika-eval/src/test/java/org/apache/tika/eval/langid/LangIdTest.java
index 2740295..43ae3a0 100644
--- a/tika-eval/src/test/java/org/apache/tika/eval/langid/LangIdTest.java
+++ b/tika-eval/src/test/java/org/apache/tika/eval/langid/LangIdTest.java
@@ -28,10 +28,6 @@ import org.junit.Test;
public class LangIdTest {
- @BeforeClass
- public static void init() throws Exception {
- LanguageIDWrapper.loadBuiltInModels();
- }
@Test
public void testCommonTokensCoverage() throws Exception {
@@ -39,6 +35,7 @@ public class LangIdTest {
//language
LanguageIDWrapper wrapper = new LanguageIDWrapper();
CommonTokenCountManager commonTokens = new CommonTokenCountManager(null, "eng");
+
for (String lang : wrapper.getSupportedLanguages()) {
Set<String> tokens = commonTokens.getTokens(lang);
if (tokens.size() == 0) {
diff --git a/tika-eval/src/test/java/org/apache/tika/eval/textstats/TextStatsTest.java b/tika-eval/src/test/java/org/apache/tika/eval/textstats/TextStatsTest.java
index 486791b..f18c018 100644
--- a/tika-eval/src/test/java/org/apache/tika/eval/textstats/TextStatsTest.java
+++ b/tika-eval/src/test/java/org/apache/tika/eval/textstats/TextStatsTest.java
@@ -18,9 +18,9 @@ package org.apache.tika.eval.textstats;
import org.apache.commons.codec.binary.Base32;
import org.apache.commons.codec.digest.DigestUtils;
-import org.apache.tika.eval.langid.Language;
import org.apache.tika.eval.langid.LanguageIDWrapper;
import org.apache.tika.eval.tokens.CommonTokenResult;
+import org.apache.tika.language.detect.LanguageResult;
import org.junit.Test;
import java.nio.charset.StandardCharsets;
@@ -60,9 +60,9 @@ public class TextStatsTest {
assertEquals(3.12, (double)stats.get(TokenEntropy.class), 0.01);
- List<Language> probabilities = (List<Language>) stats.get(LanguageIDWrapper.class);
+ List<LanguageResult> probabilities = (List<LanguageResult>) stats.get(LanguageIDWrapper.class);
assertEquals("eng", probabilities.get(0).getLanguage());
- assertEquals(0.02, probabilities.get(1).getConfidence(), 0.01);
+ assertEquals(0.02, probabilities.get(1).getRawScore(), 0.01);
String textProfileSignature = (String)stats.get(TextProfileSignature.class);
assertEquals("XF3W27O7IWOJVVNQ4HLKYYPCPPX3L2M72YSEMZ3WADL4VTXVITIA====", textProfileSignature);
@@ -82,9 +82,9 @@ public class TextStatsTest {
Map<Class, Object> stats = calc.calculate(txt);
- List<Language> probabilities = (List<Language>) stats.get(LanguageIDWrapper.class);
+ List<LanguageResult> probabilities = (List<LanguageResult>) stats.get(LanguageIDWrapper.class);
assertEquals("cmn", probabilities.get(0).getLanguage());
- assertEquals(0.009, probabilities.get(1).getConfidence(), 0.01);
+ assertEquals(0.009, probabilities.get(1).getRawScore(), 0.01);
String textProfileSignature = (String)stats.get(TextProfileSignature.class);
diff --git a/tika-eval/src/test/java/org/apache/tika/eval/util/LanguageIdTest.java b/tika-eval/src/test/java/org/apache/tika/eval/util/LanguageIdTest.java
index 2164ff0..54602be 100644
--- a/tika-eval/src/test/java/org/apache/tika/eval/util/LanguageIdTest.java
+++ b/tika-eval/src/test/java/org/apache/tika/eval/util/LanguageIdTest.java
@@ -19,8 +19,8 @@ package org.apache.tika.eval.util;
import java.util.List;
-import org.apache.tika.eval.langid.Language;
import org.apache.tika.eval.langid.LanguageIDWrapper;
+import org.apache.tika.language.detect.LanguageResult;
import org.junit.Assert;
import org.junit.Test;
@@ -32,9 +32,8 @@ public class LanguageIdTest {
for (int i = 0; i < 50000; i++) {
sb.append("a");
}
- LanguageIDWrapper.loadBuiltInModels();
LanguageIDWrapper wrapper = new LanguageIDWrapper();
- List<Language> languages = wrapper.getProbabilities(sb.toString());
+ List<LanguageResult> languages = wrapper.calculate(sb.toString());
Assert.assertEquals("mri", languages.get(0).getLanguage());
}
}
diff --git a/tika-langdetect/overview.html b/tika-langdetect/overview.html
new file mode 100644
index 0000000..402884b
--- /dev/null
+++ b/tika-langdetect/overview.html
@@ -0,0 +1,24 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<html>
+<head>
+ <title>Apache Tika - Language Detection</title>
+</head>
+<body>
+Collection of language detection modules.
+</body>
+</html>
\ No newline at end of file
diff --git a/tika-langdetect/pom.xml b/tika-langdetect/pom.xml
index b1fa8b3..fcbba61 100644
--- a/tika-langdetect/pom.xml
+++ b/tika-langdetect/pom.xml
@@ -20,10 +20,16 @@
<module>tika-langdetect-lingo24</module>
<module>tika-langdetect-optimaize</module>
<module>tika-langdetect-mitll-text</module>
+ <module>tika-langdetect-opennlp</module>
</modules>
<dependencies>
<dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-core</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+ <dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<scope>test</scope>
diff --git a/tika-langdetect/tika-langdetect-opennlp/.gitignore b/tika-langdetect/tika-langdetect-opennlp/.gitignore
new file mode 100644
index 0000000..224e945
--- /dev/null
+++ b/tika-langdetect/tika-langdetect-opennlp/.gitignore
@@ -0,0 +1 @@
+!opennlp_langdetect_model_20190626.bin
\ No newline at end of file
diff --git a/tika-langdetect/tika-langdetect-opennlp/pom.xml b/tika-langdetect/tika-langdetect-opennlp/pom.xml
new file mode 100644
index 0000000..51e7a7c
--- /dev/null
+++ b/tika-langdetect/tika-langdetect-opennlp/pom.xml
@@ -0,0 +1,36 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+ xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <parent>
+ <artifactId>tika-langdetect</artifactId>
+ <groupId>org.apache.tika</groupId>
+ <version>2.0.0-SNAPSHOT</version>
+ </parent>
+ <modelVersion>4.0.0</modelVersion>
+
+ <artifactId>tika-langdetect-opennlp</artifactId>
+
+
+ <dependencies>
+ <dependency>
+ <groupId>org.apache.opennlp</groupId>
+ <artifactId>opennlp-tools</artifactId>
+ <version>${opennlp.version}</version>
+ </dependency>
+ <!-- test dependencies -->
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-langdetect-commons</artifactId>
+ <version>${project.version}</version>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-langdetect-commons</artifactId>
+ <version>${project.version}</version>
+ <scope>test</scope>
+ <type>test-jar</type>
+ </dependency>
+ </dependencies>
+</project>
\ No newline at end of file
diff --git a/tika-langdetect/tika-langdetect-opennlp/src/main/java/org/apache/tika/langdetect/opennlp/OpenNLPDetector.java b/tika-langdetect/tika-langdetect-opennlp/src/main/java/org/apache/tika/langdetect/opennlp/OpenNLPDetector.java
new file mode 100644
index 0000000..6a67bc4
--- /dev/null
+++ b/tika-langdetect/tika-langdetect-opennlp/src/main/java/org/apache/tika/langdetect/opennlp/OpenNLPDetector.java
@@ -0,0 +1,223 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.langdetect.opennlp;
+
+import opennlp.tools.langdetect.Language;
+import opennlp.tools.langdetect.LanguageDetectorModel;
+import opennlp.tools.languagemodel.LanguageModel;
+import opennlp.tools.util.normalizer.CharSequenceNormalizer;
+import opennlp.tools.util.normalizer.EmojiCharSequenceNormalizer;
+import opennlp.tools.util.normalizer.NumberCharSequenceNormalizer;
+import opennlp.tools.util.normalizer.ShrinkCharSequenceNormalizer;
+import opennlp.tools.util.normalizer.TwitterCharSequenceNormalizer;
+import org.apache.tika.language.detect.LanguageConfidence;
+import org.apache.tika.language.detect.LanguageDetector;
+import org.apache.tika.language.detect.LanguageResult;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.UnsupportedEncodingException;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.regex.Pattern;
+
+/**
+ * <p>
+ * This is based on OpenNLP's language detector. However,
+ * we've built our own ProbingLanguageDetector and our own language
+ * models from an extended Leipzig corpus.
+ * </p>
+ * <p>
+ * Going forward, we plan to fold these improvements into OpenNLP
+ * and remove our own custom code.
+ * </p>
+ */
+public class OpenNLPDetector extends LanguageDetector {
+
+ static LanguageDetectorModel LANG_MODEL;
+
+ static void loadBuiltInModels() throws IOException {
+ try (InputStream is = OpenNLPDetector.class.getResourceAsStream(
+ "/opennlp_langdetect_model_20190626.bin"
+ )) {
+ LANG_MODEL = new LanguageDetectorModel(is);
+ }
+ }
+ static {
+ try {
+ loadBuiltInModels();
+ } catch (IOException e) {
+ throw new RuntimeException("Can't find built-in language models");
+ }
+ }
+
+ private static CharSequenceNormalizer[] getNormalizers() {
+ return new CharSequenceNormalizer[]{
+ TikaUrlCharSequenceNormalizer.getInstance(),
+ AlphaIdeographSequenceNormalizer.getInstance(),
+ EmojiCharSequenceNormalizer.getInstance(),
+ TwitterCharSequenceNormalizer.getInstance(),
+ NumberCharSequenceNormalizer.getInstance(),
+ ShrinkCharSequenceNormalizer.getInstance()
+ };
+ }
+
+ private final ProbingLanguageDetector detector = new ProbingLanguageDetector(LANG_MODEL, getNormalizers());
+ private final StringBuilder buffer = new StringBuilder();
+
+ public OpenNLPDetector() {
+
+ }
+ /**
+ * No-op. Models are loaded statically.
+ * @return
+ * @throws IOException
+ */
+ @Override
+ public LanguageDetector loadModels() throws IOException {
+ return new OpenNLPDetector();
+ }
+
+
+ /**
+ * NOT SUPPORTED. Throws {@link UnsupportedOperationException}
+ * @param languages list of target languages.
+ * @return
+ * @throws IOException
+ */
+ @Override
+ public LanguageDetector loadModels(Set<String> languages) throws IOException {
+ throw new UnsupportedOperationException("This lang detector doesn't allow subsetting models");
+ }
+
+ @Override
+ public boolean hasModel(String language) {
+ for (String lang : detector.getSupportedLanguages()) {
+ if (language.equals(lang)) {
+ return true;
+ }
+ }
+ return false;
+ }
+
+ /**
+ * NOT YET SUPPORTED. Throws {@link UnsupportedOperationException}
+ * @param languageProbabilities Map from language to probability
+ * @return
+ * @throws IOException
+ */
+ @Override
+ public LanguageDetector setPriors(Map<String, Float> languageProbabilities) throws IOException {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ public void reset() {
+ buffer.setLength(0);
+ }
+
+ /**
+ * This will buffer up to {@link #setMaxLength(int)} and then
+ * ignore the rest of the text.
+ *
+ * @param cbuf Character buffer
+ * @param off Offset into cbuf to first character in the run of text
+ * @param len Number of characters in the run of text.
+ */
+ @Override
+ public void addText(char[] cbuf, int off, int len) {
+ int buffLen = buffer.length();
+ int newLen = Math.min(len, detector.getMaxLength()-buffLen);
+ if (len <= 0) {
+ return;
+ }
+ buffer.append(cbuf, off, newLen);
+ }
+
+ @Override
+ public List<LanguageResult> detectAll() {
+ Language[] langs = detector.predictLanguages(buffer.toString());
+ List<LanguageResult> results = new ArrayList<>();
+ for (int i = 0; i < langs.length; i++) {
+ LanguageResult r = new LanguageResult(langs[i].getLang(), getConfidence(langs[i].getConfidence()),
+ (float)langs[i].getConfidence());
+ results.add(r);
+ }
+ return results;
+ }
+
+ public void setMaxLength(int maxLength) {
+ detector.setMaxLength(maxLength);
+ }
+
+ public String[] getSupportedLanguages() {
+ return detector.getSupportedLanguages();
+ }
+
+ private static LanguageConfidence getConfidence(double confidence) {
+ //COMPLETELY heuristic
+ if (confidence > 0.9) {
+ return LanguageConfidence.HIGH;
+ } else if (confidence > 0.85) {
+ return LanguageConfidence.MEDIUM;
+ } else if (confidence > 0.20) {
+ return LanguageConfidence.LOW;
+ }
+ return LanguageConfidence.NONE;
+ }
+
+ private static class TikaUrlCharSequenceNormalizer implements CharSequenceNormalizer {
+ //use this custom copy/paste of opennlp to avoid long, long hang with mail_regex
+ //TIKA-2777
+ private static final Pattern URL_REGEX = Pattern.compile("https?://[-_.?&~;+=/#0-9A-Za-z]{10,10000}");
+ private static final Pattern MAIL_REGEX = Pattern.compile("[-_.0-9A-Za-z]{1,100}@[-_0-9A-Za-z]{1,100}[-_.0-9A-Za-z]{1,100}");
+ private static final TikaUrlCharSequenceNormalizer INSTANCE = new TikaUrlCharSequenceNormalizer();
+
+ public static TikaUrlCharSequenceNormalizer getInstance() {
+ return INSTANCE;
+ }
+
+ private TikaUrlCharSequenceNormalizer() {
+ }
+
+ @Override
+ public CharSequence normalize(CharSequence charSequence) {
+ String modified = URL_REGEX.matcher(charSequence).replaceAll(" ");
+ return MAIL_REGEX.matcher(modified).replaceAll(" ");
+ }
+ }
+
+ private static class AlphaIdeographSequenceNormalizer implements CharSequenceNormalizer {
+ private static final Pattern REGEX = Pattern.compile("[^\\p{IsAlphabetic}\\p{IsIdeographic}]+");
+ private static final AlphaIdeographSequenceNormalizer INSTANCE = new AlphaIdeographSequenceNormalizer();
+
+ public static AlphaIdeographSequenceNormalizer getInstance() {
+ return INSTANCE;
+ }
+
+ private AlphaIdeographSequenceNormalizer() {
+ }
+
+ @Override
+ public CharSequence normalize(CharSequence charSequence) {
+ return REGEX.matcher(charSequence).replaceAll(" ");
+ }
+ }
+}
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/langid/ProbingLanguageDetector.java b/tika-langdetect/tika-langdetect-opennlp/src/main/java/org/apache/tika/langdetect/opennlp/ProbingLanguageDetector.java
similarity index 97%
rename from tika-eval/src/main/java/org/apache/tika/eval/langid/ProbingLanguageDetector.java
rename to tika-langdetect/tika-langdetect-opennlp/src/main/java/org/apache/tika/langdetect/opennlp/ProbingLanguageDetector.java
index 12e9e27..5d801b0 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/langid/ProbingLanguageDetector.java
+++ b/tika-langdetect/tika-langdetect-opennlp/src/main/java/org/apache/tika/langdetect/opennlp/ProbingLanguageDetector.java
@@ -1,4 +1,4 @@
-package org.apache.tika.eval.langid;
+package org.apache.tika.langdetect.opennlp;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@@ -26,12 +26,6 @@ import opennlp.tools.langdetect.LanguageDetector;
import opennlp.tools.langdetect.LanguageDetectorModel;
import opennlp.tools.util.normalizer.AggregateCharSequenceNormalizer;
import opennlp.tools.util.normalizer.CharSequenceNormalizer;
-import opennlp.tools.util.normalizer.EmojiCharSequenceNormalizer;
-import opennlp.tools.util.normalizer.NumberCharSequenceNormalizer;
-import opennlp.tools.util.normalizer.ShrinkCharSequenceNormalizer;
-import opennlp.tools.util.normalizer.TwitterCharSequenceNormalizer;
-import opennlp.tools.util.normalizer.UrlCharSequenceNormalizer;
-import org.apache.commons.lang3.mutable.MutableInt;
/**
* Implements learnable Language Detector.
@@ -102,6 +96,7 @@ class ProbingLanguageDetector implements LanguageDetector {
private CharSequenceNormalizer normalizer;
private LanguageDetectorModel model;
+
/**
* Initializes the current instance with a language detector model. Default feature
* generation is used.
@@ -428,4 +423,19 @@ class ProbingLanguageDetector implements LanguageDetector {
}
}
}
+
+ //minimal in-file replacement for commons-lang3's MutableInt so this module
+ //does not need a commons-lang3 dependency; only the operations actually
+ //used by the enclosing detector (construct + increment) are provided
+ private static class MutableInt {
+ //current counter value; read directly by the enclosing class
+ private int i;
+ //starts the counter at zero
+ MutableInt() {
+ this(0);
+ }
+
+ MutableInt(int i) {
+ this.i = i;
+ }
+
+ //bumps the counter by one
+ void increment() {
+ i++;
+ }
+ }
}
diff --git a/tika-eval/src/main/resources/opennlp/model_20190626.bin b/tika-langdetect/tika-langdetect-opennlp/src/main/resources/opennlp_langdetect_model_20190626.bin
similarity index 100%
rename from tika-eval/src/main/resources/opennlp/model_20190626.bin
rename to tika-langdetect/tika-langdetect-opennlp/src/main/resources/opennlp_langdetect_model_20190626.bin
diff --git a/tika-langdetect/tika-langdetect-opennlp/src/test/java/org/apache/tika/langdetect/opennlp/OpenNLPDetectorTest.java b/tika-langdetect/tika-langdetect-opennlp/src/test/java/org/apache/tika/langdetect/opennlp/OpenNLPDetectorTest.java
new file mode 100644
index 0000000..05dc334
--- /dev/null
+++ b/tika-langdetect/tika-langdetect-opennlp/src/test/java/org/apache/tika/langdetect/opennlp/OpenNLPDetectorTest.java
@@ -0,0 +1,68 @@
+package org.apache.tika.langdetect.opennlp;
+
+import org.apache.tika.io.IOUtils;
+import org.apache.tika.langdetect.LanguageDetectorTest;
+import org.apache.tika.language.detect.LanguageResult;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.Reader;
+import java.net.URISyntaxException;
+import java.nio.charset.StandardCharsets;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import static org.junit.Assert.assertEquals;
+
+public class OpenNLPDetectorTest {
+
+ //maps the two-letter (Optimaize-style) codes that name the shared
+ //language-tests fixture files to the ISO 639-3 codes that the
+ //OpenNLP model reports
+ static Map<String, String> OPTIMAIZE_TO_OPENNLP = new HashMap<>();
+
+ @BeforeClass
+ public static void setUp() {
+ OPTIMAIZE_TO_OPENNLP.put("da", "dan");
+ OPTIMAIZE_TO_OPENNLP.put("de", "deu");
+ OPTIMAIZE_TO_OPENNLP.put("el", "ell");
+ OPTIMAIZE_TO_OPENNLP.put("en", "eng");
+ OPTIMAIZE_TO_OPENNLP.put("es", "spa");
+ OPTIMAIZE_TO_OPENNLP.put("et", "est");
+ OPTIMAIZE_TO_OPENNLP.put("fi", "fin");
+ OPTIMAIZE_TO_OPENNLP.put("fr", "fra");
+ OPTIMAIZE_TO_OPENNLP.put("it", "ita");
+ OPTIMAIZE_TO_OPENNLP.put("ja", "jpn");
+ OPTIMAIZE_TO_OPENNLP.put("lt", "lit");
+ OPTIMAIZE_TO_OPENNLP.put("nl", "nld");
+ OPTIMAIZE_TO_OPENNLP.put("pt", "por");
+ OPTIMAIZE_TO_OPENNLP.put("sv", "swe");
+ OPTIMAIZE_TO_OPENNLP.put("th", "tha");
+ OPTIMAIZE_TO_OPENNLP.put("zh", "cmn");
+ }
+
+ //for each fixture language, the top detection result must be the
+ //expected ISO 639-3 code; the detector is reset between languages
+ //so buffered text does not leak across iterations
+ @Test
+ public void languageTests() throws Exception {
+ OpenNLPDetector detector = new OpenNLPDetector();
+ for (String lang : OPTIMAIZE_TO_OPENNLP.keySet()) {
+ String openNLPLang = OPTIMAIZE_TO_OPENNLP.get(lang);
+ detector.addText(getLangText(lang));
+ List<LanguageResult> results = detector.detectAll();
+ assertEquals(openNLPLang, results.get(0).getLanguage());
+ detector.reset();
+ }
+ }
+
+ //loads the UTF-8 fixture for the given two-letter code; resolved
+ //against LanguageDetectorTest, so the fixtures are presumably shared
+ //via the tika-langdetect-commons test-jar -- TODO confirm
+ private CharSequence getLangText(String lang) throws IOException {
+ try (Reader reader = new InputStreamReader(
+ LanguageDetectorTest.class.getResourceAsStream("language-tests/"+lang+".test")
+ , StandardCharsets.UTF_8)) {
+ return IOUtils.toString(reader);
+ }
+ }
+
+}