You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2019/07/10 11:21:55 UTC
[tika] 01/02: TIKA-2790 -- make sure the lengths used to calculate the advance
are based on the original string, not the normalized string
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git
commit 2153a0279cbfafffe707fce7b04e6d48216cddf8
Author: TALLISON <ta...@apache.org>
AuthorDate: Wed Jul 10 07:17:19 2019 -0400
TIKA-2790 -- make sure the lengths used to calculate the advance
are based on the original string, not the normalized string
---
.../tika/eval/langid/ProbingLanguageDetector.java | 28 +++++++++++++++-------
.../org/apache/tika/eval/langid/LangIdTest.java | 7 +++++-
2 files changed, 25 insertions(+), 10 deletions(-)
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/langid/ProbingLanguageDetector.java b/tika-eval/src/main/java/org/apache/tika/eval/langid/ProbingLanguageDetector.java
index 84d9805..12e9e27 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/langid/ProbingLanguageDetector.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/langid/ProbingLanguageDetector.java
@@ -53,7 +53,7 @@ import org.apache.commons.lang3.mutable.MutableInt;
* </p>
*
*/
-public class ProbingLanguageDetector implements LanguageDetector {
+class ProbingLanguageDetector implements LanguageDetector {
/**
* Default chunk size (in codepoints) to take from the
@@ -73,7 +73,7 @@ public class ProbingLanguageDetector implements LanguageDetector {
* Default minimum difference in confidence between the language with
* the highest confidence and the language with the second highest confidence.
*/
- public static final double DEFAULT_MIN_DIFF = 0.10;
+ public static final double DEFAULT_MIN_DIFF = 0.20;
/**
* Default absolute maximum length of the String (in codepoints) to process
@@ -131,17 +131,17 @@ public class ProbingLanguageDetector implements LanguageDetector {
int nGrams = 0;
while (true) {
int actualChunkSize = (start + chunkSize > maxLength) ? maxLength - start : chunkSize;
- CharSequence normed = chunk(content, start, actualChunkSize);
- int[] chunk = normed.codePoints().toArray();
- if (chunk.length == 0) {
+ CSAndLength csAndLength = chunk(content, start, actualChunkSize);
+ int[] chunk = csAndLength.normed.codePoints().toArray();
+ if (csAndLength.originalLength == 0) {
if (currPredictions == null) {
return predict(ngramCounts);
} else {
return currPredictions;
}
}
- start += chunk.length;
+ start += csAndLength.originalLength;
ngrammer.reset(chunk);
while (ngrammer.hasNext()) {
@@ -338,15 +338,25 @@ public class ProbingLanguageDetector implements LanguageDetector {
return true;
}
- private CharSequence chunk(CharSequence content, int start, int chunkSize) {
+ private CSAndLength chunk(CharSequence content, int start, int chunkSize) {
if (start == 0 && chunkSize > content.length()) {
- return normalizer.normalize(content);
+ int length = content.codePoints().toArray().length;
+ return new CSAndLength(normalizer.normalize(content), length);
}
int[] codepoints = content.codePoints().skip(start).limit(chunkSize).toArray();
String chunk = new String(codepoints, 0, codepoints.length);
- return normalizer.normalize(chunk);
+ return new CSAndLength(normalizer.normalize(chunk), codepoints.length);
}
+ private static class CSAndLength {
+ private final CharSequence normed;
+ private final int originalLength;
+
+ public CSAndLength(CharSequence normed, int originalLength) {
+ this.normed = normed;
+ this.originalLength = originalLength;
+ }
+ }
private static class CharIntNGrammer implements Iterator<String> {
private String next;
private int pos = 0;
diff --git a/tika-eval/src/test/java/org/apache/tika/eval/langid/LangIdTest.java b/tika-eval/src/test/java/org/apache/tika/eval/langid/LangIdTest.java
index e845b5b..2740295 100644
--- a/tika-eval/src/test/java/org/apache/tika/eval/langid/LangIdTest.java
+++ b/tika-eval/src/test/java/org/apache/tika/eval/langid/LangIdTest.java
@@ -23,15 +23,20 @@ import java.util.Locale;
import java.util.Set;
import org.apache.tika.eval.tokens.CommonTokenCountManager;
+import org.junit.BeforeClass;
import org.junit.Test;
public class LangIdTest {
+ @BeforeClass
+ public static void init() throws Exception {
+ LanguageIDWrapper.loadBuiltInModels();
+ }
+
@Test
public void testCommonTokensCoverage() throws Exception {
//make sure that there is a common tokens file for every
//language
- LanguageIDWrapper.loadBuiltInModels();
LanguageIDWrapper wrapper = new LanguageIDWrapper();
CommonTokenCountManager commonTokens = new CommonTokenCountManager(null, "eng");
for (String lang : wrapper.getSupportedLanguages()) {