You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2019/07/10 11:21:55 UTC

[tika] 01/02: TIKA-2790 -- make sure the lengths used to calculate the advance are based on the original string, not the normalized string

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 2153a0279cbfafffe707fce7b04e6d48216cddf8
Author: TALLISON <ta...@apache.org>
AuthorDate: Wed Jul 10 07:17:19 2019 -0400

    TIKA-2790 -- make sure the lengths used to calculate the advance
    are based on the original string, not the normalized string
---
 .../tika/eval/langid/ProbingLanguageDetector.java  | 28 +++++++++++++++-------
 .../org/apache/tika/eval/langid/LangIdTest.java    |  7 +++++-
 2 files changed, 25 insertions(+), 10 deletions(-)

diff --git a/tika-eval/src/main/java/org/apache/tika/eval/langid/ProbingLanguageDetector.java b/tika-eval/src/main/java/org/apache/tika/eval/langid/ProbingLanguageDetector.java
index 84d9805..12e9e27 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/langid/ProbingLanguageDetector.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/langid/ProbingLanguageDetector.java
@@ -53,7 +53,7 @@ import org.apache.commons.lang3.mutable.MutableInt;
  * </p>
  *
  */
-public class ProbingLanguageDetector implements LanguageDetector {
+class ProbingLanguageDetector implements LanguageDetector {
 
     /**
      * Default chunk size (in codepoints) to take from the
@@ -73,7 +73,7 @@ public class ProbingLanguageDetector implements LanguageDetector {
      * Default minimum difference in confidence between the language with
      * the highest confidence and the language with the second highest confidence.
      */
-    public static final double DEFAULT_MIN_DIFF = 0.10;
+    public static final double DEFAULT_MIN_DIFF = 0.20;
 
     /**
      * Default absolute maximum length of the String (in codepoints) to process
@@ -131,17 +131,17 @@ public class ProbingLanguageDetector implements LanguageDetector {
         int nGrams = 0;
         while (true) {
             int actualChunkSize = (start + chunkSize > maxLength) ? maxLength - start : chunkSize;
-            CharSequence normed = chunk(content, start, actualChunkSize);
 
-            int[] chunk = normed.codePoints().toArray();
-            if (chunk.length == 0) {
+            CSAndLength csAndLength = chunk(content, start, actualChunkSize);
+            int[] chunk = csAndLength.normed.codePoints().toArray();
+            if (csAndLength.originalLength == 0) {
                 if (currPredictions == null) {
                     return predict(ngramCounts);
                 } else {
                     return currPredictions;
                 }
             }
-            start += chunk.length;
+            start += csAndLength.originalLength;
             ngrammer.reset(chunk);
 
             while (ngrammer.hasNext()) {
@@ -338,15 +338,25 @@ public class ProbingLanguageDetector implements LanguageDetector {
         return true;
     }
 
-    private CharSequence chunk(CharSequence content, int start, int chunkSize) {
+    private CSAndLength chunk(CharSequence content, int start, int chunkSize) {
         if (start == 0 && chunkSize > content.length()) {
-            return normalizer.normalize(content);
+            int length = content.codePoints().toArray().length;
+            return new CSAndLength(normalizer.normalize(content), length);
         }
         int[] codepoints = content.codePoints().skip(start).limit(chunkSize).toArray();
         String chunk = new String(codepoints, 0, codepoints.length);
-        return normalizer.normalize(chunk);
+        return new CSAndLength(normalizer.normalize(chunk), codepoints.length);
     }
 
+    private static class CSAndLength {
+        private final CharSequence normed;
+        private final int originalLength;
+
+        public CSAndLength(CharSequence normed, int originalLength) {
+            this.normed = normed;
+            this.originalLength = originalLength;
+        }
+    }
     private static class CharIntNGrammer implements Iterator<String> {
         private String next;
         private int pos = 0;
diff --git a/tika-eval/src/test/java/org/apache/tika/eval/langid/LangIdTest.java b/tika-eval/src/test/java/org/apache/tika/eval/langid/LangIdTest.java
index e845b5b..2740295 100644
--- a/tika-eval/src/test/java/org/apache/tika/eval/langid/LangIdTest.java
+++ b/tika-eval/src/test/java/org/apache/tika/eval/langid/LangIdTest.java
@@ -23,15 +23,20 @@ import java.util.Locale;
 import java.util.Set;
 
 import org.apache.tika.eval.tokens.CommonTokenCountManager;
+import org.junit.BeforeClass;
 import org.junit.Test;
 
 public class LangIdTest {
 
+    @BeforeClass
+    public static void init() throws Exception {
+        LanguageIDWrapper.loadBuiltInModels();
+    }
+
     @Test
     public void testCommonTokensCoverage() throws Exception {
         //make sure that there is a common tokens file for every
         //language
-        LanguageIDWrapper.loadBuiltInModels();
         LanguageIDWrapper wrapper = new LanguageIDWrapper();
         CommonTokenCountManager commonTokens = new CommonTokenCountManager(null, "eng");
         for (String lang : wrapper.getSupportedLanguages()) {