You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2019/07/10 11:21:54 UTC

[tika] branch master updated (9a8ef5e -> ce693ed)

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git.


    from 9a8ef5e  TIKA-2898 -- fix for master vs branch_1x diff
     new 2153a02  TIKA-2790 -- make sure lengths to calculate advance are based on the original string, not the normalized string
     new ce693ed  TIKA-2903 -- close RereadableInputStream's storeOutputStream via Peter Fassev

The 2 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 .../apache/tika/utils/RereadableInputStream.java   |  6 +++++
 .../tika/eval/langid/ProbingLanguageDetector.java  | 28 +++++++++++++++-------
 .../org/apache/tika/eval/langid/LangIdTest.java    |  7 +++++-
 3 files changed, 31 insertions(+), 10 deletions(-)


[tika] 02/02: TIKA-2903 -- close RereadableInputStream's storeOutputStream via Peter Fassev

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git

commit ce693ed2acf5f70c6e8d919229be96456bf0ce9e
Author: TALLISON <ta...@apache.org>
AuthorDate: Wed Jul 10 07:21:20 2019 -0400

    TIKA-2903 -- close RereadableInputStream's storeOutputStream via Peter Fassev
---
 .../src/main/java/org/apache/tika/utils/RereadableInputStream.java  | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/tika-core/src/main/java/org/apache/tika/utils/RereadableInputStream.java b/tika-core/src/main/java/org/apache/tika/utils/RereadableInputStream.java
index f352093..b3a528f 100644
--- a/tika-core/src/main/java/org/apache/tika/utils/RereadableInputStream.java
+++ b/tika-core/src/main/java/org/apache/tika/utils/RereadableInputStream.java
@@ -219,6 +219,12 @@ public class RereadableInputStream extends InputStream {
      */
     public void close() throws IOException {
         closeStream();
+
+        if (storeOutputStream != null) {
+            storeOutputStream.close();
+            storeOutputStream = null;
+        }
+
         super.close();
         if (storeFile != null) {
             storeFile.delete();


[tika] 01/02: TIKA-2790 -- make sure lengths to calculate advance are based on the original string, not the normalized string

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 2153a0279cbfafffe707fce7b04e6d48216cddf8
Author: TALLISON <ta...@apache.org>
AuthorDate: Wed Jul 10 07:17:19 2019 -0400

    TIKA-2790 -- make sure lengths to calculate advance
    are based on the original string, not the normalized string
---
 .../tika/eval/langid/ProbingLanguageDetector.java  | 28 +++++++++++++++-------
 .../org/apache/tika/eval/langid/LangIdTest.java    |  7 +++++-
 2 files changed, 25 insertions(+), 10 deletions(-)

diff --git a/tika-eval/src/main/java/org/apache/tika/eval/langid/ProbingLanguageDetector.java b/tika-eval/src/main/java/org/apache/tika/eval/langid/ProbingLanguageDetector.java
index 84d9805..12e9e27 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/langid/ProbingLanguageDetector.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/langid/ProbingLanguageDetector.java
@@ -53,7 +53,7 @@ import org.apache.commons.lang3.mutable.MutableInt;
  * </p>
  *
  */
-public class ProbingLanguageDetector implements LanguageDetector {
+class ProbingLanguageDetector implements LanguageDetector {
 
     /**
      * Default chunk size (in codepoints) to take from the
@@ -73,7 +73,7 @@ public class ProbingLanguageDetector implements LanguageDetector {
      * Default minimum difference in confidence between the language with
      * the highest confidence and the language with the second highest confidence.
      */
-    public static final double DEFAULT_MIN_DIFF = 0.10;
+    public static final double DEFAULT_MIN_DIFF = 0.20;
 
     /**
      * Default absolute maximum length of the String (in codepoints) to process
@@ -131,17 +131,17 @@ public class ProbingLanguageDetector implements LanguageDetector {
         int nGrams = 0;
         while (true) {
             int actualChunkSize = (start + chunkSize > maxLength) ? maxLength - start : chunkSize;
-            CharSequence normed = chunk(content, start, actualChunkSize);
 
-            int[] chunk = normed.codePoints().toArray();
-            if (chunk.length == 0) {
+            CSAndLength csAndLength = chunk(content, start, actualChunkSize);
+            int[] chunk = csAndLength.normed.codePoints().toArray();
+            if (csAndLength.originalLength == 0) {
                 if (currPredictions == null) {
                     return predict(ngramCounts);
                 } else {
                     return currPredictions;
                 }
             }
-            start += chunk.length;
+            start += csAndLength.originalLength;
             ngrammer.reset(chunk);
 
             while (ngrammer.hasNext()) {
@@ -338,15 +338,25 @@ public class ProbingLanguageDetector implements LanguageDetector {
         return true;
     }
 
-    private CharSequence chunk(CharSequence content, int start, int chunkSize) {
+    private CSAndLength chunk(CharSequence content, int start, int chunkSize) {
         if (start == 0 && chunkSize > content.length()) {
-            return normalizer.normalize(content);
+            int length = content.codePoints().toArray().length;
+            return new CSAndLength(normalizer.normalize(content), length);
         }
         int[] codepoints = content.codePoints().skip(start).limit(chunkSize).toArray();
         String chunk = new String(codepoints, 0, codepoints.length);
-        return normalizer.normalize(chunk);
+        return new CSAndLength(normalizer.normalize(chunk), codepoints.length);
     }
 
+    private static class CSAndLength {
+        private final CharSequence normed;
+        private final int originalLength;
+
+        public CSAndLength(CharSequence normed, int originalLength) {
+            this.normed = normed;
+            this.originalLength = originalLength;
+        }
+    }
     private static class CharIntNGrammer implements Iterator<String> {
         private String next;
         private int pos = 0;
diff --git a/tika-eval/src/test/java/org/apache/tika/eval/langid/LangIdTest.java b/tika-eval/src/test/java/org/apache/tika/eval/langid/LangIdTest.java
index e845b5b..2740295 100644
--- a/tika-eval/src/test/java/org/apache/tika/eval/langid/LangIdTest.java
+++ b/tika-eval/src/test/java/org/apache/tika/eval/langid/LangIdTest.java
@@ -23,15 +23,20 @@ import java.util.Locale;
 import java.util.Set;
 
 import org.apache.tika.eval.tokens.CommonTokenCountManager;
+import org.junit.BeforeClass;
 import org.junit.Test;
 
 public class LangIdTest {
 
+    @BeforeClass
+    public static void init() throws Exception {
+        LanguageIDWrapper.loadBuiltInModels();
+    }
+
     @Test
     public void testCommonTokensCoverage() throws Exception {
         //make sure that there is a common tokens file for every
         //language
-        LanguageIDWrapper.loadBuiltInModels();
         LanguageIDWrapper wrapper = new LanguageIDWrapper();
         CommonTokenCountManager commonTokens = new CommonTokenCountManager(null, "eng");
         for (String lang : wrapper.getSupportedLanguages()) {