You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2017/04/12 20:57:46 UTC

[tika] branch 2.x updated (6b9e36e -> d2907f4)

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a change to branch 2.x
in repository https://gitbox.apache.org/repos/asf/tika.git.

      from  6b9e36e   TIKA-2323
       new  110247f   turn off debug statement
       new  d2907f4   TIKA-2325

The 2 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "adds" were already present in the repository and have only
been added to this reference.


Summary of changes:
 .../org/apache/tika/eval/AbstractProfiler.java     | 21 ++++++++++++----
 .../java/org/apache/tika/eval/ExtractComparer.java |  1 +
 .../java/org/apache/tika/eval/ExtractProfiler.java |  1 +
 .../tika/eval/batch/EvalConsumersBuilder.java      |  6 ++++-
 .../tika/eval/tokens/CommonTokenCountManager.java  |  5 ++--
 .../main/resources/tika-eval-comparison-config.xml |  2 ++
 .../main/resources/tika-eval-profiler-config.xml   |  2 ++
 .../org/apache/tika/eval/SimpleComparerTest.java   | 28 ++++++++++++++++++++--
 .../java/org/apache/tika/eval/TikaEvalCLITest.java |  1 -
 .../extractsA/file13_attachANotB.doc.json          |  3 ++-
 .../org/apache/tika/parser/pdf/PDFParserTest.java  |  1 -
 11 files changed, 59 insertions(+), 12 deletions(-)

-- 
To stop receiving notification emails like this one, please contact
"commits@tika.apache.org" <co...@tika.apache.org>.

[tika] 01/02: turn off debug statement

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch 2.x
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 110247fcf0f77a85aa0385dae93a51f8612b43db
Author: tballison <ta...@mitre.org>
AuthorDate: Wed Apr 12 16:49:33 2017 -0400

    turn off debug statement
---
 .../src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java          | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index 3e655dc..cf0947f 100644
--- a/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++ b/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -652,7 +652,6 @@ public class PDFParserTest extends TikaTest {
         context.set(PDFParserConfig.class, config);
 
         List<Metadata> metadatas = getRecursiveMetadata("testPDF_JBIG2.pdf", context);
-        debug(metadatas);
         assertContains("test images compressed using JBIG2",
                 metadatas.get(0).get(RecursiveParserWrapper.TIKA_CONTENT));
 

-- 
To stop receiving notification emails like this one, please contact
"commits@tika.apache.org" <co...@tika.apache.org>.

[tika] 02/02: TIKA-2325

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch 2.x
in repository https://gitbox.apache.org/repos/asf/tika.git

commit d2907f41a72f13d3ec877e736bb299999ef86ff7
Author: tballison <ta...@mitre.org>
AuthorDate: Wed Apr 12 16:57:37 2017 -0400

    TIKA-2325
---
 .../org/apache/tika/eval/AbstractProfiler.java     | 21 ++++++++++++----
 .../java/org/apache/tika/eval/ExtractComparer.java |  1 +
 .../java/org/apache/tika/eval/ExtractProfiler.java |  1 +
 .../tika/eval/batch/EvalConsumersBuilder.java      |  6 ++++-
 .../tika/eval/tokens/CommonTokenCountManager.java  |  5 ++--
 .../main/resources/tika-eval-comparison-config.xml |  2 ++
 .../main/resources/tika-eval-profiler-config.xml   |  2 ++
 .../org/apache/tika/eval/SimpleComparerTest.java   | 28 ++++++++++++++++++++--
 .../java/org/apache/tika/eval/TikaEvalCLITest.java |  1 -
 .../extractsA/file13_attachANotB.doc.json          |  3 ++-
 10 files changed, 59 insertions(+), 11 deletions(-)

diff --git a/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java b/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java
index 9a96e5b..a9ff1a5 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java
@@ -162,8 +162,8 @@ public abstract class AbstractProfiler extends FileResourceConsumer {
      * @param p path to the common_tokens directory.  If this is null, try to load from classPath
      * @throws IOException
      */
-    public static void loadCommonTokens(Path p) throws IOException {
-        commonTokenCountManager = new CommonTokenCountManager(p);
+    public static void loadCommonTokens(Path p, String defaultLangCode) throws IOException {
+        commonTokenCountManager = new CommonTokenCountManager(p, defaultLangCode);
     }
 
     public AbstractProfiler(ArrayBlockingQueue<FileResource> fileQueue,
@@ -541,16 +541,29 @@ public abstract class AbstractProfiler extends FileResourceConsumer {
         }
         List<DetectedLanguage> probabilities = langIder.getProbabilities(s);
         if (probabilities.size() > 0) {
-            data.put(Cols.LANG_ID_1, probabilities.get(0).getLocale().getLanguage());
+            data.put(Cols.LANG_ID_1, getLangString(probabilities.get(0)));
             data.put(Cols.LANG_ID_PROB_1,
             Double.toString(probabilities.get(0).getProbability()));
         }
         if (probabilities.size() > 1) {
-            data.put(Cols.LANG_ID_2, probabilities.get(1).getLocale().getLanguage());
+            data.put(Cols.LANG_ID_2, getLangString(probabilities.get(1)));
             data.put(Cols.LANG_ID_PROB_2,
             Double.toString(probabilities.get(1).getProbability()));
         }
+    }
 
+    private String getLangString(DetectedLanguage detectedLanguage) {
+        //So that we have mapping between lang id and common-tokens file names
+        String lang = detectedLanguage.getLocale().getLanguage();
+        if ("zh".equals(lang)) {
+            if (detectedLanguage.getLocale().getRegion().isPresent()) {
+                lang += "-" + detectedLanguage.getLocale().getRegion().get().toLowerCase(Locale.US);
+            } else {
+                //hope for the best
+                lang += "-cn";
+            }
+        }
+        return lang;
     }
 
     void getFileTypes(Metadata metadata, Map<Cols, String> output) {
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/ExtractComparer.java b/tika-eval/src/main/java/org/apache/tika/eval/ExtractComparer.java
index 65606d0..7b006df 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/ExtractComparer.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/ExtractComparer.java
@@ -83,6 +83,7 @@ public class ExtractComparer extends AbstractProfiler {
                 .addOption("maxTokens", true, "maximum tokens to process, default=200000")
                 .addOption("maxContentLength", true, "truncate content beyond this length for calculating 'contents' stats, default=1000000")
                 .addOption("maxContentLengthForLangId", true, "truncate content beyond this length for language id, default=50000")
+                .addOption("defaultLangCode", true, "which language to use for common words if no 'common words' file exists for the langid result")
         ;
     }
 
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/ExtractProfiler.java b/tika-eval/src/main/java/org/apache/tika/eval/ExtractProfiler.java
index 514778f..d5f9af3 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/ExtractProfiler.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/ExtractProfiler.java
@@ -71,6 +71,7 @@ public class ExtractProfiler extends AbstractProfiler {
                 .addOption("maxTokens", true, "maximum tokens to process, default=200000")
                 .addOption("maxContentLength", true, "truncate content beyond this length for calculating 'contents' stats, default=1000000")
                 .addOption("maxContentLengthForLangId", true, "truncate content beyond this length for language id, default=50000")
+                .addOption("defaultLangCode", true, "which language to use for common words if no 'common words' file exists for the langid result")
 
         ;
 
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/batch/EvalConsumersBuilder.java b/tika-eval/src/main/java/org/apache/tika/eval/batch/EvalConsumersBuilder.java
index 55bb523..a5a912f 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/batch/EvalConsumersBuilder.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/batch/EvalConsumersBuilder.java
@@ -70,9 +70,13 @@ public class EvalConsumersBuilder extends AbstractConsumersBuilder {
         }
 
         Path commonTokens = getPath(localAttrs, "commonTokens");
+        String defaultLangCode = localAttrs.get("defaultLangCode");
+        if (defaultLangCode == null || "".equals(defaultLangCode)) {
+            defaultLangCode = "en";
+        }
         //can be null, in which case will load from memory
         try {
-            AbstractProfiler.loadCommonTokens(commonTokens);
+            AbstractProfiler.loadCommonTokens(commonTokens, defaultLangCode);
         } catch (IOException e) {
             throw new RuntimeException(e);
         }
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/tokens/CommonTokenCountManager.java b/tika-eval/src/main/java/org/apache/tika/eval/tokens/CommonTokenCountManager.java
index 9997152..e9248f6 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/tokens/CommonTokenCountManager.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/tokens/CommonTokenCountManager.java
@@ -47,9 +47,10 @@ public class CommonTokenCountManager {
 
     //if we have no model or if no langid is passed in
     //make this configurable
-    String defaultLangCode = "en";
+    private final String defaultLangCode;
 
-    public CommonTokenCountManager(Path commonTokensDir) throws IOException {
+    public CommonTokenCountManager(Path commonTokensDir, String defaultLangCode) throws IOException {
+        this.defaultLangCode = defaultLangCode;
         this.commonTokensDir = commonTokensDir;
         tryToLoad(defaultLangCode);
         //if you couldn't load it, make sure to add an empty
diff --git a/tika-eval/src/main/resources/tika-eval-comparison-config.xml b/tika-eval/src/main/resources/tika-eval-comparison-config.xml
index 887a3e7..1ddcda2 100644
--- a/tika-eval/src/main/resources/tika-eval-comparison-config.xml
+++ b/tika-eval/src/main/resources/tika-eval-comparison-config.xml
@@ -64,6 +64,8 @@
                 description="truncate content beyond this length for calculating 'contents' stats, default=1000000"/>
         <option opt="maxContentLengthForLangId" hasArg="true"
                 description="truncate content beyond this length for language id, default=50000"/>
+        <option opt="defaultLangCode" hasArg="true"
+                description="which language to use for common words if no 'common words' file exists for the langid result"/>
 
 
     </commandline>
diff --git a/tika-eval/src/main/resources/tika-eval-profiler-config.xml b/tika-eval/src/main/resources/tika-eval-profiler-config.xml
index a7e6d03..059ee49 100644
--- a/tika-eval/src/main/resources/tika-eval-profiler-config.xml
+++ b/tika-eval/src/main/resources/tika-eval-profiler-config.xml
@@ -59,6 +59,8 @@
                 description="truncate content beyond this length for calculating 'contents' stats, default=1000000"/>
         <option opt="maxContentLengthForLangId" hasArg="true"
                 description="truncate content beyond this length for language id, default=50000"/>
+        <option opt="defaultLangCode" hasArg="true"
+                description="which language to use for common words if no 'common words' file exists for the langid result"/>
 
 
 
diff --git a/tika-eval/src/test/java/org/apache/tika/eval/SimpleComparerTest.java b/tika-eval/src/test/java/org/apache/tika/eval/SimpleComparerTest.java
index 1e69ce9..257a607 100644
--- a/tika-eval/src/test/java/org/apache/tika/eval/SimpleComparerTest.java
+++ b/tika-eval/src/test/java/org/apache/tika/eval/SimpleComparerTest.java
@@ -62,7 +62,7 @@ public class SimpleComparerTest extends TikaTest {
                 new ExtractReader(ExtractReader.ALTER_METADATA_LIST.AS_IS,
                         IGNORE_LENGTH, IGNORE_LENGTH),
                 writer);
-        AbstractProfiler.loadCommonTokens(this.getResourceAsFile("/common_tokens").toPath());
+        AbstractProfiler.loadCommonTokens(this.getResourceAsFile("/common_tokens").toPath(), "en");
         LanguageIDWrapper.loadBuiltInModels();
     }
 
@@ -137,6 +137,30 @@ public class SimpleComparerTest extends TikaTest {
 
     }
 
+    @Test
+    public void testChinese() throws Exception {
+        //make sure that language id matches common words
+        //file names.  The test file contains MT'd Simplified Chinese with
+        //known "common words" appended at end.
+
+        EvalFilePaths fpsA = new EvalFilePaths(
+                Paths.get("file13_attachANotB.doc.json"),
+                getResourceAsFile("/test-dirs/extractsA/file13_attachANotB.doc.json").toPath()
+        );
+        EvalFilePaths fpsB = new EvalFilePaths(
+                Paths.get("non-existent.json"),
+                getResourceAsFile("/test-dirs/extractsB/non-existent.json").toPath());
+
+        comparer.compareFiles(fpsA, fpsB);
+
+        List<Map<Cols, String>> tableInfos = writer.getTable(ExtractComparer.CONTENTS_TABLE_A);
+
+        Map<Cols, String> row = tableInfos.get(0);
+        assertEquals("122", row.get(Cols.TOKEN_LENGTH_SUM));
+        assertEquals("3", row.get(Cols.NUM_COMMON_TOKENS));
+        assertEquals("zh-cn", row.get(Cols.COMMON_TOKENS_LANG));
+
+    }
 
     @Test
     public void testEmpty() throws Exception {
@@ -245,7 +269,7 @@ public class SimpleComparerTest extends TikaTest {
     @Ignore
     public void testDebug() throws Exception {
         Path commonTokens = Paths.get(getResourceAsFile("/common_tokens_short.txt").toURI());
-        AbstractProfiler.loadCommonTokens(commonTokens);
+        AbstractProfiler.loadCommonTokens(commonTokens, "en");
         EvalFilePaths fpsA = new EvalFilePaths(
                 Paths.get("file1.pdf.json"),
                 getResourceAsFile("/test-dirs/extractsA/file1.pdf.json").toPath()
diff --git a/tika-eval/src/test/java/org/apache/tika/eval/TikaEvalCLITest.java b/tika-eval/src/test/java/org/apache/tika/eval/TikaEvalCLITest.java
index 288f042..8151508 100644
--- a/tika-eval/src/test/java/org/apache/tika/eval/TikaEvalCLITest.java
+++ b/tika-eval/src/test/java/org/apache/tika/eval/TikaEvalCLITest.java
@@ -42,7 +42,6 @@ import org.junit.Test;
 
 public class TikaEvalCLITest extends TikaTest {
     //TODO: these barely reach the minimal acceptable stage for unit tests
-    //but we have to start somewhere on the integration tests
 
     private static Path extractsDir = Paths.get("src/test/resources/test-dirs");
 
diff --git a/tika-eval/src/test/resources/test-dirs/extractsA/file13_attachANotB.doc.json b/tika-eval/src/test/resources/test-dirs/extractsA/file13_attachANotB.doc.json
index 5371c87..048c853 100644
--- a/tika-eval/src/test/resources/test-dirs/extractsA/file13_attachANotB.doc.json
+++ b/tika-eval/src/test/resources/test-dirs/extractsA/file13_attachANotB.doc.json
@@ -1,6 +1,7 @@
 [{
   "Content-Type":"text/plain",
-  "X-TIKA:content":"the quick brown fox fox fox jumped over the lazy lazy dog"
+  "_comment" : "simplified",
+  "X-TIKA:content":"\u8c03\u6574\u6bcf\u4e00\u4e2a\u5fc3\u810f\u548c\u6bcf\u4e2a\u58f0\u97f3\uff0c\u6295\u6807\u6bcf\u4e2a\u62a4\u7406\u63d0\u53d6;\u8ba9\u5927\u5bb6\u4e00\u8d77\u6b22\u4e50\uff0c\u8d5e\u7f8e\u8001\u62ff\u9a9a.\u8c03\u6574\u6bcf\u4e00\u4e2a\u5fc3\u810f\u548c\u6bcf\u4e2a\u58f0\u97f3\uff0c\u6295\u6807\u6bcf\u4e2a\u62a4\u7406\u63d0\u53d6;\u8ba9\u5927\u5bb6\u4e00\u8d77\u6b22\u4e50\uff0c\u8d5e\u7f8e\u8001\u62ff\u9a9a \u72d0\u72f8\u72d0\u72f8\u72d0\u72f8 "
   },
   {
     "Content-Type":"text/plain",

-- 
To stop receiving notification emails like this one, please contact
"commits@tika.apache.org" <co...@tika.apache.org>.