You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2018/12/14 21:35:14 UTC

[tika] branch master updated: TIKA-2800 -- add num unique alphabetic tokens and num unique common tokens

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/master by this push:
     new c7f292b  TIKA-2800 -- add num unique alphabetic tokens and num unique common tokens
c7f292b is described below

commit c7f292b5abb08096f6f4870326a16929cb326a33
Author: TALLISON <ta...@apache.org>
AuthorDate: Fri Dec 14 16:34:58 2018 -0500

    TIKA-2800 -- add num unique alphabetic tokens and num unique common tokens
---
 .../org/apache/tika/eval/AbstractProfiler.java     |  3 +++
 .../java/org/apache/tika/eval/ExtractProfiler.java |  4 +++-
 .../main/java/org/apache/tika/eval/db/Cols.java    |  2 ++
 .../tika/eval/tokens/CommonTokenCountManager.java  | 15 ++++++++----
 .../apache/tika/eval/tokens/CommonTokenResult.java | 27 +++++++++++++++++++++-
 .../src/main/resources/comparison-reports.xml      |  4 ++--
 .../org/apache/tika/eval/SimpleComparerTest.java   |  3 +++
 7 files changed, 49 insertions(+), 9 deletions(-)

diff --git a/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java b/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java
index aa999dd..307c54e 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java
@@ -389,12 +389,15 @@ public abstract class AbstractProfiler extends FileResourceConsumer {
             LOG.error("{}", e.getMessage(), e);
         }
         data.put(Cols.COMMON_TOKENS_LANG, commonTokenResult.getLangCode());
+        data.put(Cols.NUM_UNIQUE_COMMON_TOKENS, Integer.toString(commonTokenResult.getUniqueCommonTokens()));
         data.put(Cols.NUM_COMMON_TOKENS, Integer.toString(commonTokenResult.getCommonTokens()));
         TokenStatistics tokenStatistics = tokenCounter.getTokenStatistics(fieldName);
         data.put(Cols.NUM_UNIQUE_TOKENS,
                 Integer.toString(tokenStatistics.getTotalUniqueTokens()));
         data.put(Cols.NUM_TOKENS,
                 Integer.toString(tokenStatistics.getTotalTokens()));
+        data.put(Cols.NUM_UNIQUE_ALPHABETIC_TOKENS,
+                Integer.toString(commonTokenResult.getUniqueAlphabeticTokens()));
         data.put(Cols.NUM_ALPHABETIC_TOKENS,
                 Integer.toString(commonTokenResult.getAlphabeticTokens()));
 
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/ExtractProfiler.java b/tika-eval/src/main/java/org/apache/tika/eval/ExtractProfiler.java
index ccb5011..d1b1ac6 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/ExtractProfiler.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/ExtractProfiler.java
@@ -135,10 +135,12 @@ public class ExtractProfiler extends AbstractProfiler {
     public static TableInfo CONTENTS_TABLE = new TableInfo("contents",
             new ColInfo(Cols.ID, Types.INTEGER, "PRIMARY KEY"),
             new ColInfo(Cols.CONTENT_LENGTH, Types.INTEGER),
-            new ColInfo(Cols.NUM_TOKENS, Types.INTEGER),
             new ColInfo(Cols.NUM_UNIQUE_TOKENS, Types.INTEGER),
+            new ColInfo(Cols.NUM_TOKENS, Types.INTEGER),
             new ColInfo(Cols.COMMON_TOKENS_LANG, Types.VARCHAR, 12),
+            new ColInfo(Cols.NUM_UNIQUE_COMMON_TOKENS, Types.INTEGER),
             new ColInfo(Cols.NUM_COMMON_TOKENS, Types.INTEGER),
+            new ColInfo(Cols.NUM_UNIQUE_ALPHABETIC_TOKENS, Types.INTEGER),
             new ColInfo(Cols.NUM_ALPHABETIC_TOKENS, Types.INTEGER),
             new ColInfo(Cols.TOP_N_TOKENS, Types.VARCHAR, 1024),
             new ColInfo(Cols.LANG_ID_1, Types.VARCHAR, 12),
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/db/Cols.java b/tika-eval/src/main/java/org/apache/tika/eval/db/Cols.java
index 3fa8cb5..db8c1d0 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/db/Cols.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/db/Cols.java
@@ -45,8 +45,10 @@ public enum Cols {
     CONTENT_LENGTH,
     NUM_UNIQUE_TOKENS,
     NUM_TOKENS,
+    NUM_UNIQUE_ALPHABETIC_TOKENS,
     NUM_ALPHABETIC_TOKENS, //alphabetic or ideographic tokens
     COMMON_TOKENS_LANG, //which language was used for the common tokens metric?
+    NUM_UNIQUE_COMMON_TOKENS,
     NUM_COMMON_TOKENS,
     TOP_N_TOKENS,
     LANG_ID_1,
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/tokens/CommonTokenCountManager.java b/tika-eval/src/main/java/org/apache/tika/eval/tokens/CommonTokenCountManager.java
index 1fc2ca3..5546540 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/tokens/CommonTokenCountManager.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/tokens/CommonTokenCountManager.java
@@ -65,21 +65,26 @@ public class CommonTokenCountManager {
     public CommonTokenResult countTokenOverlaps(String langCode,
                                                 Map<String, MutableInt> tokens) throws IOException {
         String actualLangCode = getActualLangCode(langCode);
-        int overlap = 0;
-        int alphabeticTokens = 0;
+        int numUniqueCommonTokens = 0;
+        int numCommonTokens = 0;
+        int numUniqueAlphabeticTokens = 0;
+        int numAlphabeticTokens = 0;
         Set<String> commonTokens = commonTokenMap.get(actualLangCode);
         for (Map.Entry<String, MutableInt> e : tokens.entrySet()) {
             String token = e.getKey();
             int count = e.getValue().intValue();
             if (AlphaIdeographFilterFactory.isAlphabetic(token.toCharArray())) {
-                alphabeticTokens += count;
+                numAlphabeticTokens += count;
+                numUniqueAlphabeticTokens++;
             }
             if (commonTokens.contains(token)) {
-                overlap += count;
+                numCommonTokens += count;
+                numUniqueCommonTokens++;
             }
 
         }
-        return new CommonTokenResult(actualLangCode, overlap, alphabeticTokens);
+        return new CommonTokenResult(actualLangCode, numUniqueCommonTokens,
+                numCommonTokens, numUniqueAlphabeticTokens, numAlphabeticTokens);
     }
 
 
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/tokens/CommonTokenResult.java b/tika-eval/src/main/java/org/apache/tika/eval/tokens/CommonTokenResult.java
index 317697a..f146153 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/tokens/CommonTokenResult.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/tokens/CommonTokenResult.java
@@ -20,12 +20,17 @@ package org.apache.tika.eval.tokens;
 public class CommonTokenResult {
 
     private final String langCode;
+    private final int uniqueCommonTokens;//types
     private final int commonTokens;
+    private final int uniqueAlphabeticTokens;
     private final int alphabeticTokens;
 
-    public CommonTokenResult(String langCode, int commonTokens, int alphabeticTokens) {
+    public CommonTokenResult(String langCode, int uniqueCommonTokens, int commonTokens,
+                             int uniqueAlphabeticTokens, int alphabeticTokens) {
         this.langCode = langCode;
+        this.uniqueCommonTokens = uniqueCommonTokens;
         this.commonTokens = commonTokens;
+        this.uniqueAlphabeticTokens = uniqueAlphabeticTokens;
         this.alphabeticTokens = alphabeticTokens;
     }
 
@@ -33,12 +38,32 @@ public class CommonTokenResult {
         return langCode;
     }
 
+    /**
+     *
+     * @return total number of "common tokens"
+     */
     public int getCommonTokens() {
         return commonTokens;
     }
 
     /**
      *
+     * @return number of unique "common tokens" (types)
+     */
+    public int getUniqueCommonTokens() {
+        return uniqueCommonTokens;
+    }
+
+    /**
+     *
+     * @return number of unique alphabetic tokens (types)
+     */
+    public int getUniqueAlphabeticTokens() {
+        return uniqueAlphabeticTokens;
+    }
+
+    /**
+     *
      * @return number of tokens that had at least one alphabetic/ideographic character
      * whether or not a common token
      */
diff --git a/tika-eval/src/main/resources/comparison-reports.xml b/tika-eval/src/main/resources/comparison-reports.xml
index 10fd9e3..eaf3bb6 100644
--- a/tika-eval/src/main/resources/comparison-reports.xml
+++ b/tika-eval/src/main/resources/comparison-reports.xml
@@ -765,8 +765,8 @@
             limit 100000
         </sql>
     </report>
-    <report reportName="contentDiffsIgnoreExceptions"
-            reportFilename="content/content_diffs_ignore_exceptions.xlsx"
+    <report reportName="contentDiffsNoExceptions"
+            reportFilename="content/content_diffs_no_exceptions.xlsx"
             format="xlsx"
             includeSql="true">
 
diff --git a/tika-eval/src/test/java/org/apache/tika/eval/SimpleComparerTest.java b/tika-eval/src/test/java/org/apache/tika/eval/SimpleComparerTest.java
index 78a8ca5..d54d41c 100644
--- a/tika-eval/src/test/java/org/apache/tika/eval/SimpleComparerTest.java
+++ b/tika-eval/src/test/java/org/apache/tika/eval/SimpleComparerTest.java
@@ -96,7 +96,9 @@ public class SimpleComparerTest extends TikaTest {
         assertEquals("70", row.get(Cols.CONTENT_LENGTH));
         assertEquals("10", row.get(Cols.NUM_UNIQUE_TOKENS));
         assertEquals("14", row.get(Cols.NUM_TOKENS));
+        assertEquals("8", row.get(Cols.NUM_UNIQUE_ALPHABETIC_TOKENS));
         assertEquals("12", row.get(Cols.NUM_ALPHABETIC_TOKENS));
+        assertEquals("3", row.get(Cols.NUM_UNIQUE_COMMON_TOKENS));
         assertEquals("6", row.get(Cols.NUM_COMMON_TOKENS));
         assertEquals("57", row.get(Cols.TOKEN_LENGTH_SUM));
         assertEquals("en", row.get(Cols.COMMON_TOKENS_LANG));
@@ -106,6 +108,7 @@ public class SimpleComparerTest extends TikaTest {
         assertEquals("76", row.get(Cols.CONTENT_LENGTH));
         assertEquals("9", row.get(Cols.NUM_UNIQUE_TOKENS));
         assertEquals("13", row.get(Cols.NUM_TOKENS));
+        assertEquals("3", row.get(Cols.NUM_UNIQUE_COMMON_TOKENS));
         assertEquals("4", row.get(Cols.NUM_COMMON_TOKENS));
         assertEquals("64", row.get(Cols.TOKEN_LENGTH_SUM));
         assertEquals("en", row.get(Cols.COMMON_TOKENS_LANG));