You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2018/12/14 21:35:14 UTC
[tika] branch master updated: TIKA-2800 -- add num unique alphabetic tokens and num unique common tokens
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/master by this push:
new c7f292b TIKA-2800 -- add num unique alphabetic tokens and num unique common tokens
c7f292b is described below
commit c7f292b5abb08096f6f4870326a16929cb326a33
Author: TALLISON <ta...@apache.org>
AuthorDate: Fri Dec 14 16:34:58 2018 -0500
TIKA-2800 -- add num unique alphabetic tokens and num unique common tokens
---
.../org/apache/tika/eval/AbstractProfiler.java | 3 +++
.../java/org/apache/tika/eval/ExtractProfiler.java | 4 +++-
.../main/java/org/apache/tika/eval/db/Cols.java | 2 ++
.../tika/eval/tokens/CommonTokenCountManager.java | 15 ++++++++----
.../apache/tika/eval/tokens/CommonTokenResult.java | 27 +++++++++++++++++++++-
.../src/main/resources/comparison-reports.xml | 4 ++--
.../org/apache/tika/eval/SimpleComparerTest.java | 3 +++
7 files changed, 49 insertions(+), 9 deletions(-)
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java b/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java
index aa999dd..307c54e 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java
@@ -389,12 +389,15 @@ public abstract class AbstractProfiler extends FileResourceConsumer {
LOG.error("{}", e.getMessage(), e);
}
data.put(Cols.COMMON_TOKENS_LANG, commonTokenResult.getLangCode());
+ data.put(Cols.NUM_UNIQUE_COMMON_TOKENS, Integer.toString(commonTokenResult.getUniqueCommonTokens()));
data.put(Cols.NUM_COMMON_TOKENS, Integer.toString(commonTokenResult.getCommonTokens()));
TokenStatistics tokenStatistics = tokenCounter.getTokenStatistics(fieldName);
data.put(Cols.NUM_UNIQUE_TOKENS,
Integer.toString(tokenStatistics.getTotalUniqueTokens()));
data.put(Cols.NUM_TOKENS,
Integer.toString(tokenStatistics.getTotalTokens()));
+ data.put(Cols.NUM_UNIQUE_ALPHABETIC_TOKENS,
+ Integer.toString(commonTokenResult.getUniqueAlphabeticTokens()));
data.put(Cols.NUM_ALPHABETIC_TOKENS,
Integer.toString(commonTokenResult.getAlphabeticTokens()));
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/ExtractProfiler.java b/tika-eval/src/main/java/org/apache/tika/eval/ExtractProfiler.java
index ccb5011..d1b1ac6 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/ExtractProfiler.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/ExtractProfiler.java
@@ -135,10 +135,12 @@ public class ExtractProfiler extends AbstractProfiler {
public static TableInfo CONTENTS_TABLE = new TableInfo("contents",
new ColInfo(Cols.ID, Types.INTEGER, "PRIMARY KEY"),
new ColInfo(Cols.CONTENT_LENGTH, Types.INTEGER),
- new ColInfo(Cols.NUM_TOKENS, Types.INTEGER),
new ColInfo(Cols.NUM_UNIQUE_TOKENS, Types.INTEGER),
+ new ColInfo(Cols.NUM_TOKENS, Types.INTEGER),
new ColInfo(Cols.COMMON_TOKENS_LANG, Types.VARCHAR, 12),
+ new ColInfo(Cols.NUM_UNIQUE_COMMON_TOKENS, Types.INTEGER),
new ColInfo(Cols.NUM_COMMON_TOKENS, Types.INTEGER),
+ new ColInfo(Cols.NUM_UNIQUE_ALPHABETIC_TOKENS, Types.INTEGER),
new ColInfo(Cols.NUM_ALPHABETIC_TOKENS, Types.INTEGER),
new ColInfo(Cols.TOP_N_TOKENS, Types.VARCHAR, 1024),
new ColInfo(Cols.LANG_ID_1, Types.VARCHAR, 12),
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/db/Cols.java b/tika-eval/src/main/java/org/apache/tika/eval/db/Cols.java
index 3fa8cb5..db8c1d0 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/db/Cols.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/db/Cols.java
@@ -45,8 +45,10 @@ public enum Cols {
CONTENT_LENGTH,
NUM_UNIQUE_TOKENS,
NUM_TOKENS,
+ NUM_UNIQUE_ALPHABETIC_TOKENS,
NUM_ALPHABETIC_TOKENS, //alphabetic or ideographic tokens
COMMON_TOKENS_LANG, //which language was used for the common tokens metric?
+ NUM_UNIQUE_COMMON_TOKENS,
NUM_COMMON_TOKENS,
TOP_N_TOKENS,
LANG_ID_1,
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/tokens/CommonTokenCountManager.java b/tika-eval/src/main/java/org/apache/tika/eval/tokens/CommonTokenCountManager.java
index 1fc2ca3..5546540 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/tokens/CommonTokenCountManager.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/tokens/CommonTokenCountManager.java
@@ -65,21 +65,26 @@ public class CommonTokenCountManager {
public CommonTokenResult countTokenOverlaps(String langCode,
Map<String, MutableInt> tokens) throws IOException {
String actualLangCode = getActualLangCode(langCode);
- int overlap = 0;
- int alphabeticTokens = 0;
+ int numUniqueCommonTokens = 0;
+ int numCommonTokens = 0;
+ int numUniqueAlphabeticTokens = 0;
+ int numAlphabeticTokens = 0;
Set<String> commonTokens = commonTokenMap.get(actualLangCode);
for (Map.Entry<String, MutableInt> e : tokens.entrySet()) {
String token = e.getKey();
int count = e.getValue().intValue();
if (AlphaIdeographFilterFactory.isAlphabetic(token.toCharArray())) {
- alphabeticTokens += count;
+ numAlphabeticTokens += count;
+ numUniqueAlphabeticTokens++;
}
if (commonTokens.contains(token)) {
- overlap += count;
+ numCommonTokens += count;
+ numUniqueCommonTokens++;
}
}
- return new CommonTokenResult(actualLangCode, overlap, alphabeticTokens);
+ return new CommonTokenResult(actualLangCode, numUniqueCommonTokens,
+ numCommonTokens, numUniqueAlphabeticTokens, numAlphabeticTokens);
}
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/tokens/CommonTokenResult.java b/tika-eval/src/main/java/org/apache/tika/eval/tokens/CommonTokenResult.java
index 317697a..f146153 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/tokens/CommonTokenResult.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/tokens/CommonTokenResult.java
@@ -20,12 +20,17 @@ package org.apache.tika.eval.tokens;
public class CommonTokenResult {
private final String langCode;
+ private final int uniqueCommonTokens;//types
private final int commonTokens;
+ private final int uniqueAlphabeticTokens;
private final int alphabeticTokens;
- public CommonTokenResult(String langCode, int commonTokens, int alphabeticTokens) {
+ public CommonTokenResult(String langCode, int uniqueCommonTokens, int commonTokens,
+ int uniqueAlphabeticTokens, int alphabeticTokens) {
this.langCode = langCode;
+ this.uniqueCommonTokens = uniqueCommonTokens;
this.commonTokens = commonTokens;
+ this.uniqueAlphabeticTokens = uniqueAlphabeticTokens;
this.alphabeticTokens = alphabeticTokens;
}
@@ -33,12 +38,32 @@ public class CommonTokenResult {
return langCode;
}
+ /**
+ *
+ * @return total number of "common tokens"
+ */
public int getCommonTokens() {
return commonTokens;
}
/**
*
+ * @return number of unique "common tokens" (types)
+ */
+ public int getUniqueCommonTokens() {
+ return uniqueCommonTokens;
+ }
+
+ /**
+ *
+ * @return number of unique alphabetic tokens (types)
+ */
+ public int getUniqueAlphabeticTokens() {
+ return uniqueAlphabeticTokens;
+ }
+
+ /**
+ *
* @return number of tokens that had at least one alphabetic/ideographic character
* whether or not a common token
*/
diff --git a/tika-eval/src/main/resources/comparison-reports.xml b/tika-eval/src/main/resources/comparison-reports.xml
index 10fd9e3..eaf3bb6 100644
--- a/tika-eval/src/main/resources/comparison-reports.xml
+++ b/tika-eval/src/main/resources/comparison-reports.xml
@@ -765,8 +765,8 @@
limit 100000
</sql>
</report>
- <report reportName="contentDiffsIgnoreExceptions"
- reportFilename="content/content_diffs_ignore_exceptions.xlsx"
+ <report reportName="contentDiffsNoExceptions"
+ reportFilename="content/content_diffs_no_exceptions.xlsx"
format="xlsx"
includeSql="true">
diff --git a/tika-eval/src/test/java/org/apache/tika/eval/SimpleComparerTest.java b/tika-eval/src/test/java/org/apache/tika/eval/SimpleComparerTest.java
index 78a8ca5..d54d41c 100644
--- a/tika-eval/src/test/java/org/apache/tika/eval/SimpleComparerTest.java
+++ b/tika-eval/src/test/java/org/apache/tika/eval/SimpleComparerTest.java
@@ -96,7 +96,9 @@ public class SimpleComparerTest extends TikaTest {
assertEquals("70", row.get(Cols.CONTENT_LENGTH));
assertEquals("10", row.get(Cols.NUM_UNIQUE_TOKENS));
assertEquals("14", row.get(Cols.NUM_TOKENS));
+ assertEquals("8", row.get(Cols.NUM_UNIQUE_ALPHABETIC_TOKENS));
assertEquals("12", row.get(Cols.NUM_ALPHABETIC_TOKENS));
+ assertEquals("3", row.get(Cols.NUM_UNIQUE_COMMON_TOKENS));
assertEquals("6", row.get(Cols.NUM_COMMON_TOKENS));
assertEquals("57", row.get(Cols.TOKEN_LENGTH_SUM));
assertEquals("en", row.get(Cols.COMMON_TOKENS_LANG));
@@ -106,6 +108,7 @@ public class SimpleComparerTest extends TikaTest {
assertEquals("76", row.get(Cols.CONTENT_LENGTH));
assertEquals("9", row.get(Cols.NUM_UNIQUE_TOKENS));
assertEquals("13", row.get(Cols.NUM_TOKENS));
+ assertEquals("3", row.get(Cols.NUM_UNIQUE_COMMON_TOKENS));
assertEquals("4", row.get(Cols.NUM_COMMON_TOKENS));
assertEquals("64", row.get(Cols.TOKEN_LENGTH_SUM));
assertEquals("en", row.get(Cols.COMMON_TOKENS_LANG));