You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2020/07/27 19:30:00 UTC
[tika] branch main updated: TIKA-3147 -- drop tokens below quant value.
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new ca4852d TIKA-3147 -- drop tokens below quant value.
ca4852d is described below
commit ca4852db326445059cdd444dcda0d5d2ac414e23
Author: tballison <ta...@apache.org>
AuthorDate: Mon Jul 27 15:29:46 2020 -0400
TIKA-3147 -- drop tokens below quant value.
---
.../java/org/apache/tika/eval/textstats/TextProfileSignature.java | 4 ++++
.../src/test/java/org/apache/tika/eval/textstats/TextStatsTest.java | 2 +-
2 files changed, 5 insertions(+), 1 deletion(-)
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/textstats/TextProfileSignature.java b/tika-eval/src/main/java/org/apache/tika/eval/textstats/TextProfileSignature.java
index a1270ca..7628e49 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/textstats/TextProfileSignature.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/textstats/TextProfileSignature.java
@@ -67,6 +67,10 @@ public class TextProfileSignature implements TokenCountStatsCalculator<String> {
for (Map.Entry<String, MutableInt> e : tokenCounts.getTokens().entrySet()) {
String token = e.getKey();
if (token.length() >= minTokenLength) {
+ int quantCnt = (e.getValue().intValue() / quant) * quant;
+ if (quantCnt < quant) {
+ continue;
+ }
profile.add(new Token((e.getValue().intValue() / quant) * quant, e.getKey()));
}
}
diff --git a/tika-eval/src/test/java/org/apache/tika/eval/textstats/TextStatsTest.java b/tika-eval/src/test/java/org/apache/tika/eval/textstats/TextStatsTest.java
index 6290150..486791b 100644
--- a/tika-eval/src/test/java/org/apache/tika/eval/textstats/TextStatsTest.java
+++ b/tika-eval/src/test/java/org/apache/tika/eval/textstats/TextStatsTest.java
@@ -65,7 +65,7 @@ public class TextStatsTest {
assertEquals(0.02, probabilities.get(1).getConfidence(), 0.01);
String textProfileSignature = (String)stats.get(TextProfileSignature.class);
- assertEquals("NCUFXDJOUJL45VIFW775OY47BQSYYBQOLJFXALMS3F3J7DFJQNPA====", textProfileSignature);
+ assertEquals("XF3W27O7IWOJVVNQ4HLKYYPCPPX3L2M72YSEMZ3WADL4VTXVITIA====", textProfileSignature);
assertEquals(new Base32().encodeAsString(
DigestUtils.sha256(txtCleaned.getBytes(StandardCharsets.UTF_8))),