You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2020/07/27 19:30:00 UTC

[tika] branch main updated: TIKA-3147 -- drop tokens below quant value.

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new ca4852d  TIKA-3147 -- drop tokens below quant value.
ca4852d is described below

commit ca4852db326445059cdd444dcda0d5d2ac414e23
Author: tballison <ta...@apache.org>
AuthorDate: Mon Jul 27 15:29:46 2020 -0400

    TIKA-3147 -- drop tokens below quant value.
---
 .../java/org/apache/tika/eval/textstats/TextProfileSignature.java     | 4 ++++
 .../src/test/java/org/apache/tika/eval/textstats/TextStatsTest.java   | 2 +-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/tika-eval/src/main/java/org/apache/tika/eval/textstats/TextProfileSignature.java b/tika-eval/src/main/java/org/apache/tika/eval/textstats/TextProfileSignature.java
index a1270ca..7628e49 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/textstats/TextProfileSignature.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/textstats/TextProfileSignature.java
@@ -67,6 +67,10 @@ public class TextProfileSignature implements TokenCountStatsCalculator<String> {
         for (Map.Entry<String, MutableInt> e : tokenCounts.getTokens().entrySet()) {
             String token = e.getKey();
             if (token.length() >= minTokenLength) {
+                int quantCnt = (e.getValue().intValue() / quant) * quant;
+                if (quantCnt < quant) {
+                    continue;
+                }
                 profile.add(new Token((e.getValue().intValue() / quant) * quant, e.getKey()));
             }
         }
diff --git a/tika-eval/src/test/java/org/apache/tika/eval/textstats/TextStatsTest.java b/tika-eval/src/test/java/org/apache/tika/eval/textstats/TextStatsTest.java
index 6290150..486791b 100644
--- a/tika-eval/src/test/java/org/apache/tika/eval/textstats/TextStatsTest.java
+++ b/tika-eval/src/test/java/org/apache/tika/eval/textstats/TextStatsTest.java
@@ -65,7 +65,7 @@ public class TextStatsTest {
         assertEquals(0.02, probabilities.get(1).getConfidence(), 0.01);
 
         String textProfileSignature = (String)stats.get(TextProfileSignature.class);
-        assertEquals("NCUFXDJOUJL45VIFW775OY47BQSYYBQOLJFXALMS3F3J7DFJQNPA====", textProfileSignature);
+        assertEquals("XF3W27O7IWOJVVNQ4HLKYYPCPPX3L2M72YSEMZ3WADL4VTXVITIA====", textProfileSignature);
 
         assertEquals(new Base32().encodeAsString(
                 DigestUtils.sha256(txtCleaned.getBytes(StandardCharsets.UTF_8))),