You are viewing a plain-text version of this content; the link to the canonical version was not preserved in this copy.
Posted to commits@tika.apache.org by ta...@apache.org on 2024/02/08 20:05:07 UTC

(tika) branch main updated: TIKA-4193 -- add num common tokens to TikaEvalMetadataFilter (#1582)

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new 16e1bc9c8 TIKA-4193 -- add num common tokens to TikaEvalMetadataFilter (#1582)
16e1bc9c8 is described below

commit 16e1bc9c8e4f5e253fc519a477da92410730d060
Author: Tim Allison <ta...@apache.org>
AuthorDate: Thu Feb 8 15:05:02 2024 -0500

    TIKA-4193 -- add num common tokens to TikaEvalMetadataFilter (#1582)
---
 .../org/apache/tika/eval/core/metadata/TikaEvalMetadataFilter.java    | 4 ++++
 .../apache/tika/eval/core/metadata/TikaEvalMetadataFilterTest.java    | 1 +
 2 files changed, 5 insertions(+)

diff --git a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/metadata/TikaEvalMetadataFilter.java b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/metadata/TikaEvalMetadataFilter.java
index 0ac65d240..811958af4 100644
--- a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/metadata/TikaEvalMetadataFilter.java
+++ b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/metadata/TikaEvalMetadataFilter.java
@@ -48,6 +48,9 @@ public class TikaEvalMetadataFilter extends MetadataFilter {
     public static Property NUM_ALPHA_TOKENS =
             Property.externalInteger(TIKA_EVAL_NS + "numAlphaTokens");
 
+    public static Property NUM_COMMON_TOKENS =
+            Property.externalInteger(TIKA_EVAL_NS + "numCommonTokens");
+
     public static Property NUM_UNIQUE_ALPHA_TOKENS =
             Property.externalInteger(TIKA_EVAL_NS + "numUniqueAlphaTokens");
 
@@ -90,6 +93,7 @@ public class TikaEvalMetadataFilter extends MetadataFilter {
         CommonTokenResult commonTokenResult = (CommonTokenResult) results.get(CommonTokens.class);
         metadata.set(NUM_ALPHA_TOKENS, commonTokenResult.getAlphabeticTokens());
         metadata.set(NUM_UNIQUE_ALPHA_TOKENS, commonTokenResult.getUniqueAlphabeticTokens());
+        metadata.set(NUM_COMMON_TOKENS, commonTokenResult.getCommonTokens());
         if (commonTokenResult.getAlphabeticTokens() > 0) {
             metadata.set(OUT_OF_VOCABULARY, commonTokenResult.getOOV());
         } else {
diff --git a/tika-eval/tika-eval-core/src/test/java/org/apache/tika/eval/core/metadata/TikaEvalMetadataFilterTest.java b/tika-eval/tika-eval-core/src/test/java/org/apache/tika/eval/core/metadata/TikaEvalMetadataFilterTest.java
index 1961698b4..f1fd21c21 100644
--- a/tika-eval/tika-eval-core/src/test/java/org/apache/tika/eval/core/metadata/TikaEvalMetadataFilterTest.java
+++ b/tika-eval/tika-eval-core/src/test/java/org/apache/tika/eval/core/metadata/TikaEvalMetadataFilterTest.java
@@ -42,6 +42,7 @@ public class TikaEvalMetadataFilterTest {
             assertEquals(11, (int) metadata.getInt(TikaEvalMetadataFilter.NUM_UNIQUE_TOKENS));
             assertEquals(10, (int) metadata.getInt(TikaEvalMetadataFilter.NUM_ALPHA_TOKENS));
             assertEquals(9, (int) metadata.getInt(TikaEvalMetadataFilter.NUM_UNIQUE_ALPHA_TOKENS));
+            assertEquals(9, (int) metadata.getInt(TikaEvalMetadataFilter.NUM_COMMON_TOKENS));
 
 
             assertEquals(0.0999,