You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2020/07/17 17:47:06 UTC

[tika] 03/06: TIKA-3140 -- initial commit

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 3388d28f0276b50ba5accb2fe9daad3cc2152d6d
Author: tallison <ta...@apache.org>
AuthorDate: Thu Jul 16 17:27:06 2020 -0400

    TIKA-3140 -- initial commit
---
 .../tika/eval/metadata/TikaEvalMetadataFilter.java | 104 +++++++++++++++++++++
 .../eval/metadata/TikaEvalMetadataFilterTest.java  |  51 ++++++++++
 2 files changed, 155 insertions(+)

diff --git a/tika-eval/src/main/java/org/apache/tika/eval/metadata/TikaEvalMetadataFilter.java b/tika-eval/src/main/java/org/apache/tika/eval/metadata/TikaEvalMetadataFilter.java
new file mode 100644
index 0000000..2c69801
--- /dev/null
+++ b/tika-eval/src/main/java/org/apache/tika/eval/metadata/TikaEvalMetadataFilter.java
@@ -0,0 +1,104 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval.metadata;
+
+import org.apache.commons.lang3.StringUtils;
+import org.apache.tika.eval.langid.Language;
+import org.apache.tika.eval.langid.LanguageIDWrapper;
+import org.apache.tika.eval.textstats.BasicTokenCountStatsCalculator;
+import org.apache.tika.eval.textstats.CommonTokens;
+import org.apache.tika.eval.textstats.CompositeTextStatsCalculator;
+import org.apache.tika.eval.textstats.TextStatsCalculator;
+import org.apache.tika.eval.tokens.CommonTokenResult;
+import org.apache.tika.eval.tokens.TokenCounts;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.metadata.filter.MetadataFilter;
+import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+
+/** Adds tika-eval text statistics, computed from the extracted content, to the metadata. */
+public class TikaEvalMetadataFilter implements MetadataFilter {
+    /** Prefix shared by every metadata key written by this filter. */
+    public static final String TIKA_EVAL_NS =
+            "tika-eval" + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER;
+
+    public static final Property NUM_TOKENS = Property.externalInteger(TIKA_EVAL_NS + "numTokens");
+    public static final Property NUM_UNIQUE_TOKENS =
+            Property.externalInteger(TIKA_EVAL_NS + "numUniqueTokens");
+    public static final Property NUM_ALPHA_TOKENS =
+            Property.externalInteger(TIKA_EVAL_NS + "numAlphaTokens");
+    public static final Property NUM_UNIQUE_ALPHA_TOKENS =
+            Property.externalInteger(TIKA_EVAL_NS + "numUniqueAlphaTokens");
+    public static final Property LANGUAGE = Property.externalText(TIKA_EVAL_NS + "lang");
+    public static final Property LANGUAGE_CONFIDENCE =
+            Property.externalReal(TIKA_EVAL_NS + "langConfidence");
+    /** Out-of-vocabulary value; set to -1 when the content has no alphabetic tokens. */
+    public static final Property OUT_OF_VOCABULARY = Property.externalReal(TIKA_EVAL_NS + "oov");
+
+    /** Calculator shared by all instances; built once when the class is initialized. */
+    static final CompositeTextStatsCalculator TEXT_STATS_CALCULATOR;
+    static {
+        List<TextStatsCalculator> calcs = new ArrayList<>();
+        calcs.add(new BasicTokenCountStatsCalculator());
+        calcs.add(new CommonTokens());
+        TEXT_STATS_CALCULATOR = new CompositeTextStatsCalculator(calcs);
+    }
+
+    /**
+     * Computes statistics for the {@link AbstractRecursiveParserWrapperHandler#TIKA_CONTENT}
+     * value of {@code metadata} and stores them in it; does nothing when that value is blank.
+     */
+    @Override
+    public void filter(Metadata metadata) throws TikaException {
+        String content = metadata.get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT);
+        // isBlank is the single-argument check; isAllBlank is its varargs variant
+        if (StringUtils.isBlank(content)) {
+            return;
+        }
+        calcStats(content, metadata);
+    }
+
+    /** Runs the shared calculator over {@code content} and stores the results. */
+    @SuppressWarnings("unchecked") // language results are fetched from an untyped map
+    private void calcStats(String content, Metadata metadata) {
+        Map<Class, Object> results = TEXT_STATS_CALCULATOR.calculate(content);
+        TokenCounts tokenCounts = (TokenCounts) results.get(BasicTokenCountStatsCalculator.class);
+        metadata.set(NUM_TOKENS, tokenCounts.getTotalTokens());
+        metadata.set(NUM_UNIQUE_TOKENS, tokenCounts.getTotalUniqueTokens());
+
+        CommonTokenResult common = (CommonTokenResult) results.get(CommonTokens.class);
+        metadata.set(NUM_ALPHA_TOKENS, common.getAlphabeticTokens());
+        metadata.set(NUM_UNIQUE_ALPHA_TOKENS, common.getUniqueAlphabeticTokens());
+        if (common.getAlphabeticTokens() > 0) {
+            metadata.set(OUT_OF_VOCABULARY, common.getOOV());
+        } else {
+            metadata.set(OUT_OF_VOCABULARY, -1.0f);
+        }
+        // no LanguageIDWrapper is registered above, so tolerate a missing result
+        List<Language> langs = (List<Language>) results.get(LanguageIDWrapper.class);
+        if (langs != null && !langs.isEmpty()) {
+            metadata.set(LANGUAGE, langs.get(0).getLanguage());
+            metadata.set(LANGUAGE_CONFIDENCE, langs.get(0).getConfidence());
+        }
+    }
+}
diff --git a/tika-eval/src/test/java/org/apache/tika/eval/metadata/TikaEvalMetadataFilterTest.java b/tika-eval/src/test/java/org/apache/tika/eval/metadata/TikaEvalMetadataFilterTest.java
new file mode 100644
index 0000000..1b3d006
--- /dev/null
+++ b/tika-eval/src/test/java/org/apache/tika/eval/metadata/TikaEvalMetadataFilterTest.java
@@ -0,0 +1,51 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval.metadata;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler;
+import org.junit.Test;
+
+import static org.junit.Assert.assertEquals;
+
+/** Exercises {@link TikaEvalMetadataFilter} on a fixed sample sentence. */
+public class TikaEvalMetadataFilterTest {
+    /** Tolerance used for the floating-point assertions. */
+    private static final double DELTA = 0.1;
+
+    /** Checks the language, token counts and ratios the filter writes for known content. */
+    @Test
+    public void testBasic() throws Exception {
+        Metadata metadata = new Metadata();
+        metadata.set(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT,
+                "the quick brown fox, Zothro 1234 1235, jumped over the lazy dog");
+        new TikaEvalMetadataFilter().filter(metadata);
+
+        assertEquals("eng", metadata.get(TikaEvalMetadataFilter.LANGUAGE));
+        assertEquals(12, (int) metadata.getInt(TikaEvalMetadataFilter.NUM_TOKENS));
+        assertEquals(11, (int) metadata.getInt(TikaEvalMetadataFilter.NUM_UNIQUE_TOKENS));
+        assertEquals(10, (int) metadata.getInt(TikaEvalMetadataFilter.NUM_ALPHA_TOKENS));
+        assertEquals(9, (int) metadata.getInt(TikaEvalMetadataFilter.NUM_UNIQUE_ALPHA_TOKENS));
+        // metadata.get returns the value as a String, so parse it before comparing
+        assertEquals(0.0999,
+                Double.parseDouble(metadata.get(TikaEvalMetadataFilter.OUT_OF_VOCABULARY)),
+                DELTA);
+        assertEquals(0.0196,
+                Double.parseDouble(metadata.get(TikaEvalMetadataFilter.LANGUAGE_CONFIDENCE)),
+                DELTA);
+    }
+}