You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2020/07/17 17:47:06 UTC
[tika] 03/06: TIKA-3140 -- initial commit
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git
commit 3388d28f0276b50ba5accb2fe9daad3cc2152d6d
Author: tallison <ta...@apache.org>
AuthorDate: Thu Jul 16 17:27:06 2020 -0400
TIKA-3140 -- initial commit
---
.../tika/eval/metadata/TikaEvalMetadataFilter.java | 104 +++++++++++++++++++++
.../eval/metadata/TikaEvalMetadataFilterTest.java | 51 ++++++++++
2 files changed, 155 insertions(+)
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/metadata/TikaEvalMetadataFilter.java b/tika-eval/src/main/java/org/apache/tika/eval/metadata/TikaEvalMetadataFilter.java
new file mode 100644
index 0000000..2c69801
--- /dev/null
+++ b/tika-eval/src/main/java/org/apache/tika/eval/metadata/TikaEvalMetadataFilter.java
@@ -0,0 +1,104 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval.metadata;
+
+import org.apache.commons.lang3.StringUtils;
+import org.apache.tika.eval.langid.Language;
+import org.apache.tika.eval.langid.LanguageIDWrapper;
+import org.apache.tika.eval.textstats.BasicTokenCountStatsCalculator;
+import org.apache.tika.eval.textstats.CommonTokens;
+import org.apache.tika.eval.textstats.CompositeTextStatsCalculator;
+import org.apache.tika.eval.textstats.TextStatsCalculator;
+import org.apache.tika.eval.tokens.CommonTokenResult;
+import org.apache.tika.eval.tokens.TokenCounts;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.metadata.filter.MetadataFilter;
+import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * {@link MetadataFilter} that runs the tika-eval text statistics over the
+ * extracted content stored under
+ * {@code AbstractRecursiveParserWrapperHandler.TIKA_CONTENT} and writes the
+ * results back into the same {@link Metadata} object, using keys that all
+ * start with {@link #TIKA_EVAL_NS}.
+ * <p>
+ * Metadata whose content is missing or blank is left untouched.
+ */
+public class TikaEvalMetadataFilter implements MetadataFilter {
+
+    /** Key prefix shared by every property this filter writes. */
+    public static final String TIKA_EVAL_NS =
+            "tika-eval" + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER;
+
+    /** Total token count reported by {@link TokenCounts#getTotalTokens()}. */
+    public static final Property NUM_TOKENS =
+            Property.externalInteger(TIKA_EVAL_NS + "numTokens");
+
+    /** Unique token count reported by {@link TokenCounts#getTotalUniqueTokens()}. */
+    public static final Property NUM_UNIQUE_TOKENS =
+            Property.externalInteger(TIKA_EVAL_NS + "numUniqueTokens");
+
+    /** Alphabetic token count reported by {@link CommonTokenResult#getAlphabeticTokens()}. */
+    public static final Property NUM_ALPHA_TOKENS =
+            Property.externalInteger(TIKA_EVAL_NS + "numAlphaTokens");
+
+    /** Unique alphabetic token count reported by {@link CommonTokenResult#getUniqueAlphabeticTokens()}. */
+    public static final Property NUM_UNIQUE_ALPHA_TOKENS =
+            Property.externalInteger(TIKA_EVAL_NS + "numUniqueAlphaTokens");
+
+    /** Language code of the first entry in the language-id result list. */
+    public static final Property LANGUAGE =
+            Property.externalText(TIKA_EVAL_NS + "lang");
+
+    /** Confidence of the first entry in the language-id result list. */
+    public static final Property LANGUAGE_CONFIDENCE =
+            Property.externalReal(TIKA_EVAL_NS + "langConfidence");
+
+    /**
+     * Value of {@link CommonTokenResult#getOOV()}, or {@code -1} when the
+     * content contained no alphabetic tokens.
+     */
+    public static final Property OUT_OF_VOCABULARY =
+            Property.externalReal(TIKA_EVAL_NS + "oov");
+
+    /** Calculator shared by all filter instances; built once at class-load time. */
+    static final CompositeTextStatsCalculator TEXT_STATS_CALCULATOR;
+
+    static {
+        List<TextStatsCalculator> calcs = new ArrayList<>();
+        calcs.add(new BasicTokenCountStatsCalculator());
+        calcs.add(new CommonTokens());
+        TEXT_STATS_CALCULATOR = new CompositeTextStatsCalculator(calcs);
+    }
+
+    /**
+     * Adds the tika-eval statistics to {@code metadata} when it carries
+     * non-blank extracted content; otherwise does nothing.
+     *
+     * @param metadata metadata object to read the content from and to update in place
+     * @throws TikaException declared by the {@link MetadataFilter} contract
+     */
+    @Override
+    public void filter(Metadata metadata) throws TikaException {
+        String content = metadata.get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT);
+        // isBlank covers null, empty and whitespace-only content.
+        if (StringUtils.isBlank(content)) {
+            return;
+        }
+        calcStats(content, metadata);
+    }
+
+    /**
+     * Runs {@link #TEXT_STATS_CALCULATOR} over {@code content} and copies the
+     * individual results into {@code metadata}.
+     *
+     * @param content  non-blank text to analyse
+     * @param metadata destination for the computed properties
+     */
+    private void calcStats(String content, Metadata metadata) {
+        // Results are keyed by the class of the calculator that produced them.
+        Map<Class, Object> results = TEXT_STATS_CALCULATOR.calculate(content);
+
+        //basic token counts
+        TokenCounts tokenCounts = (TokenCounts) results.get(BasicTokenCountStatsCalculator.class);
+        metadata.set(NUM_TOKENS, tokenCounts.getTotalTokens());
+        metadata.set(NUM_UNIQUE_TOKENS, tokenCounts.getTotalUniqueTokens());
+
+        //common token results
+        CommonTokenResult commonTokenResult = (CommonTokenResult) results.get(CommonTokens.class);
+        metadata.set(NUM_ALPHA_TOKENS, commonTokenResult.getAlphabeticTokens());
+        metadata.set(NUM_UNIQUE_ALPHA_TOKENS, commonTokenResult.getUniqueAlphabeticTokens());
+        if (commonTokenResult.getAlphabeticTokens() > 0) {
+            metadata.set(OUT_OF_VOCABULARY, commonTokenResult.getOOV());
+        } else {
+            // No alphabetic tokens: record -1 instead of asking for an OOV value.
+            metadata.set(OUT_OF_VOCABULARY, -1.0f);
+        }
+
+        //languages
+        // LanguageIDWrapper is not registered explicitly in the static
+        // initializer; presumably CompositeTextStatsCalculator adds its result
+        // on behalf of CommonTokens (TODO confirm). Guard against a missing
+        // entry so the filter degrades to "no language" rather than an NPE.
+        @SuppressWarnings("unchecked") // value stored under this key is expected to be List<Language>
+        List<Language> probabilities = (List<Language>) results.get(LanguageIDWrapper.class);
+        if (probabilities != null && !probabilities.isEmpty()) {
+            // Only the first entry is reported (presumably the best match; verify ordering).
+            Language best = probabilities.get(0);
+            metadata.set(LANGUAGE, best.getLanguage());
+            metadata.set(LANGUAGE_CONFIDENCE, best.getConfidence());
+        }
+    }
+
+}
diff --git a/tika-eval/src/test/java/org/apache/tika/eval/metadata/TikaEvalMetadataFilterTest.java b/tika-eval/src/test/java/org/apache/tika/eval/metadata/TikaEvalMetadataFilterTest.java
new file mode 100644
index 0000000..1b3d006
--- /dev/null
+++ b/tika-eval/src/test/java/org/apache/tika/eval/metadata/TikaEvalMetadataFilterTest.java
@@ -0,0 +1,51 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval.metadata;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler;
+import org.junit.Test;
+
+import static org.junit.Assert.assertEquals;
+
+public class TikaEvalMetadataFilterTest {
+
+ @Test
+ public void testBasic() throws Exception {
+ Metadata metadata = new Metadata();
+ String content = "the quick brown fox, Zothro 1234 1235, jumped over the lazy dog";
+ metadata.set(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT, content);
+ TikaEvalMetadataFilter filter = new TikaEvalMetadataFilter();
+ filter.filter(metadata);
+ assertEquals("eng", metadata.get(TikaEvalMetadataFilter.LANGUAGE));
+ assertEquals(12, (int)metadata.getInt(TikaEvalMetadataFilter.NUM_TOKENS));
+ assertEquals(11, (int)metadata.getInt(TikaEvalMetadataFilter.NUM_UNIQUE_TOKENS));
+ assertEquals(10, (int)metadata.getInt(TikaEvalMetadataFilter.NUM_ALPHA_TOKENS));
+ assertEquals(9, (int)metadata.getInt(TikaEvalMetadataFilter.NUM_UNIQUE_ALPHA_TOKENS));
+
+
+ assertEquals(0.0999,
+ Double.parseDouble(metadata.get(TikaEvalMetadataFilter.OUT_OF_VOCABULARY)),
+ 0.1);
+ assertEquals("eng", metadata.get(TikaEvalMetadataFilter.LANGUAGE));
+
+ assertEquals(0.0196,
+ Double.parseDouble(metadata.get(TikaEvalMetadataFilter.LANGUAGE_CONFIDENCE)),
+ 0.1);
+
+ }
+}