You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2020/07/24 21:13:09 UTC
[tika] 01/03: TIKA-3146 -- add Nutch's TextProfileSignature to tika-eval
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
commit 0c69e21031cf2a862430e88fba00637d1d10010c
Author: tallison <ta...@apache.org>
AuthorDate: Fri Jul 24 14:56:14 2020 -0400
TIKA-3146 -- add Nutch's TextProfileSignature to tika-eval
---
.../tika/eval/textstats/TextProfileSignature.java | 114 +++++++++++++++++++++
.../apache/tika/eval/textstats/TextStatsTest.java | 65 ++++++++++++
2 files changed, 179 insertions(+)
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/textstats/TextProfileSignature.java b/tika-eval/src/main/java/org/apache/tika/eval/textstats/TextProfileSignature.java
new file mode 100644
index 0000000..b72cc99
--- /dev/null
+++ b/tika-eval/src/main/java/org/apache/tika/eval/textstats/TextProfileSignature.java
@@ -0,0 +1,114 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval.textstats;
+
+import org.apache.commons.codec.binary.Base64;
+import org.apache.commons.codec.digest.DigestUtils;
+import org.apache.commons.lang3.mutable.MutableInt;
+import org.apache.tika.eval.tokens.TokenCounts;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * Copied nearly directly from Apache Nutch:
+ * https://github.com/apache/nutch/blob/master/src/java/org/apache/nutch/crawl/TextProfileSignature.java
+ *
+ * See documentation: https://nutch.apache.org/apidocs/apidocs-2.0/org/apache/nutch/crawl/TextProfileSignature.html
+ */
+public class TextProfileSignature implements TokenCountStatsCalculator<String> {
+
+ int minTokenLength = 2;
+ float quantRate = 0.01f;
+ boolean secondaryLexicographicSorting = true;
+
+ Base64 base64 = new Base64();
+
+ @Override
+ public String calculate(TokenCounts tokenCounts) {
+ int maxFreq = -1;
+ for (Map.Entry<String, MutableInt> e : tokenCounts.getTokens().entrySet()){
+ if (e.getKey().length() > minTokenLength) {
+ if (e.getValue().intValue() > maxFreq) {
+ maxFreq = e.getValue().intValue();
+ }
+ }
+ }
+
+ int quant = Math.round(maxFreq * quantRate);
+ if (quant < 2) {
+ if (maxFreq > 1) {
+ quant = 2;
+ } else {
+ quant = 1;
+ }
+ }
+
+ List<Token> profile = new ArrayList<>();
+ for (Map.Entry<String, MutableInt> e : tokenCounts.getTokens().entrySet()){
+ profile.add(new Token((e.getValue().intValue()/quant)*quant, e.getKey()));
+ }
+ Collections.sort(profile, new TokenComparator());
+ StringBuffer newText = new StringBuffer();
+ int i = 0;
+ for (Token t : profile) {
+ if (i++ > 0) {
+ newText.append("\n");
+ }
+ newText.append(t.val);
+ }
+ return base64.encodeAsString(DigestUtils.sha256(newText.toString()));
+ }
+
+ public void setMinTokenLength(int minTokenLength) {
+ this.minTokenLength = minTokenLength;
+ }
+
+ public void setQuantRate(float quantRate) {
+ this.quantRate = quantRate;
+ }
+ private static class Token {
+ public int cnt;
+ public String val;
+
+ public Token(int cnt, String val) {
+ this.cnt = cnt;
+ this.val = val;
+ }
+
+ public String toString() {
+ return val + " " + cnt;
+ }
+ }
+
+ private class TokenComparator implements Comparator<Token> {
+ /**
+ * Sort tokens first by decreasing frequency and second in lexicographic
+ * (Unicode) order
+ */
+ public int compare(Token t1, Token t2) {
+ int diffCnt = t2.cnt - t1.cnt;
+ if (diffCnt == 0 && secondaryLexicographicSorting) {
+ return t1.val.compareTo(t2.val);
+ }
+ return diffCnt;
+ }
+ }
+}
diff --git a/tika-eval/src/test/java/org/apache/tika/eval/textstats/TextStatsTest.java b/tika-eval/src/test/java/org/apache/tika/eval/textstats/TextStatsTest.java
new file mode 100644
index 0000000..ceb39ad
--- /dev/null
+++ b/tika-eval/src/test/java/org/apache/tika/eval/textstats/TextStatsTest.java
@@ -0,0 +1,65 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval.textstats;
+
+import org.apache.tika.eval.langid.Language;
+import org.apache.tika.eval.langid.LanguageIDWrapper;
+import org.apache.tika.eval.tokens.CommonTokenResult;
+import org.junit.Test;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+
+import static org.junit.Assert.assertEquals;
+
+public class TextStatsTest {
+
+ @Test
+ public void testBasic() throws Exception {
+ String txt = "the quick brown fox &&^&%@! 8675309 jumped over the lazy wombat";
+ List<TextStatsCalculator> calcs = new ArrayList<>();
+ calcs.add(new TextProfileSignature());
+ calcs.add(new ContentLengthCalculator());
+ calcs.add(new TokenEntropy());
+ calcs.add(new CommonTokens());
+ CompositeTextStatsCalculator calc = new CompositeTextStatsCalculator(calcs);
+
+ Map<Class, Object> stats = calc.calculate(txt);
+
+
+ CommonTokenResult ctr = (CommonTokenResult)stats.get(CommonTokens.class);
+ assertEquals("eng", ctr.getLangCode());
+ assertEquals( 9, ctr.getAlphabeticTokens());
+ assertEquals( 8, ctr.getCommonTokens());
+ assertEquals( 7, ctr.getUniqueCommonTokens());
+ assertEquals( 8, ctr.getUniqueAlphabeticTokens());
+ assertEquals( 0.11, ctr.getOOV(), 0.02);
+
+
+ assertEquals(63, (int)stats.get(ContentLengthCalculator.class));
+
+ assertEquals(3.12, (double)stats.get(TokenEntropy.class), 0.01);
+
+ List<Language> probabilities = (List<Language>) stats.get(LanguageIDWrapper.class);
+ assertEquals("eng", probabilities.get(0).getLanguage());
+ assertEquals(0.01, probabilities.get(1).getConfidence(), 0.01);
+
+ String textProfileSignature = (String)stats.get(TextProfileSignature.class);
+ assertEquals("aKhbjS6iV87VBbf/12OfDCWMBg5aS3Atktl2n4ypg14=", textProfileSignature);
+ }
+}