You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2020/07/24 21:13:09 UTC

[tika] 01/03: TIKA-3146 -- add Nutch's TextProfileSignature to tika-eval

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 0c69e21031cf2a862430e88fba00637d1d10010c
Author: tallison <ta...@apache.org>
AuthorDate: Fri Jul 24 14:56:14 2020 -0400

    TIKA-3146 -- add Nutch's TextProfileSignature to tika-eval
---
 .../tika/eval/textstats/TextProfileSignature.java  | 114 +++++++++++++++++++++
 .../apache/tika/eval/textstats/TextStatsTest.java  |  65 ++++++++++++
 2 files changed, 179 insertions(+)

diff --git a/tika-eval/src/main/java/org/apache/tika/eval/textstats/TextProfileSignature.java b/tika-eval/src/main/java/org/apache/tika/eval/textstats/TextProfileSignature.java
new file mode 100644
index 0000000..b72cc99
--- /dev/null
+++ b/tika-eval/src/main/java/org/apache/tika/eval/textstats/TextProfileSignature.java
@@ -0,0 +1,114 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval.textstats;
+
+import org.apache.commons.codec.binary.Base64;
+import org.apache.commons.codec.digest.DigestUtils;
+import org.apache.commons.lang3.mutable.MutableInt;
+import org.apache.tika.eval.tokens.TokenCounts;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * Computes a fuzzy signature of a text from its token counts. Adapted from Apache Nutch:
+ * https://github.com/apache/nutch/blob/master/src/java/org/apache/nutch/crawl/TextProfileSignature.java
+ *
+ * See documentation: https://nutch.apache.org/apidocs/apidocs-2.0/org/apache/nutch/crawl/TextProfileSignature.html
+ */
+public class TextProfileSignature implements TokenCountStatsCalculator<String> {
+
+    int minTokenLength = 2; // only tokens strictly longer than this are profiled
+    float quantRate = 0.01f; // quantization step = round(maxFreq * quantRate), raised to 1 or 2
+    boolean secondaryLexicographicSorting = true;
+    Base64 base64 = new Base64();
+
+    /**
+     * Quantizes the counts, orders tokens by decreasing quantized count and hashes the
+     * newline-joined token strings (the counts only influence the ordering).
+     *
+     * @param tokenCounts tokens and their counts for the text to profile
+     * @return Base64-encoded SHA-256 digest of the ordered token list
+     */
+    @Override
+    public String calculate(TokenCounts tokenCounts) {
+        int maxFreq = -1;
+        for (Map.Entry<String, MutableInt> e : tokenCounts.getTokens().entrySet()) {
+            if (e.getKey().length() > minTokenLength) {
+                maxFreq = Math.max(maxFreq, e.getValue().intValue());
+            }
+        }
+
+        int quant = Math.round(maxFreq * quantRate);
+        if (quant < 2) {
+            quant = maxFreq > 1 ? 2 : 1; // never below 1, so the division below is safe
+        }
+
+        List<Token> profile = new ArrayList<>();
+        for (Map.Entry<String, MutableInt> e : tokenCounts.getTokens().entrySet()) {
+            // same length filter as for maxFreq, so too-short tokens stay out of the signature
+            if (e.getKey().length() > minTokenLength) {
+                profile.add(new Token((e.getValue().intValue() / quant) * quant, e.getKey()));
+            }
+        }
+        Collections.sort(profile, new TokenComparator());
+        StringBuilder newText = new StringBuilder();
+        String separator = "";
+        for (Token t : profile) {
+            newText.append(separator).append(t.val);
+            separator = "\n";
+        }
+        return base64.encodeAsString(DigestUtils.sha256(newText.toString()));
+    }
+
+    /** @param minTokenLength tokens must be strictly longer than this to be profiled */
+    public void setMinTokenLength(int minTokenLength) {
+        this.minTokenLength = minTokenLength;
+    }
+
+    /** @param quantRate fraction of the highest token count used as the quantization step */
+    public void setQuantRate(float quantRate) {
+        this.quantRate = quantRate;
+    }
+
+    /** A token paired with its quantized count. */
+    private static class Token {
+        final int cnt;
+        final String val;
+        Token(int cnt, String val) {
+            this.cnt = cnt;
+            this.val = val;
+        }
+
+        @Override
+        public String toString() {
+            return val + " " + cnt;
+        }
+    }
+
+    /** Orders by decreasing count, then (if enabled) lexicographically by token. */
+    private class TokenComparator implements Comparator<Token> {
+        @Override
+        public int compare(Token t1, Token t2) {
+            int diffCnt = Integer.compare(t2.cnt, t1.cnt); // no overflow, unlike subtraction
+            return (diffCnt == 0 && secondaryLexicographicSorting) ? t1.val.compareTo(t2.val) : diffCnt;
+        }
+    }
+}
diff --git a/tika-eval/src/test/java/org/apache/tika/eval/textstats/TextStatsTest.java b/tika-eval/src/test/java/org/apache/tika/eval/textstats/TextStatsTest.java
new file mode 100644
index 0000000..ceb39ad
--- /dev/null
+++ b/tika-eval/src/test/java/org/apache/tika/eval/textstats/TextStatsTest.java
@@ -0,0 +1,65 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval.textstats;
+
+import org.apache.tika.eval.langid.Language;
+import org.apache.tika.eval.langid.LanguageIDWrapper;
+import org.apache.tika.eval.tokens.CommonTokenResult;
+import org.junit.Test;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+
+import static org.junit.Assert.assertEquals;
+
+public class TextStatsTest {
+
+    /** Runs four calculators through the composite and checks each reported statistic. */
+    @Test
+    public void testBasic() throws Exception {
+        String input = "the quick brown fox &&^&%@! 8675309 jumped over the lazy wombat";
+        List<TextStatsCalculator> calculators = new ArrayList<>();
+        calculators.add(new TextProfileSignature());
+        calculators.add(new ContentLengthCalculator());
+        calculators.add(new TokenEntropy());
+        calculators.add(new CommonTokens());
+        Map<Class, Object> results = new CompositeTextStatsCalculator(calculators).calculate(input);
+
+        // common-token statistics
+        CommonTokenResult commonTokens = (CommonTokenResult) results.get(CommonTokens.class);
+        assertEquals("eng", commonTokens.getLangCode());
+        assertEquals(9, commonTokens.getAlphabeticTokens());
+        assertEquals(8, commonTokens.getCommonTokens());
+        assertEquals(7, commonTokens.getUniqueCommonTokens());
+        assertEquals(8, commonTokens.getUniqueAlphabeticTokens());
+        assertEquals(0.11, commonTokens.getOOV(), 0.02);
+
+        // scalar statistics
+        assertEquals(63, (int) results.get(ContentLengthCalculator.class));
+        assertEquals(3.12, (double) results.get(TokenEntropy.class), 0.01);
+
+        // language id results are read although LanguageIDWrapper was not added explicitly above
+        List<Language> languages = (List<Language>) results.get(LanguageIDWrapper.class);
+        assertEquals("eng", languages.get(0).getLanguage());
+        assertEquals(0.01, languages.get(1).getConfidence(), 0.01);
+
+        // text profile signature
+        String signature = (String) results.get(TextProfileSignature.class);
+        assertEquals("aKhbjS6iV87VBbf/12OfDCWMBg5aS3Atktl2n4ypg14=", signature);
+    }
+}