You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2020/07/24 21:13:08 UTC

[tika] branch main updated (839d318 -> ed0c91f)

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a change to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git.


    from 839d318  Improve unit test to ensure that the CompressorParser is not called
     new 0c69e21  TIKA-3146 -- add Nutch's TextProfileSignature to tika-eval
     new 6d58ea5  TIKA-3145 -- add TextSha256Signature
     new ed0c91f  TIKA-3146 -- clean up text profile signature and add unit test for cjk

The 3 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 ...tatsCalculator.java => BytesRefCalculator.java} |  12 +-
 .../textstats/CompositeTextStatsCalculator.java    |  74 ++++++++++--
 .../tika/eval/textstats/TextProfileSignature.java  | 126 +++++++++++++++++++++
 .../tika/eval/textstats/TextSha256Signature.java   |  54 +++++++++
 .../apache/tika/eval/textstats/TextStatsTest.java  | 105 +++++++++++++++++
 5 files changed, 357 insertions(+), 14 deletions(-)
 copy tika-eval/src/main/java/org/apache/tika/eval/textstats/{StringStatsCalculator.java => BytesRefCalculator.java} (77%)
 create mode 100644 tika-eval/src/main/java/org/apache/tika/eval/textstats/TextProfileSignature.java
 create mode 100644 tika-eval/src/main/java/org/apache/tika/eval/textstats/TextSha256Signature.java
 create mode 100644 tika-eval/src/test/java/org/apache/tika/eval/textstats/TextStatsTest.java


[tika] 03/03: TIKA-3146 -- clean up text profile signature and add unit test for cjk

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git

commit ed0c91fddbcce94af47cb8bedf77d3eec74b3fa0
Author: tallison <ta...@apache.org>
AuthorDate: Fri Jul 24 17:12:37 2020 -0400

    TIKA-3146 -- clean up text profile signature and add unit test for cjk
---
 .../tika/eval/textstats/TextProfileSignature.java  | 16 ++++++++---
 .../apache/tika/eval/textstats/TextStatsTest.java  | 31 ++++++++++++++++++++++
 2 files changed, 44 insertions(+), 3 deletions(-)

diff --git a/tika-eval/src/main/java/org/apache/tika/eval/textstats/TextProfileSignature.java b/tika-eval/src/main/java/org/apache/tika/eval/textstats/TextProfileSignature.java
index bc65351..a1270ca 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/textstats/TextProfileSignature.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/textstats/TextProfileSignature.java
@@ -47,7 +47,7 @@ public class TextProfileSignature implements TokenCountStatsCalculator<String> {
     public String calculate(TokenCounts tokenCounts) {
         int maxFreq = -1;
         for (Map.Entry<String, MutableInt> e : tokenCounts.getTokens().entrySet()){
-            if (e.getKey().length() > minTokenLength) {
+            if (e.getKey().length() >= minTokenLength) {
                 if (e.getValue().intValue() > maxFreq) {
                     maxFreq = e.getValue().intValue();
                 }
@@ -64,8 +64,11 @@ public class TextProfileSignature implements TokenCountStatsCalculator<String> {
         }
 
         List<Token> profile = new ArrayList<>();
-        for (Map.Entry<String, MutableInt> e : tokenCounts.getTokens().entrySet()){
-            profile.add(new Token((e.getValue().intValue()/quant)*quant, e.getKey()));
+        for (Map.Entry<String, MutableInt> e : tokenCounts.getTokens().entrySet()) {
+            String token = e.getKey();
+            if (token.length() >= minTokenLength) {
+                profile.add(new Token((e.getValue().intValue() / quant) * quant, e.getKey()));
+            }
         }
         Collections.sort(profile, new TokenComparator());
         StringBuffer newText = new StringBuffer();
@@ -79,6 +82,13 @@ public class TextProfileSignature implements TokenCountStatsCalculator<String> {
         return base32.encodeAsString(DigestUtils.sha256(newText.toString()));
     }
 
+    /**
+     * Be careful -- for CJK languages, the default analyzer uses character
+     * bigrams.  You will "ignore" all cjk language tokens if you set
+     * minTokenLength > 2!
+     *
+     * @param minTokenLength -- include tokens of this length or greater.
+     */
     public void setMinTokenLength(int minTokenLength) {
         this.minTokenLength = minTokenLength;
     }
diff --git a/tika-eval/src/test/java/org/apache/tika/eval/textstats/TextStatsTest.java b/tika-eval/src/test/java/org/apache/tika/eval/textstats/TextStatsTest.java
index 8206977..a2252b6 100644
--- a/tika-eval/src/test/java/org/apache/tika/eval/textstats/TextStatsTest.java
+++ b/tika-eval/src/test/java/org/apache/tika/eval/textstats/TextStatsTest.java
@@ -71,4 +71,35 @@ public class TextStatsTest {
                 DigestUtils.sha256(txtCleaned.getBytes(StandardCharsets.UTF_8))),
                 stats.get(TextSha256Signature.class));
     }
+
+    @Test
+    public void testCJK() throws Exception {
+        String txt = "普林斯顿大学";
+        List<TextStatsCalculator> calcs = new ArrayList<>();
+        calcs.add(new TextProfileSignature());
+        CompositeTextStatsCalculator calc = new CompositeTextStatsCalculator(calcs);
+
+        Map<Class, Object> stats = calc.calculate(txt);
+        String textProfileSignature = (String)stats.get(TextProfileSignature.class);
+        assertEquals("XKXLY6FNIGK2KGEF6HOSKSVGYDLLOFIAGO73RLMJ22PZVXBTXFFA====", textProfileSignature);
+
+        //now test that if a user accidentally sets minTokenLength > 2
+        //the output will be the same as for empty text
+        calcs.clear();
+        calcs.add(new TextProfileSignature());
+        calc = new CompositeTextStatsCalculator(calcs);
+
+        stats = calc.calculate("");
+        String emptyStringSignature = (String)stats.get(TextProfileSignature.class);
+
+        calcs.clear();
+        TextProfileSignature tPs = new TextProfileSignature();
+        tPs.setMinTokenLength(3);
+        calcs.add(tPs);
+        calc = new CompositeTextStatsCalculator(calcs);
+
+        stats = calc.calculate(txt);
+        assertEquals(emptyStringSignature, (String)stats.get(TextProfileSignature.class));
+
+    }
 }


[tika] 01/03: TIKA-3146 -- add Nutch's TextProfileSignature to tika-eval

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 0c69e21031cf2a862430e88fba00637d1d10010c
Author: tallison <ta...@apache.org>
AuthorDate: Fri Jul 24 14:56:14 2020 -0400

    TIKA-3146 -- add Nutch's TextProfileSignature to tika-eval
---
 .../tika/eval/textstats/TextProfileSignature.java  | 114 +++++++++++++++++++++
 .../apache/tika/eval/textstats/TextStatsTest.java  |  65 ++++++++++++
 2 files changed, 179 insertions(+)

diff --git a/tika-eval/src/main/java/org/apache/tika/eval/textstats/TextProfileSignature.java b/tika-eval/src/main/java/org/apache/tika/eval/textstats/TextProfileSignature.java
new file mode 100644
index 0000000..b72cc99
--- /dev/null
+++ b/tika-eval/src/main/java/org/apache/tika/eval/textstats/TextProfileSignature.java
@@ -0,0 +1,114 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval.textstats;
+
+import org.apache.commons.codec.binary.Base64;
+import org.apache.commons.codec.digest.DigestUtils;
+import org.apache.commons.lang3.mutable.MutableInt;
+import org.apache.tika.eval.tokens.TokenCounts;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * Copied nearly directly from Apache Nutch:
+ * https://github.com/apache/nutch/blob/master/src/java/org/apache/nutch/crawl/TextProfileSignature.java
+ *
+ * See documentation: https://nutch.apache.org/apidocs/apidocs-2.0/org/apache/nutch/crawl/TextProfileSignature.html
+ */
+public class TextProfileSignature implements TokenCountStatsCalculator<String> {
+
+    int minTokenLength = 2;
+    float quantRate = 0.01f;
+    boolean secondaryLexicographicSorting = true;
+
+    Base64 base64 = new Base64();
+
+    @Override
+    public String calculate(TokenCounts tokenCounts) {
+        int maxFreq = -1;
+        for (Map.Entry<String, MutableInt> e : tokenCounts.getTokens().entrySet()){
+            if (e.getKey().length() > minTokenLength) {
+                if (e.getValue().intValue() > maxFreq) {
+                    maxFreq = e.getValue().intValue();
+                }
+            }
+        }
+
+        int quant = Math.round(maxFreq * quantRate);
+        if (quant < 2) {
+            if (maxFreq > 1) {
+                quant = 2;
+            } else {
+                quant = 1;
+            }
+        }
+
+        List<Token> profile = new ArrayList<>();
+        for (Map.Entry<String, MutableInt> e : tokenCounts.getTokens().entrySet()){
+            profile.add(new Token((e.getValue().intValue()/quant)*quant, e.getKey()));
+        }
+        Collections.sort(profile, new TokenComparator());
+        StringBuffer newText = new StringBuffer();
+        int i = 0;
+        for (Token t : profile) {
+            if (i++ > 0) {
+                newText.append("\n");
+            }
+            newText.append(t.val);
+        }
+        return base64.encodeAsString(DigestUtils.sha256(newText.toString()));
+    }
+
+    public void setMinTokenLength(int minTokenLength) {
+        this.minTokenLength = minTokenLength;
+    }
+
+    public void setQuantRate(float quantRate) {
+        this.quantRate = quantRate;
+    }
+    private static class Token {
+        public int cnt;
+        public String val;
+
+        public Token(int cnt, String val) {
+            this.cnt = cnt;
+            this.val = val;
+        }
+
+        public String toString() {
+            return val + " " + cnt;
+        }
+    }
+
+    private class TokenComparator implements Comparator<Token> {
+        /**
+         * Sort tokens first by decreasing frequency and second in lexicographic
+         * (Unicode) order
+         */
+        public int compare(Token t1, Token t2) {
+            int diffCnt = t2.cnt - t1.cnt;
+            if (diffCnt == 0 && secondaryLexicographicSorting) {
+                return t1.val.compareTo(t2.val);
+            }
+            return diffCnt;
+        }
+    }
+}
diff --git a/tika-eval/src/test/java/org/apache/tika/eval/textstats/TextStatsTest.java b/tika-eval/src/test/java/org/apache/tika/eval/textstats/TextStatsTest.java
new file mode 100644
index 0000000..ceb39ad
--- /dev/null
+++ b/tika-eval/src/test/java/org/apache/tika/eval/textstats/TextStatsTest.java
@@ -0,0 +1,65 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval.textstats;
+
+import org.apache.tika.eval.langid.Language;
+import org.apache.tika.eval.langid.LanguageIDWrapper;
+import org.apache.tika.eval.tokens.CommonTokenResult;
+import org.junit.Test;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+
+import static org.junit.Assert.assertEquals;
+
+public class TextStatsTest {
+
+    @Test
+    public void testBasic() throws Exception {
+        String txt = "the quick brown fox &&^&%@! 8675309 jumped over the lazy wombat";
+        List<TextStatsCalculator> calcs = new ArrayList<>();
+        calcs.add(new TextProfileSignature());
+        calcs.add(new ContentLengthCalculator());
+        calcs.add(new TokenEntropy());
+        calcs.add(new CommonTokens());
+        CompositeTextStatsCalculator calc = new CompositeTextStatsCalculator(calcs);
+
+        Map<Class, Object> stats = calc.calculate(txt);
+
+
+        CommonTokenResult ctr = (CommonTokenResult)stats.get(CommonTokens.class);
+        assertEquals("eng", ctr.getLangCode());
+        assertEquals( 9, ctr.getAlphabeticTokens());
+        assertEquals( 8, ctr.getCommonTokens());
+        assertEquals( 7, ctr.getUniqueCommonTokens());
+        assertEquals( 8, ctr.getUniqueAlphabeticTokens());
+        assertEquals( 0.11, ctr.getOOV(), 0.02);
+
+
+        assertEquals(63, (int)stats.get(ContentLengthCalculator.class));
+
+        assertEquals(3.12, (double)stats.get(TokenEntropy.class), 0.01);
+
+        List<Language> probabilities = (List<Language>) stats.get(LanguageIDWrapper.class);
+        assertEquals("eng", probabilities.get(0).getLanguage());
+        assertEquals(0.01, probabilities.get(1).getConfidence(), 0.01);
+
+        String textProfileSignature = (String)stats.get(TextProfileSignature.class);
+        assertEquals("aKhbjS6iV87VBbf/12OfDCWMBg5aS3Atktl2n4ypg14=", textProfileSignature);
+    }
+}


[tika] 02/03: TIKA-3145 -- add TextSha256Signature

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 6d58ea584cf327cc28af1e3fc338094cc70416d2
Author: tallison <ta...@apache.org>
AuthorDate: Fri Jul 24 16:59:04 2020 -0400

    TIKA-3145 -- add TextSha256Signature
---
 .../tika/eval/textstats/BytesRefCalculator.java    | 33 ++++++++++
 .../textstats/CompositeTextStatsCalculator.java    | 74 ++++++++++++++++++----
 .../tika/eval/textstats/TextProfileSignature.java  |  8 ++-
 .../tika/eval/textstats/TextSha256Signature.java   | 54 ++++++++++++++++
 .../apache/tika/eval/textstats/TextStatsTest.java  | 13 +++-
 5 files changed, 165 insertions(+), 17 deletions(-)

diff --git a/tika-eval/src/main/java/org/apache/tika/eval/textstats/BytesRefCalculator.java b/tika-eval/src/main/java/org/apache/tika/eval/textstats/BytesRefCalculator.java
new file mode 100644
index 0000000..048b798
--- /dev/null
+++ b/tika-eval/src/main/java/org/apache/tika/eval/textstats/BytesRefCalculator.java
@@ -0,0 +1,33 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval.textstats;
+
+/**
+ * Interface for calculators that are fed the bytes of each token
+ * @param <T>
+ */
+public interface BytesRefCalculator<T> extends TextStatsCalculator {
+
+    public BytesRefCalcInstance<T> getInstance();
+
+    interface BytesRefCalcInstance<T> {
+        void update(byte[] bytes, int start, int len);
+        T finish();
+        Class getOuterClass();
+    }
+
+}
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/textstats/CompositeTextStatsCalculator.java b/tika-eval/src/main/java/org/apache/tika/eval/textstats/CompositeTextStatsCalculator.java
index 2c7c673..a16c767 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/textstats/CompositeTextStatsCalculator.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/textstats/CompositeTextStatsCalculator.java
@@ -17,29 +17,38 @@
 package org.apache.tika.eval.textstats;
 
 import java.io.IOException;
+import java.security.MessageDigest;
 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
 
+import org.apache.commons.codec.binary.Base64;
+import org.apache.commons.codec.digest.DigestUtils;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.BytesTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
+import org.apache.lucene.util.BytesRef;
 import org.apache.tika.eval.langid.Language;
 import org.apache.tika.eval.langid.LanguageIDWrapper;
 import org.apache.tika.eval.tokens.AnalyzerManager;
 import org.apache.tika.eval.tokens.TokenCounts;
+import org.apache.tika.metadata.Message;
 
 
 public class CompositeTextStatsCalculator {
 
     private static final String FIELD = "f";
     private static final int DEFAULT_MAX_TOKENS = 10_000_000;
+    private final byte[] whitespace = new byte[]{' '};
     private final Analyzer analyzer;
     private final LanguageIDWrapper languageIDWrapper;
     private final List<LanguageAwareTokenCountStats> languageAwareTokenCountStats = new ArrayList<>();
     private final List<TokenCountStatsCalculator> tokenCountStatCalculators = new ArrayList<>();
     private final List<StringStatsCalculator> stringStatCalculators = new ArrayList<>();
+    private final List<BytesRefCalculator> bytesRefCalculators = new ArrayList<>();
 
     public CompositeTextStatsCalculator(List<TextStatsCalculator> calculators) {
         this(calculators,
@@ -68,6 +77,14 @@ public class CompositeTextStatsCalculator {
                                     "a TokenCountStats: "+t.getClass()
                     );
                 }
+            } else if (t instanceof BytesRefCalculator) {
+                bytesRefCalculators.add((BytesRefCalculator)t);
+                if (analyzer == null) {
+                    throw new IllegalArgumentException(
+                            "Analyzer must not be null if you are using "+
+                                    "a BytesRefCalculator: "+t.getClass()
+                    );
+                }
             } else {
                 throw new IllegalArgumentException(
                         "I regret I don't yet handle: "+t.getClass()
@@ -83,9 +100,11 @@ public class CompositeTextStatsCalculator {
         }
 
         TokenCounts tokenCounts = null;
-        if (tokenCountStatCalculators.size() > 0 || languageAwareTokenCountStats.size() > 0) {
+        if (tokenCountStatCalculators.size() > 0
+                || languageAwareTokenCountStats.size() > 0
+                || bytesRefCalculators.size() > 0) {
             try {
-                tokenCounts = tokenize(txt);
+                tokenCounts = tokenize(txt, results);
             } catch (IOException e) {
                 throw new RuntimeException(e);
             }
@@ -106,20 +125,51 @@ public class CompositeTextStatsCalculator {
         return results;
     }
 
-    private TokenCounts tokenize(String txt) throws IOException  {
+    private TokenCounts tokenize(String txt, Map<Class, Object> results) throws IOException  {
         TokenCounts counts = new TokenCounts();
         TokenStream ts = analyzer.tokenStream(FIELD, txt);
-        try {
-            CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
-            ts.reset();
-            while (ts.incrementToken()) {
-                String token = termAtt.toString();
-                counts.increment(token);
+        if (bytesRefCalculators.size() == 0) {
+            try {
+                CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
+                ts.reset();
+                while (ts.incrementToken()) {
+                    String token = termAtt.toString();
+                    counts.increment(token);
+                }
+            } finally {
+                ts.close();
+                ts.end();
+            }
+        } else {
+            List<BytesRefCalculator.BytesRefCalcInstance> brcis = new ArrayList<>();
+            for (BytesRefCalculator brf : bytesRefCalculators) {
+                brcis.add(brf.getInstance());
+            }
+            try {
+                TermToBytesRefAttribute termAtt = ts.getAttribute(TermToBytesRefAttribute.class);
+                ts.reset();
+                int i = 0;
+                while (ts.incrementToken()) {
+                    final BytesRef bytesRef = termAtt.getBytesRef();
+                    String token = termAtt.toString();
+                    counts.increment(token);
+                    for (BytesRefCalculator.BytesRefCalcInstance brci : brcis) {
+                        if (i > 0) {
+                            brci.update(whitespace, 0, 1);
+                        }
+                        brci.update(bytesRef.bytes, bytesRef.offset, bytesRef.length);
+                    }
+                    i++;
+                }
+                for (BytesRefCalculator.BytesRefCalcInstance brc : brcis) {
+                    results.put(brc.getOuterClass(), brc.finish());
+                }
+            } finally {
+                ts.close();
+                ts.end();
             }
-        } finally {
-            ts.close();
-            ts.end();
         }
+
         return counts;
     }
 }
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/textstats/TextProfileSignature.java b/tika-eval/src/main/java/org/apache/tika/eval/textstats/TextProfileSignature.java
index b72cc99..bc65351 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/textstats/TextProfileSignature.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/textstats/TextProfileSignature.java
@@ -16,7 +16,7 @@
  */
 package org.apache.tika.eval.textstats;
 
-import org.apache.commons.codec.binary.Base64;
+import org.apache.commons.codec.binary.Base32;
 import org.apache.commons.codec.digest.DigestUtils;
 import org.apache.commons.lang3.mutable.MutableInt;
 import org.apache.tika.eval.tokens.TokenCounts;
@@ -32,6 +32,8 @@ import java.util.Map;
  * https://github.com/apache/nutch/blob/master/src/java/org/apache/nutch/crawl/TextProfileSignature.java
  *
  * See documentation: https://nutch.apache.org/apidocs/apidocs-2.0/org/apache/nutch/crawl/TextProfileSignature.html
+ *
+ * This returns the base32-encoded SHA-256 digest of the profile text
  */
 public class TextProfileSignature implements TokenCountStatsCalculator<String> {
 
@@ -39,7 +41,7 @@ public class TextProfileSignature implements TokenCountStatsCalculator<String> {
     float quantRate = 0.01f;
     boolean secondaryLexicographicSorting = true;
 
-    Base64 base64 = new Base64();
+    Base32 base32 = new Base32();
 
     @Override
     public String calculate(TokenCounts tokenCounts) {
@@ -74,7 +76,7 @@ public class TextProfileSignature implements TokenCountStatsCalculator<String> {
             }
             newText.append(t.val);
         }
-        return base64.encodeAsString(DigestUtils.sha256(newText.toString()));
+        return base32.encodeAsString(DigestUtils.sha256(newText.toString()));
     }
 
     public void setMinTokenLength(int minTokenLength) {
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/textstats/TextSha256Signature.java b/tika-eval/src/main/java/org/apache/tika/eval/textstats/TextSha256Signature.java
new file mode 100644
index 0000000..eab4e08
--- /dev/null
+++ b/tika-eval/src/main/java/org/apache/tika/eval/textstats/TextSha256Signature.java
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval.textstats;
+
+import org.apache.commons.codec.binary.Base32;
+import org.apache.commons.codec.digest.DigestUtils;
+
+import java.security.MessageDigest;
+
+/**
+ * Calculates the base32 encoded SHA-256 checksum on the analyzed text
+ */
+public class TextSha256Signature implements BytesRefCalculator<String> {
+
+    static Base32 BASE32 = new Base32();
+
+    @Override
+    public BytesRefCalcInstance<String> getInstance() {
+        return new TextSha256Instance();
+    }
+
+    class TextSha256Instance implements BytesRefCalcInstance<String> {
+        private MessageDigest messageDigest = DigestUtils.getSha256Digest();
+        @Override
+        public void update(byte[] bytes, int start, int len) {
+            messageDigest.update(bytes, start, len);
+
+        }
+
+        @Override
+        public String finish() {
+            return BASE32.encodeAsString(messageDigest.digest());
+        }
+
+        @Override
+        public Class getOuterClass() {
+            return TextSha256Signature.class;
+        }
+    }
+}
diff --git a/tika-eval/src/test/java/org/apache/tika/eval/textstats/TextStatsTest.java b/tika-eval/src/test/java/org/apache/tika/eval/textstats/TextStatsTest.java
index ceb39ad..8206977 100644
--- a/tika-eval/src/test/java/org/apache/tika/eval/textstats/TextStatsTest.java
+++ b/tika-eval/src/test/java/org/apache/tika/eval/textstats/TextStatsTest.java
@@ -16,11 +16,14 @@
  */
 package org.apache.tika.eval.textstats;
 
+import org.apache.commons.codec.binary.Base32;
+import org.apache.commons.codec.digest.DigestUtils;
 import org.apache.tika.eval.langid.Language;
 import org.apache.tika.eval.langid.LanguageIDWrapper;
 import org.apache.tika.eval.tokens.CommonTokenResult;
 import org.junit.Test;
 
+import java.nio.charset.StandardCharsets;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.Map;
@@ -31,12 +34,14 @@ public class TextStatsTest {
 
     @Test
     public void testBasic() throws Exception {
-        String txt = "the quick brown fox &&^&%@! 8675309 jumped over the lazy wombat";
+        String txt = "The quick brown fox &&^&%@! 8675309 jumped over tHe lazy wombat";
+        String txtCleaned = "the quick brown fox 8675309 jumped over the lazy wombat";
         List<TextStatsCalculator> calcs = new ArrayList<>();
         calcs.add(new TextProfileSignature());
         calcs.add(new ContentLengthCalculator());
         calcs.add(new TokenEntropy());
         calcs.add(new CommonTokens());
+        calcs.add(new TextSha256Signature());
         CompositeTextStatsCalculator calc = new CompositeTextStatsCalculator(calcs);
 
         Map<Class, Object> stats = calc.calculate(txt);
@@ -60,6 +65,10 @@ public class TextStatsTest {
         assertEquals(0.01, probabilities.get(1).getConfidence(), 0.01);
 
         String textProfileSignature = (String)stats.get(TextProfileSignature.class);
-        assertEquals("aKhbjS6iV87VBbf/12OfDCWMBg5aS3Atktl2n4ypg14=", textProfileSignature);
+        assertEquals("NCUFXDJOUJL45VIFW775OY47BQSYYBQOLJFXALMS3F3J7DFJQNPA====", textProfileSignature);
+
+        assertEquals(new Base32().encodeAsString(
+                DigestUtils.sha256(txtCleaned.getBytes(StandardCharsets.UTF_8))),
+                stats.get(TextSha256Signature.class));
     }
 }