You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2020/07/27 16:54:25 UTC
[tika] branch main updated: TIKA-3147 -- strip punctuation before
language id; fix bug that omitted filters on text before language id.
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 7a0f5fc TIKA-3147 -- strip punctuation before language id; fix bug that omitted filters on text before language id.
7a0f5fc is described below
commit 7a0f5fc72f5973cb7100701a886b748a2748e0b9
Author: tballison <ta...@apache.org>
AuthorDate: Mon Jul 27 12:53:59 2020 -0400
TIKA-3147 -- strip punctuation before language id; fix bug
that omitted filters on text before language id.
---
.../apache/tika/eval/langid/LanguageIDWrapper.java | 22 ++++++++++++++++++++--
.../apache/tika/eval/textstats/TextStatsTest.java | 13 ++++++++++---
2 files changed, 30 insertions(+), 5 deletions(-)
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/langid/LanguageIDWrapper.java b/tika-eval/src/main/java/org/apache/tika/eval/langid/LanguageIDWrapper.java
index fc2ee0c..50f928e 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/langid/LanguageIDWrapper.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/langid/LanguageIDWrapper.java
@@ -56,8 +56,9 @@ public class LanguageIDWrapper implements StringStatsCalculator<List<Language>>
private static CharSequenceNormalizer[] getNormalizers() {
return new CharSequenceNormalizer[]{
- EmojiCharSequenceNormalizer.getInstance(),
TikaUrlCharSequenceNormalizer.getInstance(),
+ AlphaIdeographSequenceNormalizer.getInstance(),
+ EmojiCharSequenceNormalizer.getInstance(),
TwitterCharSequenceNormalizer.getInstance(),
NumberCharSequenceNormalizer.getInstance(),
ShrinkCharSequenceNormalizer.getInstance()
@@ -73,7 +74,7 @@ public class LanguageIDWrapper implements StringStatsCalculator<List<Language>>
throw new RuntimeException("couldn't load built in lang models", e);
}
}
- detector = new ProbingLanguageDetector(LANG_MODEL);
+ detector = new ProbingLanguageDetector(LANG_MODEL, getNormalizers());
}
public List<Language> getProbabilities(String s) {
@@ -118,4 +119,21 @@ public class LanguageIDWrapper implements StringStatsCalculator<List<Language>>
return MAIL_REGEX.matcher(modified).replaceAll(" ");
}
}
+
+ private static class AlphaIdeographSequenceNormalizer implements CharSequenceNormalizer { // normalizer that keeps only alphabetic / ideographic characters
+ private static final Pattern REGEX = Pattern.compile("[^\\p{IsAlphabetic}\\p{IsIdeographic}]+"); // matches a run of one or more chars that are neither Alphabetic nor Ideographic
+ private static final AlphaIdeographSequenceNormalizer INSTANCE = new AlphaIdeographSequenceNormalizer(); // single shared instance returned by getInstance()
+ // Returns the shared INSTANCE; the constructor below is private, so this is the only way to obtain one.
+ public static AlphaIdeographSequenceNormalizer getInstance() {
+ return INSTANCE;
+ }
+ // Private no-op constructor: only the INSTANCE field initializer above calls it.
+ private AlphaIdeographSequenceNormalizer() {
+ }
+ // Replaces every run matched by REGEX (e.g. punctuation, ASCII digits, whitespace) with a single space and returns the result.
+ @Override
+ public CharSequence normalize(CharSequence charSequence) {
+ return REGEX.matcher(charSequence).replaceAll(" "); // a whole non-alpha run collapses to one " " because REGEX ends in '+'
+ }
+ }
}
diff --git a/tika-eval/src/test/java/org/apache/tika/eval/textstats/TextStatsTest.java b/tika-eval/src/test/java/org/apache/tika/eval/textstats/TextStatsTest.java
index a2252b6..6290150 100644
--- a/tika-eval/src/test/java/org/apache/tika/eval/textstats/TextStatsTest.java
+++ b/tika-eval/src/test/java/org/apache/tika/eval/textstats/TextStatsTest.java
@@ -34,7 +34,7 @@ public class TextStatsTest {
@Test
public void testBasic() throws Exception {
- String txt = "The quick brown fox &&^&%@! 8675309 jumped over tHe lazy wombat";
+ String txt = "The quick brown fox &&^&%@! ; ; ; ;;; ;;; 8675309 jumped over tHe lazy wombat";
String txtCleaned = "the quick brown fox 8675309 jumped over the lazy wombat";
List<TextStatsCalculator> calcs = new ArrayList<>();
calcs.add(new TextProfileSignature());
@@ -56,13 +56,13 @@ public class TextStatsTest {
assertEquals( 0.11, ctr.getOOV(), 0.02);
- assertEquals(63, (int)stats.get(ContentLengthCalculator.class));
+ assertEquals(77, (int)stats.get(ContentLengthCalculator.class));
assertEquals(3.12, (double)stats.get(TokenEntropy.class), 0.01);
List<Language> probabilities = (List<Language>) stats.get(LanguageIDWrapper.class);
assertEquals("eng", probabilities.get(0).getLanguage());
- assertEquals(0.01, probabilities.get(1).getConfidence(), 0.01);
+ assertEquals(0.02, probabilities.get(1).getConfidence(), 0.01);
String textProfileSignature = (String)stats.get(TextProfileSignature.class);
assertEquals("NCUFXDJOUJL45VIFW775OY47BQSYYBQOLJFXALMS3F3J7DFJQNPA====", textProfileSignature);
@@ -77,9 +77,16 @@ public class TextStatsTest {
String txt = "普林斯顿大学";
List<TextStatsCalculator> calcs = new ArrayList<>();
calcs.add(new TextProfileSignature());
+ calcs.add(new CommonTokens());
CompositeTextStatsCalculator calc = new CompositeTextStatsCalculator(calcs);
Map<Class, Object> stats = calc.calculate(txt);
+
+ List<Language> probabilities = (List<Language>) stats.get(LanguageIDWrapper.class);
+ assertEquals("cmn", probabilities.get(0).getLanguage());
+ assertEquals(0.009, probabilities.get(1).getConfidence(), 0.01);
+
+
String textProfileSignature = (String)stats.get(TextProfileSignature.class);
assertEquals("XKXLY6FNIGK2KGEF6HOSKSVGYDLLOFIAGO73RLMJ22PZVXBTXFFA====", textProfileSignature);