You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2019/08/13 18:38:06 UTC
[tika] branch master updated: TIKA-2906 -- add examples for
textstats
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/master by this push:
new 8bc50ec TIKA-2906 -- add examples for textstats
8bc50ec is described below
commit 8bc50ecf2f06e65eb86637b813e199b6ba59fcc4
Author: TALLISON <ta...@apache.org>
AuthorDate: Tue Aug 13 14:37:48 2019 -0400
TIKA-2906 -- add examples for textstats
---
.../org/apache/tika/eval/AbstractProfiler.java | 89 +++++++++++-----------
.../apache/tika/eval/langid/LanguageIDWrapper.java | 15 +++-
.../apache/tika/eval/textstats/CommonTokens.java | 3 +
.../textstats/CompositeTextStatsCalculator.java | 6 +-
.../apache/tika/eval/tokens/AnalyzerManager.java | 2 +-
.../tika/eval/tokens/CommonTokenCountManager.java | 5 +-
.../apache/tika/eval/tokens/CommonTokenResult.java | 4 +
tika-example/pom.xml | 5 ++
.../apache/tika/example/TextStatsFromTikaEval.java | 62 +++++++++++++++
.../tika/example/TextStatsFromTikaEvalTest.java | 31 ++++++++
10 files changed, 173 insertions(+), 49 deletions(-)
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java b/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java
index 7c3a436..becccf3 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java
@@ -49,6 +49,7 @@ import org.apache.tika.eval.db.TableInfo;
import org.apache.tika.eval.io.ExtractReaderException;
import org.apache.tika.eval.io.IDBWriter;
import org.apache.tika.eval.langid.Language;
+import org.apache.tika.eval.langid.LanguageIDWrapper;
import org.apache.tika.eval.textstats.BasicTokenCountStatsCalculator;
import org.apache.tika.eval.textstats.CommonTokens;
import org.apache.tika.eval.textstats.CompositeTextStatsCalculator;
@@ -63,9 +64,8 @@ import org.apache.tika.eval.tokens.CommonTokenCountManager;
import org.apache.tika.eval.tokens.CommonTokenResult;
import org.apache.tika.eval.tokens.TokenCounts;
import org.apache.tika.eval.tokens.TokenIntPair;
-import org.apache.tika.eval.util.ContentTags;
import org.apache.tika.eval.util.ContentTagParser;
-import org.apache.tika.eval.langid.LanguageIDWrapper;
+import org.apache.tika.eval.util.ContentTags;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.PagedText;
@@ -144,6 +144,7 @@ public abstract class AbstractProfiler extends FileResourceConsumer {
tmp.put("UL", Cols.TAGS_UL);
return Collections.unmodifiableMap(tmp);
}
+
private static CommonTokenCountManager COMMON_TOKEN_COUNT_MANAGER;
private String lastExtractExtension = null;
@@ -192,13 +193,17 @@ public abstract class AbstractProfiler extends FileResourceConsumer {
private final static Pattern ENCRYPTION_EXCEPTION =
Pattern.compile("org\\.apache\\.tika.exception\\.EncryptedDocumentException");
- private TikaConfig config = TikaConfig.getDefaultConfig();//TODO: allow configuration
+ private static LanguageIDWrapper LANG_ID = new LanguageIDWrapper();
+
+ //TODO: allow configuration
+ //private TikaConfig config = TikaConfig.getDefaultConfig();
CompositeTextStatsCalculator compositeTextStatsCalculator;
protected IDBWriter writer;
/**
- *
* @param p path to the common_tokens directory. If this is null, try to load from classPath
+ * @param defaultLangCode this is the language code to use if a common_words list doesn't exist for the
+ * detected language; can be <code>null</code>
* @throws IOException
*/
public static void loadCommonTokens(Path p, String defaultLangCode) throws IOException {
@@ -210,30 +215,26 @@ public abstract class AbstractProfiler extends FileResourceConsumer {
super(fileQueue);
this.writer = writer;
LanguageIDWrapper.setMaxTextLength(maxContentLengthForLangId);
- this.compositeTextStatsCalculator = initAnalyzersAndTokenCounter(maxTokens, new LanguageIDWrapper());
+ this.compositeTextStatsCalculator = initAnalyzersAndTokenCounter(maxTokens, LANG_ID);
}
private CompositeTextStatsCalculator initAnalyzersAndTokenCounter(int maxTokens, LanguageIDWrapper langIder) {
- try {
- analyzerManager = AnalyzerManager.newInstance(maxTokens);
- List<TextStatsCalculator> calculators = new ArrayList<>();
- calculators.add(new CommonTokens(COMMON_TOKEN_COUNT_MANAGER));
- calculators.add(new TokenEntropy());
- calculators.add(new TokenLengths());
- calculators.add(new TopNTokens(10));
- calculators.add(new BasicTokenCountStatsCalculator());
- calculators.add(new ContentLengthCalculator());
- calculators.add(new UnicodeBlockCounter(maxContentLengthForLangId));
-
- return new CompositeTextStatsCalculator(calculators, analyzerManager.getGeneralAnalyzer(), langIder);
- } catch (IOException e) {
- throw new RuntimeException(e);
- }
+ analyzerManager = AnalyzerManager.newInstance(maxTokens);
+ List<TextStatsCalculator> calculators = new ArrayList<>();
+ calculators.add(new CommonTokens(COMMON_TOKEN_COUNT_MANAGER));
+ calculators.add(new TokenEntropy());
+ calculators.add(new TokenLengths());
+ calculators.add(new TopNTokens(10));
+ calculators.add(new BasicTokenCountStatsCalculator());
+ calculators.add(new ContentLengthCalculator());
+ calculators.add(new UnicodeBlockCounter(maxContentLengthForLangId));
+ return new CompositeTextStatsCalculator(calculators, analyzerManager.getGeneralAnalyzer(), langIder);
}
/**
* Truncate the content string if greater than this length to this length
+ *
* @param maxContentLength
*/
public void setMaxContentLength(int maxContentLength) {
@@ -281,7 +282,7 @@ public abstract class AbstractProfiler extends FileResourceConsumer {
data.put(Cols.CONTAINER_ID, containerId);
data.put(Cols.MD5, m.get(DIGEST_KEY));
- if ( i < numAttachments.size()) {
+ if (i < numAttachments.size()) {
data.put(Cols.NUM_ATTACHMENTS, Integer.toString(numAttachments.get(i)));
}
data.put(Cols.ELAPSED_TIME_MILLIS, getTime(m));
@@ -376,8 +377,9 @@ public abstract class AbstractProfiler extends FileResourceConsumer {
if (content == null || content.trim().length() == 0) {
content = "";
}
- return compositeTextStatsCalculator.calculate(content);
+ return compositeTextStatsCalculator.calculate(content);
}
+
/**
* Checks to see if metadata is null or content is empty (null or only whitespace).
* If any of these, then this does no processing, and the fileId is not
@@ -398,7 +400,7 @@ public abstract class AbstractProfiler extends FileResourceConsumer {
langid(textStats, data);
writeTokenCounts(textStats, data);
- CommonTokenResult commonTokenResult = (CommonTokenResult)textStats.get(CommonTokens.class);
+ CommonTokenResult commonTokenResult = (CommonTokenResult) textStats.get(CommonTokens.class);
if (commonTokenResult != null) {
data.put(Cols.COMMON_TOKENS_LANG, commonTokenResult.getLangCode());
data.put(Cols.NUM_UNIQUE_COMMON_TOKENS, Integer.toString(commonTokenResult.getUniqueCommonTokens()));
@@ -408,7 +410,7 @@ public abstract class AbstractProfiler extends FileResourceConsumer {
data.put(Cols.NUM_ALPHABETIC_TOKENS,
Integer.toString(commonTokenResult.getAlphabeticTokens()));
}
- TokenCounts tokenCounts = (TokenCounts)textStats.get(BasicTokenCountStatsCalculator.class);
+ TokenCounts tokenCounts = (TokenCounts) textStats.get(BasicTokenCountStatsCalculator.class);
if (tokenCounts != null) {
data.put(Cols.NUM_UNIQUE_TOKENS,
@@ -418,10 +420,10 @@ public abstract class AbstractProfiler extends FileResourceConsumer {
}
if (textStats.get(TokenEntropy.class) != null) {
data.put(Cols.TOKEN_ENTROPY_RATE,
- Double.toString((Double)textStats.get(TokenEntropy.class)));
+ Double.toString((Double) textStats.get(TokenEntropy.class)));
}
- SummaryStatistics summStats = (SummaryStatistics)textStats.get(TokenLengths.class);
+ SummaryStatistics summStats = (SummaryStatistics) textStats.get(TokenLengths.class);
if (summStats != null) {
data.put(Cols.TOKEN_LENGTH_SUM,
Integer.toString((int) summStats.getSum()));
@@ -556,6 +558,7 @@ public abstract class AbstractProfiler extends FileResourceConsumer {
return c;
}
+
protected static ContentTags getContent(EvalFilePaths evalFilePaths, Metadata metadata) {
if (metadata == null) {
return ContentTags.EMPTY_CONTENT_TAGS;
@@ -565,7 +568,7 @@ public abstract class AbstractProfiler extends FileResourceConsumer {
void unicodeBlocks(Map<Class, Object> tokenStats, Map<Cols, String> data) {
- Map<String, MutableInt> blocks = (Map<String, MutableInt>)tokenStats.get(UnicodeBlockCounter.class);
+ Map<String, MutableInt> blocks = (Map<String, MutableInt>) tokenStats.get(UnicodeBlockCounter.class);
List<Pair<String, Integer>> pairs = new ArrayList<>();
for (Map.Entry<String, MutableInt> e : blocks.entrySet()) {
pairs.add(Pair.of(e.getKey(), e.getValue().intValue()));
@@ -582,18 +585,18 @@ public abstract class AbstractProfiler extends FileResourceConsumer {
if (i > 0) {
sb.append(" | ");
}
- sb.append(pairs.get(i).getKey()+": "+pairs.get(i).getValue());
+ sb.append(pairs.get(i).getKey() + ": " + pairs.get(i).getValue());
}
data.put(Cols.UNICODE_CHAR_BLOCKS, sb.toString());
}
void langid(Map<Class, Object> stats, Map<Cols, String> data) {
- List<Language> probabilities = (List<Language>)stats.get(LanguageIDWrapper.class);
+ List<Language> probabilities = (List<Language>) stats.get(LanguageIDWrapper.class);
if (probabilities.size() > 0) {
data.put(Cols.LANG_ID_1, probabilities.get(0).getLanguage());
data.put(Cols.LANG_ID_PROB_1,
- Double.toString(probabilities.get(0).getConfidence()));
+ Double.toString(probabilities.get(0).getConfidence()));
}
if (probabilities.size() > 1) {
data.put(Cols.LANG_ID_2, probabilities.get(1).getLanguage());
@@ -615,7 +618,7 @@ public abstract class AbstractProfiler extends FileResourceConsumer {
}
void writeTokenCounts(Map<Class, Object> textStats, Map<Cols, String> data) {
- TokenIntPair[] tokenIntPairs = (TokenIntPair[])textStats.get(TopNTokens.class);
+ TokenIntPair[] tokenIntPairs = (TokenIntPair[]) textStats.get(TopNTokens.class);
int i = 0;
StringBuilder sb = new StringBuilder();
for (TokenIntPair t : tokenIntPairs) {
@@ -635,7 +638,6 @@ public abstract class AbstractProfiler extends FileResourceConsumer {
/**
- *
* @param metadata
* @param extracts
* @return evalfilepaths for files if crawling an extract directory
@@ -647,7 +649,7 @@ public abstract class AbstractProfiler extends FileResourceConsumer {
Path relativeSourceFilePath = Paths.get(m.replaceAll(""));
//just try slapping the relextractfilepath on the extractdir
Path extractFile = extracts.resolve(relExtractFilePath);
- if (! Files.isRegularFile(extractFile)) {
+ if (!Files.isRegularFile(extractFile)) {
//if that doesn't work, try to find the right extract file.
//This is necessary if crawling extractsA and trying to find a file in
//extractsB that is not in the same format: json vs txt or compressed
@@ -655,6 +657,7 @@ public abstract class AbstractProfiler extends FileResourceConsumer {
}
return new EvalFilePaths(relativeSourceFilePath, extractFile);
}
+
//call this if the crawler is crawling through the src directory
protected EvalFilePaths getPathsFromSrcCrawl(Metadata metadata, Path srcDir,
Path extracts) {
@@ -673,7 +676,6 @@ public abstract class AbstractProfiler extends FileResourceConsumer {
}
/**
- *
* @param extractRootDir
* @param relativeSourceFilePath
* @return extractFile or null if couldn't find one.
@@ -681,16 +683,16 @@ public abstract class AbstractProfiler extends FileResourceConsumer {
private Path findFile(Path extractRootDir, Path relativeSourceFilePath) {
String relSrcFilePathString = relativeSourceFilePath.toString();
if (lastExtractExtension != null) {
- Path candidate = extractRootDir.resolve(relSrcFilePathString+lastExtractExtension);
+ Path candidate = extractRootDir.resolve(relSrcFilePathString + lastExtractExtension);
if (Files.isRegularFile(candidate)) {
return candidate;
}
}
for (String ext : EXTRACT_EXTENSIONS) {
for (String compress : COMPRESSION_EXTENSIONS) {
- Path candidate = extractRootDir.resolve(relSrcFilePathString+ext+compress);
+ Path candidate = extractRootDir.resolve(relSrcFilePathString + ext + compress);
if (Files.isRegularFile(candidate)) {
- lastExtractExtension = ext+compress;
+ lastExtractExtension = ext + compress;
return candidate;
}
}
@@ -737,7 +739,6 @@ public abstract class AbstractProfiler extends FileResourceConsumer {
}
/**
- *
* @param list
* @return empty list if input list is empty or null
*/
@@ -747,7 +748,7 @@ public abstract class AbstractProfiler extends FileResourceConsumer {
return ret;
}
//container document attachment count = list.size()-1
- ret.add(list.size()-1);
+ ret.add(list.size() - 1);
Map<String, Integer> counts = new HashMap<>();
for (int i = 1; i < list.size(); i++) {
@@ -758,7 +759,7 @@ public abstract class AbstractProfiler extends FileResourceConsumer {
}
String[] parts = path.split("/");
StringBuilder parent = new StringBuilder();
- for (int end = 1; end < parts.length-1; end++) {
+ for (int end = 1; end < parts.length - 1; end++) {
parent.setLength(0);
join("/", parent, parts, 1, end);
String parentPath = parent.toString();
@@ -801,7 +802,7 @@ public abstract class AbstractProfiler extends FileResourceConsumer {
if (evalFilePaths.getExtractFile().getFileName().toString().toLowerCase(Locale.ENGLISH).endsWith(".html")) {
try {
return ContentTagParser.parseHTML(s, UC_TAGS_OF_INTEREST.keySet());
- } catch (IOException|SAXException e) {
+ } catch (IOException | SAXException e) {
LOG.warn("Problem parsing html in {}; backing off to treat string as text",
evalFilePaths.getExtractFile().toAbsolutePath().toString(), e);
@@ -809,17 +810,17 @@ public abstract class AbstractProfiler extends FileResourceConsumer {
}
} else if (
evalFilePaths.getExtractFile().getFileName().toString().toLowerCase(Locale.ENGLISH).endsWith(".xhtml") ||
- (handlerClass != null && handlerClass.equals(ToXMLContentHandler.class.getSimpleName()))) {
+ (handlerClass != null && handlerClass.equals(ToXMLContentHandler.class.getSimpleName()))) {
try {
return ContentTagParser.parseXML(s, UC_TAGS_OF_INTEREST.keySet());
- } catch (TikaException|IOException|SAXException e) {
+ } catch (TikaException | IOException | SAXException e) {
LOG.warn("Problem parsing xhtml in {}; backing off to html parser",
evalFilePaths.getExtractFile().toAbsolutePath().toString(), e);
try {
ContentTags contentTags = ContentTagParser.parseHTML(s, UC_TAGS_OF_INTEREST.keySet());
contentTags.setParseException(true);
return contentTags;
- } catch (IOException|SAXException e2) {
+ } catch (IOException | SAXException e2) {
LOG.warn("Problem parsing html in {}; backing off to treat string as text",
evalFilePaths.getExtractFile().toAbsolutePath().toString(), e2);
}
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/langid/LanguageIDWrapper.java b/tika-eval/src/main/java/org/apache/tika/eval/langid/LanguageIDWrapper.java
index cec5037..fc2ee0c 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/langid/LanguageIDWrapper.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/langid/LanguageIDWrapper.java
@@ -31,14 +31,18 @@ import opennlp.tools.util.normalizer.ShrinkCharSequenceNormalizer;
import opennlp.tools.util.normalizer.TwitterCharSequenceNormalizer;
import org.apache.tika.eval.textstats.StringStatsCalculator;
-
+/**
+ * The most efficient way to call this in a multithreaded environment
+ * is to call {@link LanguageIDWrapper#loadBuiltInModels()} before
+ * instantiating this class.
+ */
public class LanguageIDWrapper implements StringStatsCalculator<List<Language>> {
static LanguageDetectorModel LANG_MODEL;
static int MAX_TEXT_LENGTH = 50000;
- public static void loadBuiltInModels() throws IOException {
+ public static synchronized void loadBuiltInModels() throws IOException {
try (InputStream is = LanguageIDWrapper.class.getResourceAsStream(
"/opennlp/model_20190626.bin"
)) {
@@ -62,6 +66,13 @@ public class LanguageIDWrapper implements StringStatsCalculator<List<Language>>
private final opennlp.tools.langdetect.LanguageDetector detector;
public LanguageIDWrapper() {
+ if (LANG_MODEL == null) {
+ try {
+ loadBuiltInModels();
+ } catch (IOException e) {
+ throw new RuntimeException("couldn't load built in lang models", e);
+ }
+ }
detector = new ProbingLanguageDetector(LANG_MODEL);
}
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/textstats/CommonTokens.java b/tika-eval/src/main/java/org/apache/tika/eval/textstats/CommonTokens.java
index 1b47098..cb1dee1 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/textstats/CommonTokens.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/textstats/CommonTokens.java
@@ -33,6 +33,9 @@ public class CommonTokens implements LanguageAwareTokenCountStats<CommonTokenRes
private final CommonTokenCountManager commonTokenCountManager;
+ public CommonTokens() {
+ this(new CommonTokenCountManager());
+ }
public CommonTokens(CommonTokenCountManager mgr) {
this.commonTokenCountManager = mgr;
}
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/textstats/CompositeTextStatsCalculator.java b/tika-eval/src/main/java/org/apache/tika/eval/textstats/CompositeTextStatsCalculator.java
index da8cd79..2c7c673 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/textstats/CompositeTextStatsCalculator.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/textstats/CompositeTextStatsCalculator.java
@@ -27,12 +27,14 @@ import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.tika.eval.langid.Language;
import org.apache.tika.eval.langid.LanguageIDWrapper;
+import org.apache.tika.eval.tokens.AnalyzerManager;
import org.apache.tika.eval.tokens.TokenCounts;
public class CompositeTextStatsCalculator {
private static final String FIELD = "f";
+ private static final int DEFAULT_MAX_TOKENS = 10_000_000;
private final Analyzer analyzer;
private final LanguageIDWrapper languageIDWrapper;
private final List<LanguageAwareTokenCountStats> languageAwareTokenCountStats = new ArrayList<>();
@@ -40,7 +42,9 @@ public class CompositeTextStatsCalculator {
private final List<StringStatsCalculator> stringStatCalculators = new ArrayList<>();
public CompositeTextStatsCalculator(List<TextStatsCalculator> calculators) {
- this(calculators, null, null);
+ this(calculators,
+ AnalyzerManager.newInstance(DEFAULT_MAX_TOKENS).getGeneralAnalyzer(),
+ new LanguageIDWrapper());
}
public CompositeTextStatsCalculator(List<TextStatsCalculator> calculators, Analyzer analyzer,
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/tokens/AnalyzerManager.java b/tika-eval/src/main/java/org/apache/tika/eval/tokens/AnalyzerManager.java
index c5aa831..7cb8ae6 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/tokens/AnalyzerManager.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/tokens/AnalyzerManager.java
@@ -42,7 +42,7 @@ public class AnalyzerManager {
this.commonTokensAnalyzer = commonTokensAnalyzer;
}
- public static AnalyzerManager newInstance(int maxTokens) throws IOException {
+ public static AnalyzerManager newInstance(int maxTokens) {
InputStream is = AnalyzerManager.class.getClassLoader().getResourceAsStream("lucene-analyzers.json");
Reader reader = new InputStreamReader(is, StandardCharsets.UTF_8);
GsonBuilder builder = new GsonBuilder();
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/tokens/CommonTokenCountManager.java b/tika-eval/src/main/java/org/apache/tika/eval/tokens/CommonTokenCountManager.java
index 48dbdcd..8c8f887 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/tokens/CommonTokenCountManager.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/tokens/CommonTokenCountManager.java
@@ -56,7 +56,10 @@ public class CommonTokenCountManager {
//make this configurable
private final String defaultLangCode;
- public CommonTokenCountManager(Path commonTokensDir, String defaultLangCode) throws IOException {
+ public CommonTokenCountManager() {
+ this(null, null);
+ }
+ public CommonTokenCountManager(Path commonTokensDir, String defaultLangCode) {
if (defaultLangCode == null) {
defaultLangCode = "";
}
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/tokens/CommonTokenResult.java b/tika-eval/src/main/java/org/apache/tika/eval/tokens/CommonTokenResult.java
index beed41a..dbcb266 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/tokens/CommonTokenResult.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/tokens/CommonTokenResult.java
@@ -34,6 +34,10 @@ public class CommonTokenResult {
this.alphabeticTokens = alphabeticTokens;
}
+ /**
+ *
+ * @return the language used to select the common_tokens list
+ */
public String getLangCode() {
return langCode;
}
diff --git a/tika-example/pom.xml b/tika-example/pom.xml
index 0c4305a..391f42e 100644
--- a/tika-example/pom.xml
+++ b/tika-example/pom.xml
@@ -75,6 +75,11 @@
</dependency>
<dependency>
<groupId>org.apache.tika</groupId>
+ <artifactId>tika-eval</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.tika</groupId>
<artifactId>tika-core</artifactId>
<version>${project.version}</version>
<type>test-jar</type>
diff --git a/tika-example/src/main/java/org/apache/tika/example/TextStatsFromTikaEval.java b/tika-example/src/main/java/org/apache/tika/example/TextStatsFromTikaEval.java
new file mode 100644
index 0000000..1b545b5
--- /dev/null
+++ b/tika-example/src/main/java/org/apache/tika/example/TextStatsFromTikaEval.java
@@ -0,0 +1,62 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.example;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.tika.eval.textstats.CommonTokens;
+import org.apache.tika.eval.textstats.CompositeTextStatsCalculator;
+import org.apache.tika.eval.textstats.TextStatsCalculator;
+import org.apache.tika.eval.tokens.CommonTokenResult;
+
+/**
+ * These examples create a new {@link CompositeTextStatsCalculator}
+ * for each call. This is extremely inefficient because the lang id
+ * model and the common words have to be loaded for each call.
+ *
+ */
+public class TextStatsFromTikaEval {
+
+ /**
+ * Use the default language id models and the default common tokens
+ * lists in tika-eval to calculate the out-of-vocabulary percentage
+ * for a given string.
+ *
+ * @param txt
+ * @return
+ */
+ public double getOOV(String txt) {
+ List<TextStatsCalculator> calculators = new ArrayList<>();
+ calculators.add(new CommonTokens());
+ CompositeTextStatsCalculator calc = new CompositeTextStatsCalculator(calculators);
+ Map<Class, Object> results = calc.calculate(txt);
+
+ /*
+ Note that the OOV requires language id, so you can also
+ retrieve the detected languages with this:
+
+ List<Language> detectedLanguages = (List<Language>) results.get(LanguageIDWrapper.class);
+
+ */
+
+ CommonTokenResult result = (CommonTokenResult)results.get(CommonTokens.class);
+ result.getLangCode();
+ return result.getOOV();
+ }
+}
diff --git a/tika-example/src/test/java/org/apache/tika/example/TextStatsFromTikaEvalTest.java b/tika-example/src/test/java/org/apache/tika/example/TextStatsFromTikaEvalTest.java
new file mode 100644
index 0000000..273fb60
--- /dev/null
+++ b/tika-example/src/test/java/org/apache/tika/example/TextStatsFromTikaEvalTest.java
@@ -0,0 +1,31 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.example;
+
+import static org.junit.Assert.assertEquals;
+
+import org.junit.Test;
+
+public class TextStatsFromTikaEvalTest {
+
+ @Test
+ public void testOOV() throws Exception {
+ TextStatsFromTikaEval textStats = new TextStatsFromTikaEval();
+ String s = "the quick brown fox jumped over the lazy dog asdfas asdf asdfasfasf";
+ assertEquals(textStats.getOOV(s), 0.25, 0.01);
+ }
+}