You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2017/04/06 15:58:12 UTC
[tika] 02/03: TIKA-2317 -- warn when content string is truncated,
allow easier parameterization of other limits via commandline.
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git
commit 246133a2d4ba6980217e04efabacef652a4a460c
Author: tballison <ta...@mitre.org>
AuthorDate: Thu Apr 6 11:56:45 2017 -0400
TIKA-2317 -- warn when content string is truncated, allow easier
parameterization of other limits via commandline.
---
.../org/apache/tika/eval/AbstractProfiler.java | 86 +++++++++++++++++-----
.../java/org/apache/tika/eval/ExtractComparer.java | 6 ++
.../java/org/apache/tika/eval/ExtractProfiler.java | 6 +-
.../tika/eval/batch/EvalConsumerBuilder.java | 21 ++++++
.../tika/eval/batch/ExtractComparerBuilder.java | 4 +-
.../tika/eval/batch/ExtractProfilerBuilder.java | 5 +-
.../main/java/org/apache/tika/eval/db/Cols.java | 3 +-
.../java/org/apache/tika/eval/reports/Report.java | 1 +
.../tika/eval/tokens/AnalyzerDeserializer.java | 26 +++++--
.../apache/tika/eval/tokens/AnalyzerManager.java | 6 +-
tika-eval/src/main/resources/lucene-analyzers.json | 7 --
.../main/resources/tika-eval-comparison-config.xml | 6 ++
.../main/resources/tika-eval-profiler-config.xml | 5 ++
.../org/apache/tika/eval/AnalyzerManagerTest.java | 7 +-
.../org/apache/tika/eval/SimpleComparerTest.java | 46 ++++++++++--
.../java/org/apache/tika/eval/TikaEvalCLITest.java | 20 ++++-
.../apache/tika/eval/tokens/TokenCounterTest.java | 2 +-
17 files changed, 204 insertions(+), 53 deletions(-)
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java b/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java
index 4d04f23..67dee85 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java
@@ -108,8 +108,9 @@ public abstract class AbstractProfiler extends FileResourceConsumer {
private static CommonTokenCountManager commonTokenCountManager;
private String lastExtractExtension = null;
- final AnalyzerManager analyzerManager;
- final TokenCounter tokenCounter;
+ AnalyzerManager analyzerManager;
+ TokenCounter tokenCounter;
+
public enum EXCEPTION_TYPE {
RUNTIME,
@@ -136,9 +137,11 @@ public abstract class AbstractProfiler extends FileResourceConsumer {
private static Pattern FILE_NAME_CLEANER = Pattern.compile("\\.(json|txt)(\\.(bz2|gz|zip))?$");
- final static int FILE_PATH_MAX_LEN = 512;//max len for varchar for file_path
- final static int MAX_STRING_LENGTH = 1000000;
- final static int MAX_LEN_FOR_LANG_ID = 20000;
+ final static int FILE_PATH_MAX_LEN = 1024;//max len for varchar for file_path
+ int maxContentLength = 10000000;
+ int maxContentLengthForLangId = 50000;
+ int maxTokens = 200000;
+
//these remove runtime info from the stacktraces so
//that actual causes can be counted.
@@ -168,14 +171,45 @@ public abstract class AbstractProfiler extends FileResourceConsumer {
super(fileQueue);
this.writer = writer;
langIder = new LanguageIDWrapper();
+ initAnalyzersAndTokenCounter(maxTokens);
+ }
+
+ private void initAnalyzersAndTokenCounter(int maxTokens) {
try {
- analyzerManager = AnalyzerManager.newInstance();
+ analyzerManager = AnalyzerManager.newInstance(maxTokens);
tokenCounter = new TokenCounter(analyzerManager.getGeneralAnalyzer());
} catch (IOException e) {
throw new RuntimeException(e);
}
+
+ }
+
+ /**
+ * Truncate the content string if greater than this length to this length
+ * @param maxContentLength
+ */
+ public void setMaxContentLength(int maxContentLength) {
+ this.maxContentLength = maxContentLength;
}
+ /**
+ * Truncate content string if greater than this length to this length for lang id
+ *
+ * @param maxContentLengthForLangId
+ */
+ public void setMaxContentLengthForLangId(int maxContentLengthForLangId) {
+ this.maxContentLengthForLangId = maxContentLengthForLangId;
+ }
+
+ /**
+ * Add a LimitTokenCountFilterFactory if > -1
+ *
+ * @param maxTokens
+ */
+ public void setMaxTokens(int maxTokens) {
+ this.maxTokens = maxTokens;
+ initAnalyzersAndTokenCounter(maxTokens);
+ }
protected void writeExtractException(TableInfo extractExceptionTable, String containerId,
@@ -233,7 +267,7 @@ public abstract class AbstractProfiler extends FileResourceConsumer {
data.put(Cols.ELAPSED_TIME_MILLIS,
getTime(m));
- String content = getContent(m, MAX_STRING_LENGTH);
+ String content = getContent(m, maxContentLength);
if (content == null || content.trim().length() == 0) {
data.put(Cols.HAS_CONTENT, FALSE);
} else {
@@ -298,15 +332,14 @@ public abstract class AbstractProfiler extends FileResourceConsumer {
if (m == null) {
return;
}
-
- String content = getContent(m, MAX_STRING_LENGTH);
+ Map<Cols, String> data = new HashMap<>();
+ String content = getContent(m, maxContentLength, data);
if (content == null || content.trim().length() == 0) {
return;
}
tokenCounter.clear(fieldName);
tokenCounter.add(fieldName, content);
- Map<Cols, String> data = new HashMap<>();
data.put(Cols.ID, fileId);
data.put(Cols.CONTENT_LENGTH, Integer.toString(content.length()));
langid(m, data);
@@ -415,6 +448,24 @@ public abstract class AbstractProfiler extends FileResourceConsumer {
}
}
+ /**
+ * Get the content and record in the data {@link Cols#CONTENT_TRUNCATED_AT_MAX_LEN} whether the string was truncated
+ *
+ * @param metadata
+ * @param maxLength
+ * @param data
+ * @return
+ */
+ protected static String getContent(Metadata metadata, int maxLength, Map<Cols, String> data) {
+ data.put(Cols.CONTENT_TRUNCATED_AT_MAX_LEN, "FALSE");
+ String c = getContent(metadata, maxLength);
+ if (c.length() > maxLength) {
+ c = c.substring(0, maxLength);
+ data.put(Cols.CONTENT_TRUNCATED_AT_MAX_LEN, "TRUE");
+ }
+ return c;
+
+ }
protected static String getContent(Metadata metadata, int maxLength) {
if (metadata == null) {
return "";
@@ -423,20 +474,17 @@ public abstract class AbstractProfiler extends FileResourceConsumer {
if (c == null) {
return "";
}
- if (c.length() > maxLength) {
- c = c.substring(0, maxLength);
- }
return c;
}
void unicodeBlocks(Metadata metadata, Map<Cols, String> data) {
- String content = getContent(metadata, MAX_LEN_FOR_LANG_ID);
+ String content = getContent(metadata, maxContentLengthForLangId);
if (content.length() < 200) {
return;
}
String s = content;
- if (content.length() > MAX_LEN_FOR_LANG_ID) {
- s = content.substring(0, MAX_LEN_FOR_LANG_ID);
+ if (content.length() > maxContentLengthForLangId) {
+ s = content.substring(0, maxContentLengthForLangId);
}
Map<String, Integer> m = new HashMap<>();
Reader r = new StringReader(s);
@@ -483,13 +531,13 @@ public abstract class AbstractProfiler extends FileResourceConsumer {
}
void langid(Metadata metadata, Map<Cols, String> data) {
- String content = getContent(metadata, MAX_LEN_FOR_LANG_ID);
+ String content = getContent(metadata, maxContentLengthForLangId);
if (content.length() < 50) {
return;
}
String s = content;
- if (content.length() > MAX_LEN_FOR_LANG_ID) {
- s = content.substring(0, MAX_LEN_FOR_LANG_ID);
+ if (content.length() > maxContentLengthForLangId) {
+ s = content.substring(0, maxContentLengthForLangId);
}
List<DetectedLanguage> probabilities = langIder.getProbabilities(s);
if (probabilities.size() > 0) {
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/ExtractComparer.java b/tika-eval/src/main/java/org/apache/tika/eval/ExtractComparer.java
index a50b710..9caef9f 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/ExtractComparer.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/ExtractComparer.java
@@ -80,6 +80,12 @@ public class ExtractComparer extends AbstractProfiler {
.addOption("tablePrefixB", true, "EXPERT: optional prefix for table names for B")
.addOption("drop", true, "drop tables if they exist")
.addOption("maxFilesToAdd", true, "maximum number of files to add to the crawler")
+ .addOption("maxContentLength", true, "truncate content beyond this length for calculating 'contents' stats, default=10000000")
+ .addOption("maxContentLengthForLangId", true, "truncate content beyond this length for language id, default=50000")
+ .addOption("maxTokens", true, "maximum tokens to process, default=200000")
+
;
}
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/ExtractProfiler.java b/tika-eval/src/main/java/org/apache/tika/eval/ExtractProfiler.java
index 1f9bfda..9b7ddc4 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/ExtractProfiler.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/ExtractProfiler.java
@@ -68,6 +68,9 @@ public class ExtractProfiler extends AbstractProfiler {
.addOption("tablePrefix", true, "EXPERT: optional prefix for table names")
.addOption("drop", true, "drop tables if they exist")
.addOption("maxFilesToAdd", true, "maximum number of files to add to the crawler")
+ .addOption("maxTokens", true, "maximum tokens to process, default=200000")
+ .addOption("maxContentLength", true, "truncate content beyond this length for calculating 'contents' stats, default=10000000")
+ .addOption("maxContentLengthForLangId", true, "truncate content beyond this length for language id, default=50000")
;
@@ -145,7 +148,8 @@ public class ExtractProfiler extends AbstractProfiler {
new ColInfo(Cols.TOKEN_ENTROPY_RATE, Types.FLOAT),
new ColInfo(Cols.TOKEN_LENGTH_SUM, Types.INTEGER),
new ColInfo(Cols.TOKEN_LENGTH_MEAN, Types.FLOAT),
- new ColInfo(Cols.TOKEN_LENGTH_STD_DEV, Types.FLOAT)
+ new ColInfo(Cols.TOKEN_LENGTH_STD_DEV, Types.FLOAT),
+ new ColInfo(Cols.CONTENT_TRUNCATED_AT_MAX_LEN, Types.BOOLEAN)
);
private final Path inputDir;
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/batch/EvalConsumerBuilder.java b/tika-eval/src/main/java/org/apache/tika/eval/batch/EvalConsumerBuilder.java
index bad8f61..6e9b6c9 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/batch/EvalConsumerBuilder.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/batch/EvalConsumerBuilder.java
@@ -174,6 +174,27 @@ public abstract class EvalConsumerBuilder {
return new ExtractReader(alterExtractList, minExtractLength, maxExtractLength);
}
+ FileResourceConsumer parameterizeProfiler(AbstractProfiler abstractProfiler) {
+
+ int maxContentLength = PropsUtil.getInt(localAttrs.get("maxContentLength"), -1);
+ if (maxContentLength > -1) {
+ abstractProfiler.setMaxContentLength(maxContentLength);
+ }
+
+ int maxContentLengthForLangId = PropsUtil.getInt(localAttrs.get("maxContentLengthForLangId"), -1);
+ if (maxContentLengthForLangId > -1) {
+ abstractProfiler.setMaxContentLengthForLangId(maxContentLengthForLangId);
+ }
+
+ int maxTokens = PropsUtil.getInt(localAttrs.get("maxTokens"), -1);
+ if (maxTokens > -1) {
+ abstractProfiler.setMaxTokens(maxTokens);
+ }
+
+
+ return abstractProfiler;
+ }
+
/*
public abstract Map<String, String> getIndexInfo();
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/batch/ExtractComparerBuilder.java b/tika-eval/src/main/java/org/apache/tika/eval/batch/ExtractComparerBuilder.java
index b9c5ee3..3cd428a 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/batch/ExtractComparerBuilder.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/batch/ExtractComparerBuilder.java
@@ -90,9 +90,9 @@ public class ExtractComparerBuilder extends EvalConsumerBuilder {
throw new RuntimeException("Must specify an -inputDir");
}
- return new ExtractComparer(queue, inputRootDir, extractsA, extractsB,
+ return parameterizeProfiler(new ExtractComparer(queue, inputRootDir, extractsA, extractsB,
buildExtractReader(localAttrs),
- getDBWriter(getNonRefTableInfos()));
+ getDBWriter(getNonRefTableInfos())));
}
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/batch/ExtractProfilerBuilder.java b/tika-eval/src/main/java/org/apache/tika/eval/batch/ExtractProfilerBuilder.java
index f89eeb0..11310ee 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/batch/ExtractProfilerBuilder.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/batch/ExtractProfilerBuilder.java
@@ -78,11 +78,12 @@ public class ExtractProfilerBuilder extends EvalConsumerBuilder {
if (extracts == null && inputDir != null) {
extracts = inputDir;
}
- return new ExtractProfiler(queue, inputDir, extracts,
+ return parameterizeProfiler(new ExtractProfiler(queue, inputDir, extracts,
buildExtractReader(localAttrs),
- getDBWriter(tableInfos));
+ getDBWriter(tableInfos)));
}
+
@Override
protected void updateTableInfosWithPrefixes(Map<String, String> attrs) {
String tableNamePrefix = attrs.get(TABLE_PREFIX_KEY);
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/db/Cols.java b/tika-eval/src/main/java/org/apache/tika/eval/db/Cols.java
index bf8784b..91917ec 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/db/Cols.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/db/Cols.java
@@ -57,6 +57,7 @@ public enum Cols {
TOKEN_LENGTH_STD_DEV,
UNICODE_CHAR_BLOCKS,
NUM_PAGES, //number of pages a document alleges it has
+ CONTENT_TRUNCATED_AT_MAX_LEN, // was the content string truncated at AbstractProfiler's maxContentLength
//content comparisons
TOP_10_UNIQUE_TOKEN_DIFFS_A,
@@ -86,5 +87,5 @@ public enum Cols {
DIR_NAME_A,//for comparisons in REF_PAIR_NAMES
DIR_NAME_B
- }
+}
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/reports/Report.java b/tika-eval/src/main/java/org/apache/tika/eval/reports/Report.java
index 3683a71..8ac7fca 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/reports/Report.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/reports/Report.java
@@ -173,6 +173,7 @@ public class Report {
}
break;
//fall through strings
+ case Types.BOOLEAN:
case Types.CHAR:
case Types.VARCHAR:
case Types.LONGNVARCHAR:
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/tokens/AnalyzerDeserializer.java b/tika-eval/src/main/java/org/apache/tika/eval/tokens/AnalyzerDeserializer.java
index 83ca557..2389309 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/tokens/AnalyzerDeserializer.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/tokens/AnalyzerDeserializer.java
@@ -35,6 +35,7 @@ import com.google.gson.JsonParseException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilterFactory;
import org.apache.lucene.analysis.util.CharFilterFactory;
import org.apache.lucene.analysis.util.ClasspathResourceLoader;
import org.apache.lucene.analysis.util.ResourceLoaderAware;
@@ -52,6 +53,12 @@ class AnalyzerDeserializer implements JsonDeserializer<Map<String, Analyzer>> {
private static final String PARAMS = "params";
private static final String COMMENT = "_comment";
+ private final int maxTokens;
+
+ AnalyzerDeserializer(int maxTokens) {
+ this.maxTokens = maxTokens;
+ }
+
@Override
public Map<String, Analyzer> deserialize(JsonElement element, Type type,
JsonDeserializationContext jsonDeserializationContext) throws JsonParseException {
@@ -64,14 +71,14 @@ class AnalyzerDeserializer implements JsonDeserializer<Map<String, Analyzer>> {
throw new IllegalArgumentException("Expecting top level 'analyzers:{}");
}
try {
- return buildAnalyzers(root);
+ return buildAnalyzers(root, maxTokens);
} catch (IOException e) {
throw new RuntimeException(e);
}
}
- public static Map<String, Analyzer> buildAnalyzers(JsonElement value) throws IOException {
+ public static Map<String, Analyzer> buildAnalyzers(JsonElement value, int maxTokens) throws IOException {
if (! value.isJsonObject()) {
throw new IllegalArgumentException("Expecting map with analyzer names/analyzer definitions");
}
@@ -79,13 +86,13 @@ class AnalyzerDeserializer implements JsonDeserializer<Map<String, Analyzer>> {
JsonObject root = (JsonObject)value;
for (Map.Entry<String, JsonElement> e : root.entrySet()) {
String analyzerName = e.getKey();
- Analyzer analyzer = buildAnalyzer(analyzerName, e.getValue());
+ Analyzer analyzer = buildAnalyzer(analyzerName, e.getValue(), maxTokens);
analyzers.put(analyzerName, analyzer);
}
return analyzers;
}
- public static Analyzer buildAnalyzer(String analyzerName, JsonElement value) throws IOException {
+ public static Analyzer buildAnalyzer(String analyzerName, JsonElement value, int maxTokens) throws IOException {
if (! value.isJsonObject()) {
throw new IllegalArgumentException("Expecting map of charfilter, tokenizer, tokenfilters");
}
@@ -98,7 +105,7 @@ class AnalyzerDeserializer implements JsonDeserializer<Map<String, Analyzer>> {
if (k.equals(CHAR_FILTERS)) {
charFilters = buildCharFilters(e.getValue(), analyzerName);
} else if (k.equals(TOKEN_FILTERS)) {
- tokenFilterFactories = buildTokenFilterFactories(e.getValue(), analyzerName);
+ tokenFilterFactories = buildTokenFilterFactories(e.getValue(), analyzerName, maxTokens);
} else if (k.equals(TOKENIZER)) {
tokenizerFactory = buildTokenizerFactory(e.getValue(), analyzerName);
} else if (! k.equals(COMMENT)) {
@@ -212,7 +219,7 @@ class AnalyzerDeserializer implements JsonDeserializer<Map<String, Analyzer>> {
}
private static TokenFilterFactory[] buildTokenFilterFactories(JsonElement el,
- String analyzerName) throws IOException {
+ String analyzerName, int maxTokens) throws IOException {
if (el == null || el.isJsonNull()) {
return null;
}
@@ -261,6 +268,13 @@ class AnalyzerDeserializer implements JsonDeserializer<Map<String, Analyzer>> {
throw new IllegalArgumentException("While loading "+analyzerName, e);
}
}
+
+ if (maxTokens > -1) {
+ Map<String, String> m = new HashMap<>();
+ m.put("maxTokenCount", Integer.toString(maxTokens));
+ ret.add(new LimitTokenCountFilterFactory(m));
+ }
+
if (ret.size() == 0) {
return new TokenFilterFactory[0];
}
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/tokens/AnalyzerManager.java b/tika-eval/src/main/java/org/apache/tika/eval/tokens/AnalyzerManager.java
index 903b130..c5aa831 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/tokens/AnalyzerManager.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/tokens/AnalyzerManager.java
@@ -42,11 +42,11 @@ public class AnalyzerManager {
this.commonTokensAnalyzer = commonTokensAnalyzer;
}
- public static AnalyzerManager newInstance() throws IOException {
+ public static AnalyzerManager newInstance(int maxTokens) throws IOException {
InputStream is = AnalyzerManager.class.getClassLoader().getResourceAsStream("lucene-analyzers.json");
Reader reader = new InputStreamReader(is, StandardCharsets.UTF_8);
GsonBuilder builder = new GsonBuilder();
- builder.registerTypeHierarchyAdapter(Map.class, new AnalyzerDeserializer());
+ builder.registerTypeHierarchyAdapter(Map.class, new AnalyzerDeserializer(maxTokens));
Gson gson = builder.create();
Map<String, Analyzer> map = gson.fromJson(reader, Map.class);
Analyzer general = map.get(GENERAL);
@@ -59,7 +59,7 @@ public class AnalyzerManager {
throw new JsonParseException("Must specify "+ COMMON_TOKENS + " analyzer");
}
- return new AnalyzerManager(general,common);
+ return new AnalyzerManager(general, common);
}
/**
diff --git a/tika-eval/src/main/resources/lucene-analyzers.json b/tika-eval/src/main/resources/lucene-analyzers.json
index fd02fa7..aa24b79 100644
--- a/tika-eval/src/main/resources/lucene-analyzers.json
+++ b/tika-eval/src/main/resources/lucene-analyzers.json
@@ -23,13 +23,6 @@
"params": {
"outputUnigrams": "false"
}
- },
- {
- "factory": "oala.miscellaneous.LimitTokenCountFilterFactory",
- "params": {
- "maxTokenCount": "1000000",
- "consumeAllTokens": "false"
- }
}
]
},
diff --git a/tika-eval/src/main/resources/tika-eval-comparison-config.xml b/tika-eval/src/main/resources/tika-eval-comparison-config.xml
index 298c864..887a3e7 100644
--- a/tika-eval/src/main/resources/tika-eval-comparison-config.xml
+++ b/tika-eval/src/main/resources/tika-eval-comparison-config.xml
@@ -59,6 +59,12 @@
description="EXPERT: prefix for table names for B"/>
<option opt="drop" hasArg="false" description="drop tables if they exist"/>
<option opt="maxFilesToAdd" hasArg="true" description="maximum number of files to add to the crawler"/>
+ <option opt="maxTokens" hasArg="true" description="maximum tokens to process, default=200000"/>
+ <option opt="maxContentLength" hasArg="true"
+ description="truncate content beyond this length for calculating 'contents' stats, default=10000000"/>
+ <option opt="maxContentLengthForLangId" hasArg="true"
+ description="truncate content beyond this length for language id, default=50000"/>
+
</commandline>
diff --git a/tika-eval/src/main/resources/tika-eval-profiler-config.xml b/tika-eval/src/main/resources/tika-eval-profiler-config.xml
index 9da2aeb..a7e6d03 100644
--- a/tika-eval/src/main/resources/tika-eval-profiler-config.xml
+++ b/tika-eval/src/main/resources/tika-eval-profiler-config.xml
@@ -54,6 +54,11 @@
description="EXPERT: prefix for table names"/>
<option opt="drop" hasArg="false" description="drop tables if they exist"/>
<option opt="maxFilesToAdd" hasArg="true" description="maximum number of files to add to the crawler"/>
+ <option opt="maxTokens" hasArg="true" description="maximum tokens to process, default=200000"/>
+ <option opt="maxContentLength" hasArg="true"
+ description="truncate content beyond this length for calculating 'contents' stats, default=10000000"/>
+ <option opt="maxContentLengthForLangId" hasArg="true"
+ description="truncate content beyond this length for language id, default=50000"/>
diff --git a/tika-eval/src/test/java/org/apache/tika/eval/AnalyzerManagerTest.java b/tika-eval/src/test/java/org/apache/tika/eval/AnalyzerManagerTest.java
index 7b27b5d..9caacd7 100644
--- a/tika-eval/src/test/java/org/apache/tika/eval/AnalyzerManagerTest.java
+++ b/tika-eval/src/test/java/org/apache/tika/eval/AnalyzerManagerTest.java
@@ -36,7 +36,7 @@ public class AnalyzerManagerTest {
@Test
public void testGeneral() throws Exception {
- AnalyzerManager analyzerManager = AnalyzerManager.newInstance();
+ AnalyzerManager analyzerManager = AnalyzerManager.newInstance(100000);
Analyzer general = analyzerManager.getGeneralAnalyzer();
TokenStream ts = general.tokenStream("f", "tHe quick aaaa aaa anD dirty dog");
ts.reset();
@@ -57,7 +57,7 @@ public class AnalyzerManagerTest {
@Test
public void testCommon() throws Exception {
- AnalyzerManager analyzerManager = AnalyzerManager.newInstance();
+ AnalyzerManager analyzerManager = AnalyzerManager.newInstance(100000);
Analyzer common = analyzerManager.getCommonTokensAnalyzer();
TokenStream ts = common.tokenStream("f", "the 5,000.12 and dirty dog");
ts.reset();
@@ -80,7 +80,7 @@ public class AnalyzerManagerTest {
@Test
public void testTokenCountFilter() throws Exception {
- AnalyzerManager analyzerManager = AnalyzerManager.newInstance();
+ AnalyzerManager analyzerManager = AnalyzerManager.newInstance(1000000);
StringBuilder sb = new StringBuilder();
for (int i = 0; i < 1001000; i++) {
sb.append("the ");
@@ -88,7 +88,6 @@ public class AnalyzerManagerTest {
TokenStream ts = analyzerManager.getGeneralAnalyzer().tokenStream("f", sb.toString());
ts.reset();
CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
- Set<String> seen = new HashSet<>();
int tokens = 0;
while (ts.incrementToken()) {
tokens++;
diff --git a/tika-eval/src/test/java/org/apache/tika/eval/SimpleComparerTest.java b/tika-eval/src/test/java/org/apache/tika/eval/SimpleComparerTest.java
index 85e91dd..761f961 100644
--- a/tika-eval/src/test/java/org/apache/tika/eval/SimpleComparerTest.java
+++ b/tika-eval/src/test/java/org/apache/tika/eval/SimpleComparerTest.java
@@ -26,6 +26,7 @@ import static org.junit.Assert.assertTrue;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
+import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.SortedSet;
@@ -160,20 +161,24 @@ public class SimpleComparerTest extends TikaTest {
public void testGetContent() throws Exception {
Metadata m = new Metadata();
m.add(RecursiveParserWrapper.TIKA_CONTENT, "0123456789");
-
- String content = getContent(m, 10);
+ Map<Cols, String> data = new HashMap<>();
+ String content = getContent(m, 10, data);
assertEquals(10, content.length());
+ assertEquals("FALSE", data.get(Cols.CONTENT_TRUNCATED_AT_MAX_LEN));
- content = getContent(m, 4);
+ content = getContent(m, 4, data);
assertEquals(4, content.length());
+ assertEquals("TRUE", data.get(Cols.CONTENT_TRUNCATED_AT_MAX_LEN));
//test Metadata with no content
- content = getContent(new Metadata(), 10);
+ content = getContent(new Metadata(), 10, data);
assertEquals(0, content.length());
+ assertEquals("FALSE", data.get(Cols.CONTENT_TRUNCATED_AT_MAX_LEN));
//test null Metadata
- content = getContent(null, 10);
+ content = getContent(null, 10, data);
assertEquals(0, content.length());
+ assertEquals("FALSE", data.get(Cols.CONTENT_TRUNCATED_AT_MAX_LEN));
}
@Test
@@ -288,4 +293,35 @@ public class SimpleComparerTest extends TikaTest {
System.out.println(key + " : " + row.get(key));
}
}
+
+ @Test
+ @Ignore("useful for testing 2 files not in test set")
+ public void oneOff() throws Exception {
+ Path p1 = Paths.get("");
+ Path p2 = Paths.get("");
+
+ EvalFilePaths fpsA = new EvalFilePaths(
+ Paths.get("file1.pdf.json"),
+ p1
+ );
+ EvalFilePaths fpsB = new EvalFilePaths(
+ Paths.get("file1.pdf.json"),
+ p2
+ );
+ comparer.compareFiles(fpsA, fpsB);
+ for (TableInfo t : new TableInfo[]{
+ ExtractComparer.COMPARISON_CONTAINERS,
+ ExtractComparer.EXTRACT_EXCEPTION_TABLE_A,
+ ExtractComparer.EXTRACT_EXCEPTION_TABLE_B,
+ ExtractComparer.EXCEPTION_TABLE_A,
+ ExtractComparer.EXCEPTION_TABLE_B,
+ ExtractComparer.PROFILES_A,
+ ExtractComparer.PROFILES_B,
+ ExtractComparer.CONTENTS_TABLE_A,
+ ExtractComparer.CONTENTS_TABLE_B,
+ ExtractComparer.CONTENT_COMPARISONS}) {
+ debugPrintTable(t);
+ }
+
+ }
}
diff --git a/tika-eval/src/test/java/org/apache/tika/eval/TikaEvalCLITest.java b/tika-eval/src/test/java/org/apache/tika/eval/TikaEvalCLITest.java
index 5274dd4..288f042 100644
--- a/tika-eval/src/test/java/org/apache/tika/eval/TikaEvalCLITest.java
+++ b/tika-eval/src/test/java/org/apache/tika/eval/TikaEvalCLITest.java
@@ -58,7 +58,7 @@ public class TikaEvalCLITest extends TikaTest {
compareDBDir = Files.createTempDirectory("tika-eval-cli-compare-db-");
profileDBDir = Files.createTempDirectory("tika-eval-cli-profile-db-");
compareReportsDir = Files.createTempDirectory("tika-eval-cli-compare-reports-");
- profileReportsDir = Files.createTempDirectory("tika-eval-cli-compare-reports-");
+ profileReportsDir = Files.createTempDirectory("tika-eval-cli-profile-reports-");
compare();
profile();
reportCompare();
@@ -115,7 +115,7 @@ public class TikaEvalCLITest extends TikaTest {
cnt++;
}
}
- assertTrue(cnt > 5);
+ assertTrue(cnt > 33);
}
@@ -127,6 +127,14 @@ public class TikaEvalCLITest extends TikaTest {
args.add(extractsDir.resolve("extractsA").toAbsolutePath().toString());
args.add("-extractsB");
args.add(extractsDir.resolve("extractsB").toAbsolutePath().toString());
+ //add these just to confirm this info doesn't cause problems w cli
+ args.add("-maxTokens");
+ args.add("10000000");
+ args.add("-maxContentLength");
+ args.add("100000000");
+ args.add("-maxContentLengthForLangId");
+ args.add("100000");
+
args.add("-db");
args.add(compareDBDir.toAbsolutePath().toString()+"/"+dbName);
@@ -139,6 +147,14 @@ public class TikaEvalCLITest extends TikaTest {
args.add("Profile");
args.add("-extracts");
args.add(extractsDir.resolve("extractsA").toAbsolutePath().toString());
+ //add these just to confirm this info doesn't cause problems w cli
+ args.add("-maxTokens");
+ args.add("10000000");
+ args.add("-maxContentLength");
+ args.add("100000000");
+ args.add("-maxContentLengthForLangId");
+ args.add("100000");
+
args.add("-db");
args.add(profileDBDir.toAbsolutePath().toString()+"/"+dbName);
execute(args, 60000);
diff --git a/tika-eval/src/test/java/org/apache/tika/eval/tokens/TokenCounterTest.java b/tika-eval/src/test/java/org/apache/tika/eval/tokens/TokenCounterTest.java
index 40abdaa..9c6325d 100644
--- a/tika-eval/src/test/java/org/apache/tika/eval/tokens/TokenCounterTest.java
+++ b/tika-eval/src/test/java/org/apache/tika/eval/tokens/TokenCounterTest.java
@@ -41,7 +41,7 @@ public class TokenCounterTest {
@BeforeClass
public static void setUp() throws IOException {
- analyzerManager = AnalyzerManager.newInstance();
+ analyzerManager = AnalyzerManager.newInstance(100000);
}
--
To stop receiving notification emails like this one, please contact
"commits@tika.apache.org" <co...@tika.apache.org>.