You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2017/04/06 15:58:12 UTC

[tika] 02/03: TIKA-2317 -- warn when content string is truncated, allow easier parameterization of other limits via commandline.

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 246133a2d4ba6980217e04efabacef652a4a460c
Author: tballison <ta...@mitre.org>
AuthorDate: Thu Apr 6 11:56:45 2017 -0400

    TIKA-2317 -- warn when content string is truncated, allow easier
    parameterization of other limits via commandline.
---
 .../org/apache/tika/eval/AbstractProfiler.java     | 86 +++++++++++++++++-----
 .../java/org/apache/tika/eval/ExtractComparer.java |  6 ++
 .../java/org/apache/tika/eval/ExtractProfiler.java |  6 +-
 .../tika/eval/batch/EvalConsumerBuilder.java       | 21 ++++++
 .../tika/eval/batch/ExtractComparerBuilder.java    |  4 +-
 .../tika/eval/batch/ExtractProfilerBuilder.java    |  5 +-
 .../main/java/org/apache/tika/eval/db/Cols.java    |  3 +-
 .../java/org/apache/tika/eval/reports/Report.java  |  1 +
 .../tika/eval/tokens/AnalyzerDeserializer.java     | 26 +++++--
 .../apache/tika/eval/tokens/AnalyzerManager.java   |  6 +-
 tika-eval/src/main/resources/lucene-analyzers.json |  7 --
 .../main/resources/tika-eval-comparison-config.xml |  6 ++
 .../main/resources/tika-eval-profiler-config.xml   |  5 ++
 .../org/apache/tika/eval/AnalyzerManagerTest.java  |  7 +-
 .../org/apache/tika/eval/SimpleComparerTest.java   | 46 ++++++++++--
 .../java/org/apache/tika/eval/TikaEvalCLITest.java | 20 ++++-
 .../apache/tika/eval/tokens/TokenCounterTest.java  |  2 +-
 17 files changed, 204 insertions(+), 53 deletions(-)

diff --git a/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java b/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java
index 4d04f23..67dee85 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java
@@ -108,8 +108,9 @@ public abstract class AbstractProfiler extends FileResourceConsumer {
     private static CommonTokenCountManager commonTokenCountManager;
     private String lastExtractExtension = null;
 
-    final AnalyzerManager analyzerManager;
-    final TokenCounter tokenCounter;
+    AnalyzerManager analyzerManager;
+    TokenCounter tokenCounter;
+
 
     public enum EXCEPTION_TYPE {
         RUNTIME,
@@ -136,9 +137,11 @@ public abstract class AbstractProfiler extends FileResourceConsumer {
     private static Pattern FILE_NAME_CLEANER = Pattern.compile("\\.(json|txt)(\\.(bz2|gz|zip))?$");
 
 
-    final static int FILE_PATH_MAX_LEN = 512;//max len for varchar for file_path
-    final static int MAX_STRING_LENGTH = 1000000;
-    final static int MAX_LEN_FOR_LANG_ID = 20000;
+    final static int FILE_PATH_MAX_LEN = 1024;//max len for varchar for file_path
+    int maxContentLength = 10000000;
+    int maxContentLengthForLangId = 50000;
+    int maxTokens = 200000;
+
 
     //these remove runtime info from the stacktraces so
     //that actual causes can be counted.
@@ -168,14 +171,45 @@ public abstract class AbstractProfiler extends FileResourceConsumer {
         super(fileQueue);
         this.writer = writer;
         langIder = new LanguageIDWrapper();
+        initAnalyzersAndTokenCounter(maxTokens);
+    }
+
+    private void initAnalyzersAndTokenCounter(int maxTokens) {
         try {
-            analyzerManager = AnalyzerManager.newInstance();
+            analyzerManager = AnalyzerManager.newInstance(maxTokens);
             tokenCounter = new TokenCounter(analyzerManager.getGeneralAnalyzer());
         } catch (IOException e) {
             throw new RuntimeException(e);
         }
+
+    }
+
+    /**
+     * Truncate the content string to {@code maxContentLength} characters if it is longer.
+     * @param maxContentLength
+     */
+    public void setMaxContentLength(int maxContentLength) {
+        this.maxContentLength = maxContentLength;
     }
 
+    /**
+     * Truncate the content string to {@code maxContentLengthForLangId} characters for language id if it is longer.
+     *
+     * @param maxContentLengthForLangId
+     */
+    public void setMaxContentLengthForLangId(int maxContentLengthForLangId) {
+        this.maxContentLengthForLangId = maxContentLengthForLangId;
+    }
+
+    /**
+     * Add a LimitTokenCountFilterFactory if maxTokens > -1
+     *
+     * @param maxTokens
+     */
+    public void setMaxTokens(int maxTokens) {
+        this.maxTokens = maxTokens;
+        initAnalyzersAndTokenCounter(maxTokens);
+    }
 
 
     protected void writeExtractException(TableInfo extractExceptionTable, String containerId,
@@ -233,7 +267,7 @@ public abstract class AbstractProfiler extends FileResourceConsumer {
         data.put(Cols.ELAPSED_TIME_MILLIS,
                 getTime(m));
 
-        String content = getContent(m, MAX_STRING_LENGTH);
+        String content = getContent(m, maxContentLength);
         if (content == null || content.trim().length() == 0) {
             data.put(Cols.HAS_CONTENT, FALSE);
         } else {
@@ -298,15 +332,14 @@ public abstract class AbstractProfiler extends FileResourceConsumer {
         if (m == null) {
             return;
         }
-
-        String content = getContent(m, MAX_STRING_LENGTH);
+        Map<Cols, String> data = new HashMap<>();
+        String content = getContent(m, maxContentLength, data);
         if (content == null || content.trim().length() == 0) {
             return;
         }
         tokenCounter.clear(fieldName);
         tokenCounter.add(fieldName, content);
 
-        Map<Cols, String> data = new HashMap<>();
         data.put(Cols.ID, fileId);
         data.put(Cols.CONTENT_LENGTH, Integer.toString(content.length()));
         langid(m, data);
@@ -415,6 +448,24 @@ public abstract class AbstractProfiler extends FileResourceConsumer {
         }
     }
 
+    /**
+     * Get the content and record in the data {@link Cols#CONTENT_TRUNCATED_AT_MAX_LEN} whether the string was truncated
+     *
+     * @param metadata
+     * @param maxLength
+     * @param data
+     * @return the content, truncated to {@code maxLength} characters if necessary (never null)
+     */
+    protected static String getContent(Metadata metadata, int maxLength, Map<Cols, String> data) {
+        data.put(Cols.CONTENT_TRUNCATED_AT_MAX_LEN, "FALSE");
+        String c = getContent(metadata, maxLength);
+        if (c.length() > maxLength) {
+            c = c.substring(0, maxLength);
+            data.put(Cols.CONTENT_TRUNCATED_AT_MAX_LEN, "TRUE");
+        }
+        return c;
+
+    }
     protected static String getContent(Metadata metadata, int maxLength) {
         if (metadata == null) {
             return "";
@@ -423,20 +474,17 @@ public abstract class AbstractProfiler extends FileResourceConsumer {
         if (c == null) {
             return "";
         }
-        if (c.length() > maxLength) {
-            c = c.substring(0, maxLength);
-        }
         return c;
     }
 
     void unicodeBlocks(Metadata metadata, Map<Cols, String> data) {
-        String content = getContent(metadata, MAX_LEN_FOR_LANG_ID);
+        String content = getContent(metadata, maxContentLengthForLangId);
         if (content.length() < 200) {
             return;
         }
         String s = content;
-        if (content.length() > MAX_LEN_FOR_LANG_ID) {
-            s = content.substring(0, MAX_LEN_FOR_LANG_ID);
+        if (content.length() > maxContentLengthForLangId) {
+            s = content.substring(0, maxContentLengthForLangId);
         }
         Map<String, Integer> m = new HashMap<>();
         Reader r = new StringReader(s);
@@ -483,13 +531,13 @@ public abstract class AbstractProfiler extends FileResourceConsumer {
     }
 
     void langid(Metadata metadata, Map<Cols, String> data) {
-        String content = getContent(metadata, MAX_LEN_FOR_LANG_ID);
+        String content = getContent(metadata, maxContentLengthForLangId);
         if (content.length() < 50) {
             return;
         }
         String s = content;
-        if (content.length() > MAX_LEN_FOR_LANG_ID) {
-            s = content.substring(0, MAX_LEN_FOR_LANG_ID);
+        if (content.length() > maxContentLengthForLangId) {
+            s = content.substring(0, maxContentLengthForLangId);
         }
         List<DetectedLanguage> probabilities = langIder.getProbabilities(s);
         if (probabilities.size() > 0) {
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/ExtractComparer.java b/tika-eval/src/main/java/org/apache/tika/eval/ExtractComparer.java
index a50b710..9caef9f 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/ExtractComparer.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/ExtractComparer.java
@@ -80,6 +80,12 @@ public class ExtractComparer extends AbstractProfiler {
                 .addOption("tablePrefixB", true, "EXPERT: optional prefix for table names for B")
                 .addOption("drop", true, "drop tables if they exist")
                 .addOption("maxFilesToAdd", true, "maximum number of files to add to the crawler")
+                .addOption("maxContentLength", true, "truncate content beyond this length for calculating 'contents' stats")
+                .addOption("maxContentLengthForLangId", true, "truncate content beyond this length for language id")
+                .addOption("maxTokens", true, "maximum tokens to process, default=200000")
+                .addOption("maxContentLength", true, "truncate content beyond this length for calculating 'contents' stats, default=1000000")
+                .addOption("maxContentLengthForLangId", true, "truncate content beyond this length for language id, default=50000")
+
         ;
     }
 
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/ExtractProfiler.java b/tika-eval/src/main/java/org/apache/tika/eval/ExtractProfiler.java
index 1f9bfda..9b7ddc4 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/ExtractProfiler.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/ExtractProfiler.java
@@ -68,6 +68,9 @@ public class ExtractProfiler extends AbstractProfiler {
                 .addOption("tablePrefix", true, "EXPERT: optional prefix for table names")
                 .addOption("drop", true, "drop tables if they exist")
                 .addOption("maxFilesToAdd", true, "maximum number of files to add to the crawler")
+                .addOption("maxTokens", true, "maximum tokens to process, default=200000")
+                .addOption("maxContentLength", true, "truncate content beyond this length for calculating 'contents' stats, default=1000000")
+                .addOption("maxContentLengthForLangId", true, "truncate content beyond this length for language id, default=50000")
 
         ;
 
@@ -145,7 +148,8 @@ public class ExtractProfiler extends AbstractProfiler {
             new ColInfo(Cols.TOKEN_ENTROPY_RATE, Types.FLOAT),
             new ColInfo(Cols.TOKEN_LENGTH_SUM, Types.INTEGER),
             new ColInfo(Cols.TOKEN_LENGTH_MEAN, Types.FLOAT),
-            new ColInfo(Cols.TOKEN_LENGTH_STD_DEV, Types.FLOAT)
+            new ColInfo(Cols.TOKEN_LENGTH_STD_DEV, Types.FLOAT),
+            new ColInfo(Cols.CONTENT_TRUNCATED_AT_MAX_LEN, Types.BOOLEAN)
     );
 
     private final Path inputDir;
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/batch/EvalConsumerBuilder.java b/tika-eval/src/main/java/org/apache/tika/eval/batch/EvalConsumerBuilder.java
index bad8f61..6e9b6c9 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/batch/EvalConsumerBuilder.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/batch/EvalConsumerBuilder.java
@@ -174,6 +174,27 @@ public abstract class EvalConsumerBuilder {
         return new ExtractReader(alterExtractList, minExtractLength, maxExtractLength);
     }
 
+    FileResourceConsumer parameterizeProfiler(AbstractProfiler abstractProfiler) {
+
+        int maxContentLength = PropsUtil.getInt(localAttrs.get("maxContentLength"), -1);
+        if (maxContentLength > -1) {
+            abstractProfiler.setMaxContentLength(maxContentLength);
+        }
+
+        int maxContentLengthForLangId = PropsUtil.getInt(localAttrs.get("maxContentLengthForLangId"), -1);
+        if (maxContentLengthForLangId > -1) {
+            abstractProfiler.setMaxContentLengthForLangId(maxContentLengthForLangId);
+        }
+
+        int maxTokens = PropsUtil.getInt(localAttrs.get("maxTokens"), -1);
+        if (maxTokens > -1) {
+            abstractProfiler.setMaxTokens(maxTokens);
+        }
+
+
+        return abstractProfiler;
+    }
+
 
 /*
     public abstract Map<String, String> getIndexInfo();
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/batch/ExtractComparerBuilder.java b/tika-eval/src/main/java/org/apache/tika/eval/batch/ExtractComparerBuilder.java
index b9c5ee3..3cd428a 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/batch/ExtractComparerBuilder.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/batch/ExtractComparerBuilder.java
@@ -90,9 +90,9 @@ public class ExtractComparerBuilder extends EvalConsumerBuilder {
             throw new RuntimeException("Must specify an -inputDir");
         }
 
-        return new ExtractComparer(queue, inputRootDir, extractsA, extractsB,
+        return parameterizeProfiler(new ExtractComparer(queue, inputRootDir, extractsA, extractsB,
                 buildExtractReader(localAttrs),
-                getDBWriter(getNonRefTableInfos()));
+                getDBWriter(getNonRefTableInfos())));
     }
 
 
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/batch/ExtractProfilerBuilder.java b/tika-eval/src/main/java/org/apache/tika/eval/batch/ExtractProfilerBuilder.java
index f89eeb0..11310ee 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/batch/ExtractProfilerBuilder.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/batch/ExtractProfilerBuilder.java
@@ -78,11 +78,12 @@ public class ExtractProfilerBuilder extends EvalConsumerBuilder {
         if (extracts == null && inputDir != null) {
             extracts = inputDir;
         }
-        return new ExtractProfiler(queue, inputDir, extracts,
+        return parameterizeProfiler(new ExtractProfiler(queue, inputDir, extracts,
                 buildExtractReader(localAttrs),
-                getDBWriter(tableInfos));
+                getDBWriter(tableInfos)));
     }
 
+
     @Override
     protected void updateTableInfosWithPrefixes(Map<String, String> attrs) {
         String tableNamePrefix = attrs.get(TABLE_PREFIX_KEY);
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/db/Cols.java b/tika-eval/src/main/java/org/apache/tika/eval/db/Cols.java
index bf8784b..91917ec 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/db/Cols.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/db/Cols.java
@@ -57,6 +57,7 @@ public enum Cols {
     TOKEN_LENGTH_STD_DEV,
     UNICODE_CHAR_BLOCKS,
     NUM_PAGES, //number of pages a document alleges it has
+    CONTENT_TRUNCATED_AT_MAX_LEN, // was the content string truncated at AbstractProfiler's maxContentLength
 
     //content comparisons
     TOP_10_UNIQUE_TOKEN_DIFFS_A,
@@ -86,5 +87,5 @@ public enum Cols {
 
     DIR_NAME_A,//for comparisons in REF_PAIR_NAMES
     DIR_NAME_B
-    }
+}
 
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/reports/Report.java b/tika-eval/src/main/java/org/apache/tika/eval/reports/Report.java
index 3683a71..8ac7fca 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/reports/Report.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/reports/Report.java
@@ -173,6 +173,7 @@ public class Report {
                 }
                 break;
             //fall through strings
+            case Types.BOOLEAN:
             case Types.CHAR:
             case Types.VARCHAR:
             case Types.LONGNVARCHAR:
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/tokens/AnalyzerDeserializer.java b/tika-eval/src/main/java/org/apache/tika/eval/tokens/AnalyzerDeserializer.java
index 83ca557..2389309 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/tokens/AnalyzerDeserializer.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/tokens/AnalyzerDeserializer.java
@@ -35,6 +35,7 @@ import com.google.gson.JsonParseException;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilterFactory;
 import org.apache.lucene.analysis.util.CharFilterFactory;
 import org.apache.lucene.analysis.util.ClasspathResourceLoader;
 import org.apache.lucene.analysis.util.ResourceLoaderAware;
@@ -52,6 +53,12 @@ class AnalyzerDeserializer implements JsonDeserializer<Map<String, Analyzer>> {
     private static final String PARAMS = "params";
     private static final String COMMENT = "_comment";
 
+    private final int maxTokens;
+
+    AnalyzerDeserializer(int maxTokens) {
+        this.maxTokens = maxTokens;
+    }
+
     @Override
     public Map<String, Analyzer> deserialize(JsonElement element, Type type,
                                              JsonDeserializationContext jsonDeserializationContext) throws JsonParseException {
@@ -64,14 +71,14 @@ class AnalyzerDeserializer implements JsonDeserializer<Map<String, Analyzer>> {
             throw new IllegalArgumentException("Expecting top level 'analyzers:{}");
         }
         try {
-            return buildAnalyzers(root);
+            return buildAnalyzers(root, maxTokens);
         } catch (IOException e) {
             throw new RuntimeException(e);
         }
 
     }
 
-    public static Map<String, Analyzer> buildAnalyzers(JsonElement value) throws IOException {
+    public static Map<String, Analyzer> buildAnalyzers(JsonElement value, int maxTokens) throws IOException {
         if (! value.isJsonObject()) {
             throw new IllegalArgumentException("Expecting map with analyzer names/analyzer definitions");
         }
@@ -79,13 +86,13 @@ class AnalyzerDeserializer implements JsonDeserializer<Map<String, Analyzer>> {
         JsonObject root = (JsonObject)value;
         for (Map.Entry<String, JsonElement> e : root.entrySet()) {
             String analyzerName = e.getKey();
-            Analyzer analyzer = buildAnalyzer(analyzerName, e.getValue());
+            Analyzer analyzer = buildAnalyzer(analyzerName, e.getValue(), maxTokens);
             analyzers.put(analyzerName, analyzer);
         }
         return analyzers;
     }
 
-    public static Analyzer buildAnalyzer(String analyzerName, JsonElement value) throws IOException {
+    public static Analyzer buildAnalyzer(String analyzerName, JsonElement value, int maxTokens) throws IOException {
         if (! value.isJsonObject()) {
             throw new IllegalArgumentException("Expecting map of charfilter, tokenizer, tokenfilters");
         }
@@ -98,7 +105,7 @@ class AnalyzerDeserializer implements JsonDeserializer<Map<String, Analyzer>> {
             if (k.equals(CHAR_FILTERS)) {
                 charFilters = buildCharFilters(e.getValue(), analyzerName);
             } else if (k.equals(TOKEN_FILTERS)) {
-                tokenFilterFactories = buildTokenFilterFactories(e.getValue(), analyzerName);
+                tokenFilterFactories = buildTokenFilterFactories(e.getValue(), analyzerName, maxTokens);
             } else if (k.equals(TOKENIZER)) {
                 tokenizerFactory = buildTokenizerFactory(e.getValue(), analyzerName);
             } else if (! k.equals(COMMENT)) {
@@ -212,7 +219,7 @@ class AnalyzerDeserializer implements JsonDeserializer<Map<String, Analyzer>> {
     }
 
     private static TokenFilterFactory[] buildTokenFilterFactories(JsonElement el,
-                                                                  String analyzerName) throws IOException {
+                                                                  String analyzerName, int maxTokens) throws IOException {
         if (el == null || el.isJsonNull()) {
             return null;
         }
@@ -261,6 +268,13 @@ class AnalyzerDeserializer implements JsonDeserializer<Map<String, Analyzer>> {
                 throw new IllegalArgumentException("While loading "+analyzerName, e);
             }
         }
+
+        if (maxTokens > -1) {
+            Map<String, String> m = new HashMap<>();
+            m.put("maxTokenCount", Integer.toString(maxTokens));
+            ret.add(new LimitTokenCountFilterFactory(m));
+        }
+
         if (ret.size() == 0) {
             return new TokenFilterFactory[0];
         }
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/tokens/AnalyzerManager.java b/tika-eval/src/main/java/org/apache/tika/eval/tokens/AnalyzerManager.java
index 903b130..c5aa831 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/tokens/AnalyzerManager.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/tokens/AnalyzerManager.java
@@ -42,11 +42,11 @@ public class AnalyzerManager {
         this.commonTokensAnalyzer = commonTokensAnalyzer;
     }
 
-    public static AnalyzerManager newInstance() throws IOException {
+    public static AnalyzerManager newInstance(int maxTokens) throws IOException {
         InputStream is = AnalyzerManager.class.getClassLoader().getResourceAsStream("lucene-analyzers.json");
         Reader reader = new InputStreamReader(is, StandardCharsets.UTF_8);
         GsonBuilder builder = new GsonBuilder();
-        builder.registerTypeHierarchyAdapter(Map.class, new AnalyzerDeserializer());
+        builder.registerTypeHierarchyAdapter(Map.class, new AnalyzerDeserializer(maxTokens));
         Gson gson = builder.create();
         Map<String, Analyzer> map = gson.fromJson(reader, Map.class);
         Analyzer general = map.get(GENERAL);
@@ -59,7 +59,7 @@ public class AnalyzerManager {
             throw new JsonParseException("Must specify "+ COMMON_TOKENS + " analyzer");
         }
 
-        return new AnalyzerManager(general,common);
+        return new AnalyzerManager(general, common);
     }
 
     /**
diff --git a/tika-eval/src/main/resources/lucene-analyzers.json b/tika-eval/src/main/resources/lucene-analyzers.json
index fd02fa7..aa24b79 100644
--- a/tika-eval/src/main/resources/lucene-analyzers.json
+++ b/tika-eval/src/main/resources/lucene-analyzers.json
@@ -23,13 +23,6 @@
           "params": {
             "outputUnigrams": "false"
           }
-        },
-        {
-          "factory": "oala.miscellaneous.LimitTokenCountFilterFactory",
-          "params": {
-            "maxTokenCount": "1000000",
-            "consumeAllTokens": "false"
-          }
         }
       ]
     },
diff --git a/tika-eval/src/main/resources/tika-eval-comparison-config.xml b/tika-eval/src/main/resources/tika-eval-comparison-config.xml
index 298c864..887a3e7 100644
--- a/tika-eval/src/main/resources/tika-eval-comparison-config.xml
+++ b/tika-eval/src/main/resources/tika-eval-comparison-config.xml
@@ -59,6 +59,12 @@
                 description="EXPERT: prefix for table names for B"/>
         <option opt="drop" hasArg="false" description="drop tables if they exist"/>
         <option opt="maxFilesToAdd" hasArg="true" description="maximum number of files to add to the crawler"/>
+        <option opt="maxTokens" hasArg="true" description="maximum tokens to process, default=200000"/>
+        <option opt="maxContentLength" hasArg="true"
+                description="truncate content beyond this length for calculating 'contents' stats, default=1000000"/>
+        <option opt="maxContentLengthForLangId" hasArg="true"
+                description="truncate content beyond this length for language id, default=50000"/>
+
 
     </commandline>
 
diff --git a/tika-eval/src/main/resources/tika-eval-profiler-config.xml b/tika-eval/src/main/resources/tika-eval-profiler-config.xml
index 9da2aeb..a7e6d03 100644
--- a/tika-eval/src/main/resources/tika-eval-profiler-config.xml
+++ b/tika-eval/src/main/resources/tika-eval-profiler-config.xml
@@ -54,6 +54,11 @@
                 description="EXPERT: prefix for table names"/>
         <option opt="drop" hasArg="false" description="drop tables if they exist"/>
         <option opt="maxFilesToAdd" hasArg="true" description="maximum number of files to add to the crawler"/>
+        <option opt="maxTokens" hasArg="true" description="maximum tokens to process, default=200000"/>
+        <option opt="maxContentLength" hasArg="true"
+                description="truncate content beyond this length for calculating 'contents' stats, default=1000000"/>
+        <option opt="maxContentLengthForLangId" hasArg="true"
+                description="truncate content beyond this length for language id, default=50000"/>
 
 
 
diff --git a/tika-eval/src/test/java/org/apache/tika/eval/AnalyzerManagerTest.java b/tika-eval/src/test/java/org/apache/tika/eval/AnalyzerManagerTest.java
index 7b27b5d..9caacd7 100644
--- a/tika-eval/src/test/java/org/apache/tika/eval/AnalyzerManagerTest.java
+++ b/tika-eval/src/test/java/org/apache/tika/eval/AnalyzerManagerTest.java
@@ -36,7 +36,7 @@ public class AnalyzerManagerTest {
 
     @Test
     public void testGeneral() throws Exception {
-        AnalyzerManager analyzerManager = AnalyzerManager.newInstance();
+        AnalyzerManager analyzerManager = AnalyzerManager.newInstance(100000);
         Analyzer general = analyzerManager.getGeneralAnalyzer();
         TokenStream ts = general.tokenStream("f", "tHe quick aaaa aaa anD dirty dog");
         ts.reset();
@@ -57,7 +57,7 @@ public class AnalyzerManagerTest {
 
     @Test
     public void testCommon() throws Exception {
-        AnalyzerManager analyzerManager = AnalyzerManager.newInstance();
+        AnalyzerManager analyzerManager = AnalyzerManager.newInstance(100000);
         Analyzer common = analyzerManager.getCommonTokensAnalyzer();
         TokenStream ts = common.tokenStream("f", "the 5,000.12 and dirty dog");
         ts.reset();
@@ -80,7 +80,7 @@ public class AnalyzerManagerTest {
 
     @Test
     public void testTokenCountFilter() throws Exception {
-        AnalyzerManager analyzerManager = AnalyzerManager.newInstance();
+        AnalyzerManager analyzerManager = AnalyzerManager.newInstance(1000000);
         StringBuilder sb = new StringBuilder();
         for (int i = 0; i < 1001000; i++) {
             sb.append("the ");
@@ -88,7 +88,6 @@ public class AnalyzerManagerTest {
         TokenStream ts = analyzerManager.getGeneralAnalyzer().tokenStream("f", sb.toString());
         ts.reset();
         CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
-        Set<String> seen = new HashSet<>();
         int tokens = 0;
         while (ts.incrementToken()) {
             tokens++;
diff --git a/tika-eval/src/test/java/org/apache/tika/eval/SimpleComparerTest.java b/tika-eval/src/test/java/org/apache/tika/eval/SimpleComparerTest.java
index 85e91dd..761f961 100644
--- a/tika-eval/src/test/java/org/apache/tika/eval/SimpleComparerTest.java
+++ b/tika-eval/src/test/java/org/apache/tika/eval/SimpleComparerTest.java
@@ -26,6 +26,7 @@ import static org.junit.Assert.assertTrue;
 import java.nio.file.Path;
 import java.nio.file.Paths;
 import java.util.ArrayList;
+import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
 import java.util.SortedSet;
@@ -160,20 +161,24 @@ public class SimpleComparerTest extends TikaTest {
     public void testGetContent() throws Exception {
         Metadata m = new Metadata();
         m.add(RecursiveParserWrapper.TIKA_CONTENT, "0123456789");
-
-        String content = getContent(m, 10);
+        Map<Cols, String> data = new HashMap<>();
+        String content = getContent(m, 10, data);
         assertEquals(10, content.length());
+        assertEquals("FALSE", data.get(Cols.CONTENT_TRUNCATED_AT_MAX_LEN));
 
-        content = getContent(m, 4);
+        content = getContent(m, 4, data);
         assertEquals(4, content.length());
+        assertEquals("TRUE", data.get(Cols.CONTENT_TRUNCATED_AT_MAX_LEN));
 
         //test Metadata with no content
-        content = getContent(new Metadata(), 10);
+        content = getContent(new Metadata(), 10, data);
         assertEquals(0, content.length());
+        assertEquals("FALSE", data.get(Cols.CONTENT_TRUNCATED_AT_MAX_LEN));
 
         //test null Metadata
-        content = getContent(null, 10);
+        content = getContent(null, 10, data);
         assertEquals(0, content.length());
+        assertEquals("FALSE", data.get(Cols.CONTENT_TRUNCATED_AT_MAX_LEN));
     }
 
     @Test
@@ -288,4 +293,35 @@ public class SimpleComparerTest extends TikaTest {
             System.out.println(key + " : " + row.get(key));
         }
     }
+
+    @Test
+    @Ignore("useful for testing 2 files not in test set")
+    public void oneOff() throws Exception {
+        Path p1 = Paths.get("");
+        Path p2 = Paths.get("");
+
+        EvalFilePaths fpsA = new EvalFilePaths(
+                Paths.get("file1.pdf.json"),
+                p1
+        );
+        EvalFilePaths fpsB = new EvalFilePaths(
+                Paths.get("file1.pdf.json"),
+                p2
+        );
+        comparer.compareFiles(fpsA, fpsB);
+        for (TableInfo t : new TableInfo[]{
+                ExtractComparer.COMPARISON_CONTAINERS,
+                ExtractComparer.EXTRACT_EXCEPTION_TABLE_A,
+                ExtractComparer.EXTRACT_EXCEPTION_TABLE_B,
+                ExtractComparer.EXCEPTION_TABLE_A,
+                ExtractComparer.EXCEPTION_TABLE_B,
+                ExtractComparer.PROFILES_A,
+                ExtractComparer.PROFILES_B,
+                ExtractComparer.CONTENTS_TABLE_A,
+                ExtractComparer.CONTENTS_TABLE_B,
+                ExtractComparer.CONTENT_COMPARISONS}) {
+            debugPrintTable(t);
+        }
+
+    }
 }
diff --git a/tika-eval/src/test/java/org/apache/tika/eval/TikaEvalCLITest.java b/tika-eval/src/test/java/org/apache/tika/eval/TikaEvalCLITest.java
index 5274dd4..288f042 100644
--- a/tika-eval/src/test/java/org/apache/tika/eval/TikaEvalCLITest.java
+++ b/tika-eval/src/test/java/org/apache/tika/eval/TikaEvalCLITest.java
@@ -58,7 +58,7 @@ public class TikaEvalCLITest extends TikaTest {
         compareDBDir = Files.createTempDirectory("tika-eval-cli-compare-db-");
         profileDBDir = Files.createTempDirectory("tika-eval-cli-profile-db-");
         compareReportsDir = Files.createTempDirectory("tika-eval-cli-compare-reports-");
-        profileReportsDir = Files.createTempDirectory("tika-eval-cli-compare-reports-");
+        profileReportsDir = Files.createTempDirectory("tika-eval-cli-profile-reports-");
         compare();
         profile();
         reportCompare();
@@ -115,7 +115,7 @@ public class TikaEvalCLITest extends TikaTest {
                 cnt++;
             }
         }
-        assertTrue(cnt > 5);
+        assertTrue(cnt > 33);
 
     }
 
@@ -127,6 +127,14 @@ public class TikaEvalCLITest extends TikaTest {
         args.add(extractsDir.resolve("extractsA").toAbsolutePath().toString());
         args.add("-extractsB");
         args.add(extractsDir.resolve("extractsB").toAbsolutePath().toString());
+        //add these just to confirm this info doesn't cause problems w cli
+        args.add("-maxTokens");
+        args.add("10000000");
+        args.add("-maxContentLength");
+        args.add("100000000");
+        args.add("-maxContentLengthForLangId");
+        args.add("100000");
+
         args.add("-db");
         args.add(compareDBDir.toAbsolutePath().toString()+"/"+dbName);
 
@@ -139,6 +147,14 @@ public class TikaEvalCLITest extends TikaTest {
         args.add("Profile");
         args.add("-extracts");
         args.add(extractsDir.resolve("extractsA").toAbsolutePath().toString());
+        //add these just to confirm this info doesn't cause problems w cli
+        args.add("-maxTokens");
+        args.add("10000000");
+        args.add("-maxContentLength");
+        args.add("100000000");
+        args.add("-maxContentLengthForLangId");
+        args.add("100000");
+
         args.add("-db");
         args.add(profileDBDir.toAbsolutePath().toString()+"/"+dbName);
         execute(args, 60000);
diff --git a/tika-eval/src/test/java/org/apache/tika/eval/tokens/TokenCounterTest.java b/tika-eval/src/test/java/org/apache/tika/eval/tokens/TokenCounterTest.java
index 40abdaa..9c6325d 100644
--- a/tika-eval/src/test/java/org/apache/tika/eval/tokens/TokenCounterTest.java
+++ b/tika-eval/src/test/java/org/apache/tika/eval/tokens/TokenCounterTest.java
@@ -41,7 +41,7 @@ public class TokenCounterTest {
 
     @BeforeClass
     public static void setUp() throws IOException {
-        analyzerManager = AnalyzerManager.newInstance();
+        analyzerManager = AnalyzerManager.newInstance(100000);
 
     }
 

-- 
To stop receiving notification emails like this one, please contact
"commits@tika.apache.org" <co...@tika.apache.org>.