Posted to commits@tika.apache.org by ta...@apache.org on 2020/12/02 17:04:59 UTC

[tika] branch main updated: TIKA-3241 -- fix more git add problems

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new 1219866  TIKA-3241 -- fix more git add problems
1219866 is described below

commit 12198666bf0cf6811613c9ae27c4a384e42875b1
Author: tallison <ta...@apache.org>
AuthorDate: Wed Dec 2 12:04:33 2020 -0500

    TIKA-3241 -- fix more git add problems
---
 .../org/apache/tika/eval/app/AbstractProfiler.java |  821 +++++++++
 .../org/apache/tika/eval/app/EvalFilePaths.java    |  118 ++
 .../org/apache/tika/eval/app/ExtractComparer.java  |  538 ++++++
 .../org/apache/tika/eval/app/ExtractProfiler.java  |  281 ++++
 .../org/apache/tika/eval/app/FileProfiler.java     |  195 +++
 .../java/org/apache/tika/eval/app/TikaEvalCLI.java |  326 ++++
 .../apache/tika/eval/app/XMLErrorLogUpdater.java   |  223 +++
 .../tika/eval/app/batch/DBConsumersManager.java    |  103 ++
 .../tika/eval/app/batch/EvalConsumerBuilder.java   |  240 +++
 .../tika/eval/app/batch/EvalConsumersBuilder.java  |  130 ++
 .../eval/app/batch/ExtractComparerBuilder.java     |  165 ++
 .../eval/app/batch/ExtractProfilerBuilder.java     |  122 ++
 .../tika/eval/app/batch/FileProfilerBuilder.java   |   98 ++
 .../apache/tika/eval/app/db/AbstractDBBuffer.java  |   77 +
 .../java/org/apache/tika/eval/app/db/ColInfo.java  |  127 ++
 .../java/org/apache/tika/eval/app/db/Cols.java     |  115 ++
 .../java/org/apache/tika/eval/app/db/DBBuffer.java |   54 +
 .../java/org/apache/tika/eval/app/db/H2Util.java   |   89 +
 .../java/org/apache/tika/eval/app/db/JDBCUtil.java |  316 ++++
 .../org/apache/tika/eval/app/db/MimeBuffer.java    |  147 ++
 .../org/apache/tika/eval/app/db/TableInfo.java     |   73 +
 .../java/org/apache/tika/eval/app/io/DBWriter.java |  162 ++
 .../org/apache/tika/eval/app/io/ExtractReader.java |  243 +++
 .../tika/eval/app/io/ExtractReaderException.java   |   48 +
 .../org/apache/tika/eval/app/io/IDBWriter.java     |   31 +
 .../apache/tika/eval/app/io/XMLLogMsgHandler.java  |   26 +
 .../org/apache/tika/eval/app/io/XMLLogReader.java  |  115 ++
 .../org/apache/tika/eval/app/reports/Report.java   |  198 +++
 .../tika/eval/app/reports/ResultsReporter.java     |  313 ++++
 .../tika/eval/app/reports/XLSXHREFFormatter.java   |   78 +
 .../tika/eval/app/reports/XLSXNumFormatter.java    |   54 +
 .../tika/eval/app/reports/XSLXCellFormatter.java   |   30 +
 .../eval/app/tools/BatchTopCommonTokenCounter.java |   54 +
 .../eval/app/tools/CommonTokenOverlapCounter.java  |   69 +
 .../apache/tika/eval/app/tools/LeipzigHelper.java  |   57 +
 .../apache/tika/eval/app/tools/LeipzigSampler.java |   76 +
 .../eval/app/tools/SlowCompositeReaderWrapper.java |  327 ++++
 .../tika/eval/app/tools/TopCommonTokenCounter.java |  401 +++++
 .../apache/tika/eval/app/tools/TrainTestSplit.java |  114 ++
 .../src/main/resources/comparison-reports-pg.xml   | 1753 ++++++++++++++++++++
 .../src/main/resources/comparison-reports.xml      | 1753 ++++++++++++++++++++
 .../tika-eval-app/src/main/resources/db.properties |   23 +
 .../src/main/resources/profile-reports.xml         |  329 ++++
 .../main/resources/tika-eval-comparison-config.xml |  102 ++
 .../resources/tika-eval-file-profiler-config.xml   |   74 +
 .../main/resources/tika-eval-profiler-config.xml   |   96 ++
 .../apache/tika/eval/app/AnalyzerManagerTest.java  |  100 ++
 .../apache/tika/eval/app/ComparerBatchTest.java    |  399 +++++
 .../org/apache/tika/eval/app/MockDBWriter.java     |   83 +
 .../apache/tika/eval/app/ProfilerBatchTest.java    |  234 +++
 .../apache/tika/eval/app/SimpleComparerTest.java   |  439 +++++
 .../org/apache/tika/eval/app/TikaEvalCLITest.java  |  293 ++++
 .../tika/eval/app/db/AbstractBufferTest.java       |  160 ++
 .../apache/tika/eval/app/io/ExtractReaderTest.java |   86 +
 .../tika/eval/app/io/FatalExceptionReaderTest.java |   33 +
 .../tika/eval/app/reports/ResultsReporterTest.java |   60 +
 .../eval/app/tools/TopCommonTokenCounterTest.java  |   94 ++
 .../src/test/resources/common_tokens/en            |   28 +
 .../src/test/resources/common_tokens/es            |   30 +
 .../src/test/resources/common_tokens/zh-cn         |   28 +
 .../src/test/resources/common_tokens/zh-tw         |   28 +
 .../single-file-profiler-crawl-extract-config.xml  |   76 +
 .../single-file-profiler-crawl-input-config.xml    |   77 +
 .../test-dirs/batch-logs/batch-process-fatal.xml   |   59 +
 .../resources/test-dirs/extractsA/file1.pdf.json   |    5 +
 .../test-dirs/extractsA/file10_permahang.txt.json  |    0
 .../test-dirs/extractsA/file11_oom.txt.json        |    0
 .../test-dirs/extractsA/file12_es.txt.json         |    4 +
 .../extractsA/file13_attachANotB.doc.json          |   11 +
 .../extractsA/file14_diffAttachOrder.json          |   19 +
 .../resources/test-dirs/extractsA/file15_tags.json |   41 +
 .../test-dirs/extractsA/file16_badTags.json        |   41 +
 .../test-dirs/extractsA/file17_tagsOutOfOrder.json |   41 +
 .../test-dirs/extractsA/file2_attachANotB.doc.json |   10 +
 .../test-dirs/extractsA/file3_attachBNotA.doc.json |    4 +
 .../test-dirs/extractsA/file4_emptyB.pdf.json      |    4 +
 .../test-dirs/extractsA/file5_emptyA.pdf.json      |    0
 .../test-dirs/extractsA/file6_accessEx.pdf.json    |    1 +
 .../test-dirs/extractsA/file7_badJson.pdf.json     |    4 +
 .../test-dirs/extractsA/file8_IOEx.pdf.json        |    1 +
 .../resources/test-dirs/extractsB/file1.pdf.json   |    2 +
 .../test-dirs/extractsB/file11_oom.txt.json        |    0
 .../test-dirs/extractsB/file12_es.txt.json         |    4 +
 .../test-dirs/extractsB/file13_attachANotB.doc.txt |    1 +
 .../extractsB/file14_diffAttachOrder.json          |   19 +
 .../resources/test-dirs/extractsB/file15_tags.html |   31 +
 .../test-dirs/extractsB/file16_badTags.html        |   31 +
 .../test-dirs/extractsB/file2_attachANotB.doc.json |    4 +
 .../test-dirs/extractsB/file3_attachBNotA.doc.json |   10 +
 .../test-dirs/extractsB/file4_emptyB.pdf.json      |    0
 .../test-dirs/extractsB/file5_emptyA.pdf.json      |    4 +
 .../test-dirs/extractsB/file6_accessEx.pdf.json    |    1 +
 .../test-dirs/extractsB/file7_badJson.pdf.json     |    0
 .../test-dirs/extractsB/file8_IOEx.pdf.json        |    1 +
 .../test/resources/test-dirs/raw_input/file1.pdf   |   13 +
 .../resources/test-dirs/raw_input/file11_oom.txt   |    2 +
 .../test-dirs/raw_input/file2_attachANotB.doc      |   13 +
 .../test-dirs/raw_input/file3_attachBNotA.doc      |   13 +
 .../resources/test-dirs/raw_input/file4_emptyB.pdf |   13 +
 .../resources/test-dirs/raw_input/file5_emptyA.pdf |   13 +
 .../test-dirs/raw_input/file6_accessEx.pdf         |   13 +
 .../test-dirs/raw_input/file7_badJson.pdf          |   13 +
 .../resources/test-dirs/raw_input/file8_IOEx.pdf   |   13 +
 .../test-dirs/raw_input/file9_noextract.txt        |    1 +
 .../parser/iwork/iwana/IWork13PackageParser.java   |  159 ++
 .../apache/tika/parser/microsoft/EMFParser.java    |  168 ++
 .../tika/parser/microsoft/ExcelExtractor.java      |  729 ++++++++
 .../tika/parser/microsoft/FormattingUtils.java     |  104 ++
 .../tika/parser/microsoft/HSLFExtractor.java       |  559 +++++++
 .../tika/parser/microsoft/JackcessParser.java      |  130 ++
 .../apache/tika/parser/microsoft/LinkedCell.java   |   41 +
 .../tika/parser/microsoft/MSOwnerFileParser.java   |   90 +
 .../apache/tika/parser/microsoft/OfficeParser.java |  346 ++++
 .../tika/parser/microsoft/OldExcelParser.java      |   94 ++
 .../tika/parser/microsoft/WordExtractor.java       |  714 ++++++++
 .../tika/parser/microsoft/chm/ChmAssert.java       |  165 ++
 .../tika/parser/microsoft/chm/ChmCommons.java      |  365 ++++
 .../tika/parser/microsoft/chm/ChmConstants.java    |  102 ++
 .../microsoft/chm/ChmDirectoryListingSet.java      |  409 +++++
 .../tika/parser/microsoft/chm/ChmItsfHeader.java   |  484 ++++++
 .../tika/parser/microsoft/chm/ChmPmgiHeader.java   |  168 ++
 .../tika/parser/microsoft/chm/ChmPmglHeader.java   |  196 +++
 .../tika/parser/microsoft/chm/ChmSection.java      |  221 +++
 .../tika/parser/microsoft/chm/ChmWrapper.java      |  140 ++
 .../tika/parser/microsoft/onenote/CompactID.java   |   60 +
 .../parser/microsoft/onenote/ExtendedGUID.java     |   87 +
 .../microsoft/onenote/FileChunkReference.java      |   99 ++
 .../tika/parser/microsoft/onenote/FileNode.java    |  278 ++++
 .../tika/parser/microsoft/onenote/FileNodePtr.java |   65 +
 .../microsoft/onenote/FndStructureConstants.java   |  166 ++
 .../microsoft/onenote/GlobalIdTableEntry2FNDX.java |   40 +
 .../onenote/ObjectDeclarationWithRefCount.java     |   75 +
 .../onenote/ObjectDeclarationWithRefCountBody.java |   73 +
 .../onenote/ObjectInfoDependencyOverrides.java     |   30 +
 .../onenote/ObjectRevisionWithRefCountFNDX.java    |   70 +
 ...ctSpaceObjectStreamOfOIDsOSIDsOrContextIDs.java |   63 +
 .../onenote/OneNoteDirectFileResource.java         |   87 +
 .../onenote/OneNoteLegacyDumpStrings.java          |  163 ++
 .../parser/microsoft/onenote/OneNoteParser.java    |  176 ++
 .../microsoft/onenote/OneNotePropertyId.java       |   86 +
 .../parser/microsoft/onenote/PropertyValue.java    |  137 ++
 .../tika/parser/microsoft/onenote/Revision.java    |   72 +
 .../onenote/RevisionManifestListStart.java         |   30 +
 .../microsoft/onenote/RevisionRoleDeclaration.java |   30 +
 .../microsoft/onenote/RootObjectReference.java     |   40 +
 .../microsoft/ooxml/OOXMLExtractorFactory.java     |  304 ++++
 .../tika/parser/microsoft/ooxml/OOXMLParser.java   |  115 ++
 .../ooxml/POIXMLTextExtractorDecorator.java        |   49 +
 .../microsoft/ooxml/ParagraphProperties.java       |   56 +
 .../tika/parser/microsoft/ooxml/RunProperties.java |   73 +
 .../ooxml/XSSFBExcelExtractorDecorator.java        |  174 ++
 .../ooxml/XSSFExcelExtractorDecorator.java         |  604 +++++++
 .../microsoft/ooxml/xps/XPSTextExtractor.java      |   67 +
 .../ooxml/xwpf/XWPFEventBasedWordExtractor.java    |  367 ++++
 .../ooxml/xwpf/ml2006/BinaryDataHandler.java       |  120 ++
 .../microsoft/ooxml/xwpf/ml2006/PartHandler.java   |   34 +
 .../microsoft/ooxml/xwpf/ml2006/Relationship.java  |   52 +
 .../ooxml/xwpf/ml2006/Word2006MLDocHandler.java    |  171 ++
 .../tika/parser/microsoft/rtf/ListDescriptor.java  |   35 +
 .../parser/microsoft/rtf/RTFEmbObjHandler.java     |  251 +++
 .../parser/microsoft/rtf/RTFObjDataParser.java     |  342 ++++
 .../microsoft/xml/AbstractXML2003Parser.java       |  136 ++
 .../parser/microsoft/xml/HyperlinkHandler.java     |   96 ++
 .../tika/parser/microsoft/xml/WordMLParser.java    |  309 ++++
 .../services/org.apache.tika.parser.Parser         |   29 +
 165 files changed, 24287 insertions(+)

diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/AbstractProfiler.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/AbstractProfiler.java
new file mode 100644
index 0000000..910fe09
--- /dev/null
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/AbstractProfiler.java
@@ -0,0 +1,821 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.eval.app;
+
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.sql.Types;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+import java.util.concurrent.ArrayBlockingQueue;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.commons.io.FilenameUtils;
+import org.apache.commons.lang3.mutable.MutableInt;
+import org.apache.commons.lang3.tuple.Pair;
+import org.apache.commons.math3.stat.descriptive.SummaryStatistics;
+import org.apache.tika.batch.FileResource;
+import org.apache.tika.batch.FileResourceConsumer;
+import org.apache.tika.batch.fs.FSProperties;
+import org.apache.tika.eval.app.db.ColInfo;
+import org.apache.tika.eval.app.db.Cols;
+import org.apache.tika.eval.app.db.TableInfo;
+import org.apache.tika.eval.app.io.ExtractReaderException;
+import org.apache.tika.eval.app.io.IDBWriter;
+import org.apache.tika.eval.core.langid.LanguageIDWrapper;
+import org.apache.tika.eval.core.textstats.BasicTokenCountStatsCalculator;
+import org.apache.tika.eval.core.textstats.CommonTokens;
+import org.apache.tika.eval.core.textstats.CompositeTextStatsCalculator;
+import org.apache.tika.eval.core.textstats.ContentLengthCalculator;
+import org.apache.tika.eval.core.textstats.TextStatsCalculator;
+import org.apache.tika.eval.core.textstats.TokenEntropy;
+import org.apache.tika.eval.core.textstats.TokenLengths;
+import org.apache.tika.eval.core.textstats.TopNTokens;
+import org.apache.tika.eval.core.textstats.UnicodeBlockCounter;
+import org.apache.tika.eval.core.tokens.AnalyzerManager;
+import org.apache.tika.eval.core.tokens.CommonTokenCountManager;
+import org.apache.tika.eval.core.tokens.CommonTokenResult;
+import org.apache.tika.eval.core.tokens.TokenCounts;
+import org.apache.tika.eval.core.tokens.TokenIntPair;
+import org.apache.tika.eval.core.util.ContentTagParser;
+import org.apache.tika.eval.core.util.ContentTags;
+import org.apache.tika.eval.core.util.EvalExceptionUtils;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.language.detect.LanguageResult;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.PagedText;
+import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler;
+import org.apache.tika.sax.RecursiveParserWrapperHandler;
+import org.apache.tika.sax.ToXMLContentHandler;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.xml.sax.SAXException;
+
+public abstract class AbstractProfiler extends FileResourceConsumer {
+
+    private static final Logger LOG = LoggerFactory.getLogger(AbstractProfiler.class);
+
+    private static final String[] EXTRACT_EXTENSIONS = {
+            ".json",
+            ".txt",
+            ""
+    };
+
+    private static final String[] COMPRESSION_EXTENSIONS = {
+            "",
+            ".bz2",
+            ".gzip",
+            ".zip",
+    };
+    static final long NON_EXISTENT_FILE_LENGTH = -1L;
+
+    public static TableInfo REF_EXTRACT_EXCEPTION_TYPES = new TableInfo("ref_extract_exception_types",
+            new ColInfo(Cols.EXTRACT_EXCEPTION_ID, Types.INTEGER),
+            new ColInfo(Cols.EXTRACT_EXCEPTION_DESCRIPTION, Types.VARCHAR, 128)
+    );
+
+
+    public static TableInfo REF_PARSE_ERROR_TYPES = new TableInfo("ref_parse_error_types",
+            new ColInfo(Cols.PARSE_ERROR_ID, Types.INTEGER),
+            new ColInfo(Cols.PARSE_ERROR_DESCRIPTION, Types.VARCHAR, 128)
+    );
+
+    public static TableInfo REF_PARSE_EXCEPTION_TYPES = new TableInfo("ref_parse_exception_types",
+            new ColInfo(Cols.PARSE_EXCEPTION_ID, Types.INTEGER),
+            new ColInfo(Cols.PARSE_EXCEPTION_DESCRIPTION, Types.VARCHAR, 128)
+    );
+
+    public static final String TRUE = Boolean.toString(true);
+    public static final String FALSE = Boolean.toString(false);
+    private static final String ZERO = "0";
+
+
+    protected static final AtomicInteger ID = new AtomicInteger();
+
+    private static final String UNKNOWN_EXTENSION = "unk";
+    //make this configurable
+    private static final String DIGEST_KEY = "X-TIKA:digest:MD5";
+
+    private static final Map<String, Cols> UC_TAGS_OF_INTEREST = initTags();
+
+    private static Map<String, Cols> initTags() {
+        //simplify this mess
+        Map<String, Cols> tmp = new HashMap<>();
+        tmp.put("A", Cols.TAGS_A);
+        tmp.put("B", Cols.TAGS_B);
+        tmp.put("DIV", Cols.TAGS_DIV);
+        tmp.put("I", Cols.TAGS_I);
+        tmp.put("IMG", Cols.TAGS_IMG);
+        tmp.put("LI", Cols.TAGS_LI);
+        tmp.put("OL", Cols.TAGS_OL);
+        tmp.put("P", Cols.TAGS_P);
+        tmp.put("TABLE", Cols.TAGS_TABLE);
+        tmp.put("TD", Cols.TAGS_TD);
+        tmp.put("TITLE", Cols.TAGS_TITLE);
+        tmp.put("TR", Cols.TAGS_TR);
+        tmp.put("U", Cols.TAGS_U);
+        tmp.put("UL", Cols.TAGS_UL);
+        return Collections.unmodifiableMap(tmp);
+    }
+
+    private static CommonTokenCountManager COMMON_TOKEN_COUNT_MANAGER;
+
+    private String lastExtractExtension = null;
+
+    AnalyzerManager analyzerManager;
+
+
+    public enum EXCEPTION_TYPE {
+        RUNTIME,
+        ENCRYPTION,
+        ACCESS_PERMISSION,
+        UNSUPPORTED_VERSION,
+    }
+
+    /**
+     * Types of parse error (e.g. OOM or timeout) gathered from the
+     * log file, if any such information was recorded.
+     */
+    public enum PARSE_ERROR_TYPE {
+        OOM,
+        TIMEOUT
+    }
+
+    public static TableInfo MIME_TABLE = new TableInfo("mimes",
+            new ColInfo(Cols.MIME_ID, Types.INTEGER, "PRIMARY KEY"),
+            new ColInfo(Cols.MIME_STRING, Types.VARCHAR, 256),
+            new ColInfo(Cols.FILE_EXTENSION, Types.VARCHAR, 12)
+    );
+
+    private static Pattern FILE_NAME_CLEANER = Pattern.compile("\\.(json|txt)(\\.(bz2|gz|zip))?$");
+
+
+    final static int FILE_PATH_MAX_LEN = 1024;//max len for varchar for file_path
+    int maxContentLength = 10000000;
+    int maxContentLengthForLangId = 50000;
+    int maxTokens = 200000;
+
+
+
+    private final static Pattern ACCESS_PERMISSION_EXCEPTION =
+            Pattern.compile("org\\.apache\\.tika\\.exception\\.AccessPermissionException");
+    private final static Pattern ENCRYPTION_EXCEPTION =
+            Pattern.compile("org\\.apache\\.tika\\.exception\\.EncryptedDocumentException");
+
+    private static LanguageIDWrapper LANG_ID = new LanguageIDWrapper();
+
+    //TODO: allow configuration
+    //private TikaConfig config = TikaConfig.getDefaultConfig();
+    CompositeTextStatsCalculator compositeTextStatsCalculator;
+    protected IDBWriter writer;
+
+    /**
+     * @param p path to the common_tokens directory.  If this is null, try to load from the classpath
+     * @param defaultLangCode the language code to use if a common_words list doesn't exist for the
+     *                        detected language; can be <code>null</code>
+     * @throws IOException
+     */
+    public static void loadCommonTokens(Path p, String defaultLangCode) throws IOException {
+        COMMON_TOKEN_COUNT_MANAGER = new CommonTokenCountManager(p, defaultLangCode);
+    }
+
+    public AbstractProfiler(ArrayBlockingQueue<FileResource> fileQueue,
+                            IDBWriter writer) {
+        super(fileQueue);
+        this.writer = writer;
+        LanguageIDWrapper.setMaxTextLength(maxContentLengthForLangId);
+        this.compositeTextStatsCalculator = initAnalyzersAndTokenCounter(maxTokens, LANG_ID);
+    }
+
+    private CompositeTextStatsCalculator initAnalyzersAndTokenCounter(int maxTokens, LanguageIDWrapper langIder) {
+        analyzerManager = AnalyzerManager.newInstance(maxTokens);
+        List<TextStatsCalculator> calculators = new ArrayList<>();
+        calculators.add(new CommonTokens(COMMON_TOKEN_COUNT_MANAGER));
+        calculators.add(new TokenEntropy());
+        calculators.add(new TokenLengths());
+        calculators.add(new TopNTokens(10));
+        calculators.add(new BasicTokenCountStatsCalculator());
+        calculators.add(new ContentLengthCalculator());
+        calculators.add(new UnicodeBlockCounter(maxContentLengthForLangId));
+
+        return new CompositeTextStatsCalculator(calculators, analyzerManager.getGeneralAnalyzer(), langIder);
+    }
+
+    /**
+     * Truncate the content string to this length if it is longer
+     *
+     * @param maxContentLength
+     */
+    public void setMaxContentLength(int maxContentLength) {
+        this.maxContentLength = maxContentLength;
+    }
+
+    /**
+     * Truncate the content string to this length for language id if it is longer
+     *
+     * @param maxContentLengthForLangId
+     */
+    public void setMaxContentLengthForLangId(int maxContentLengthForLangId) {
+        this.maxContentLengthForLangId = maxContentLengthForLangId;
+        LanguageIDWrapper.setMaxTextLength(maxContentLengthForLangId);
+    }
+
+    /**
+     * Add a LimitTokenCountFilterFactory if &gt; -1
+     *
+     * @param maxTokens
+     */
+    public void setMaxTokens(int maxTokens) {
+        this.maxTokens = maxTokens;
+        initAnalyzersAndTokenCounter(maxTokens, new LanguageIDWrapper());
+    }
+
+
+    protected void writeExtractException(TableInfo extractExceptionTable, String containerId,
+                                         String filePath, ExtractReaderException.TYPE type) throws IOException {
+        Map<Cols, String> data = new HashMap<>();
+        data.put(Cols.CONTAINER_ID, containerId);
+        data.put(Cols.FILE_PATH, filePath);
+        data.put(Cols.EXTRACT_EXCEPTION_ID, Integer.toString(type.ordinal()));
+        writer.writeRow(extractExceptionTable, data);
+
+    }
+
+    protected void writeProfileData(EvalFilePaths fps, int i,
+                                    ContentTags contentTags, Metadata m,
+                                    String fileId, String containerId,
+                                    List<Integer> numAttachments, TableInfo profileTable) {
+
+        Map<Cols, String> data = new HashMap<>();
+        data.put(Cols.ID, fileId);
+        data.put(Cols.CONTAINER_ID, containerId);
+        data.put(Cols.MD5, m.get(DIGEST_KEY));
+
+        if (i < numAttachments.size()) {
+            data.put(Cols.NUM_ATTACHMENTS, Integer.toString(numAttachments.get(i)));
+        }
+        data.put(Cols.ELAPSED_TIME_MILLIS, getTime(m));
+        data.put(Cols.NUM_METADATA_VALUES,
+                Integer.toString(countMetadataValues(m)));
+
+        Integer nPages = m.getInt(PagedText.N_PAGES);
+        if (nPages != null) {
+            data.put(Cols.NUM_PAGES, Integer.toString(nPages));
+        }
+
+        //if the outer wrapper document
+        if (i == 0) {
+            data.put(Cols.IS_EMBEDDED, FALSE);
+            data.put(Cols.FILE_NAME, fps.getRelativeSourceFilePath().getFileName().toString());
+        } else {
+            data.put(Cols.IS_EMBEDDED, TRUE);
+            data.put(Cols.FILE_NAME, getFileName(m.get(AbstractRecursiveParserWrapperHandler.EMBEDDED_RESOURCE_PATH)));
+        }
+        String ext = FilenameUtils.getExtension(data.get(Cols.FILE_NAME));
+        ext = (ext == null) ? "" : ext.toLowerCase(Locale.US);
+        data.put(Cols.FILE_EXTENSION, ext);
+        long srcFileLen = getSourceFileLength(m);
+        if (srcFileLen > NON_EXISTENT_FILE_LENGTH) {
+            data.put(Cols.LENGTH, Long.toString(srcFileLen));
+        } else {
+            data.put(Cols.LENGTH, "");
+        }
+        int numMetadataValues = countMetadataValues(m);
+        data.put(Cols.NUM_METADATA_VALUES,
+                Integer.toString(numMetadataValues));
+
+        data.put(Cols.ELAPSED_TIME_MILLIS,
+                getTime(m));
+
+        String content = contentTags.getContent();
+        if (content == null || content.trim().length() == 0) {
+            data.put(Cols.HAS_CONTENT, FALSE);
+        } else {
+            data.put(Cols.HAS_CONTENT, TRUE);
+        }
+        getFileTypes(m, data);
+        try {
+            writer.writeRow(profileTable, data);
+        } catch (IOException e) {
+            throw new RuntimeException(e);
+        }
+    }
+
+    private static String getFileName(String path) {
+        if (path == null) {
+            return "";
+        }
+        //filenameUtils checks for a null byte in the path.
+        //it will throw an IllegalArgumentException if there is a null byte.
+        //given that we're recording names and not using them on a file path
+        //we should ignore this.
+        try {
+            return FilenameUtils.getName(path);
+        } catch (IllegalArgumentException e) {
+            LOG.warn("{} in {}", e.getMessage(), path);
+        }
+        path = path.replaceAll("\u0000", " ");
+        try {
+            return FilenameUtils.getName(path);
+        } catch (IllegalArgumentException e) {
+            LOG.warn("Again: {} in {}", e.getMessage(), path);
+        }
+        //give up
+        return "";
+    }
+
+    protected void writeExceptionData(String fileId, Metadata m, TableInfo exceptionTable) {
+        Map<Cols, String> data = new HashMap<>();
+        getExceptionStrings(m, data);
+        if (data.keySet().size() > 0) {
+            try {
+                data.put(Cols.ID, fileId);
+                writer.writeRow(exceptionTable, data);
+            } catch (IOException e) {
+                throw new RuntimeException(e);
+            }
+        }
+    }
+
+    protected Map<Class, Object> calcTextStats(ContentTags contentTags) {
+/*        if (contentTags == ContentTags.EMPTY_CONTENT_TAGS) {
+            return Collections.EMPTY_MAP;
+        }*/
+        Map<Cols, String> data = new HashMap<>();
+        String content = truncateContent(contentTags, maxContentLength, data);
+        if (content == null || content.trim().length() == 0) {
+            content = "";
+        }
+        return compositeTextStatsCalculator.calculate(content);
+    }
+
+    /**
+     * If the content is empty (null or only whitespace), this does no
+     * processing, and the fileId is not entered into the contents table.
+     *
+     * @param fileId
+     * @param textStats
+     * @param contentsTable
+     */
+    protected void writeContentData(String fileId, Map<Class, Object> textStats, TableInfo contentsTable) throws IOException {
+        Map<Cols, String> data = new HashMap<>();
+        data.put(Cols.ID, fileId);
+        if (textStats.containsKey(ContentLengthCalculator.class)) {
+            int length = (int)textStats.get(ContentLengthCalculator.class);
+            if (length == 0) {
+                return;
+            }
+            data.put(Cols.CONTENT_LENGTH, Integer.toString(length));
+        }
+        langid(textStats, data);
+
+        writeTokenCounts(textStats, data);
+        CommonTokenResult commonTokenResult = (CommonTokenResult) textStats.get(CommonTokens.class);
+        if (commonTokenResult != null) {
+            data.put(Cols.COMMON_TOKENS_LANG, commonTokenResult.getLangCode());
+            data.put(Cols.NUM_UNIQUE_COMMON_TOKENS, Integer.toString(commonTokenResult.getUniqueCommonTokens()));
+            data.put(Cols.NUM_COMMON_TOKENS, Integer.toString(commonTokenResult.getCommonTokens()));
+            data.put(Cols.NUM_UNIQUE_ALPHABETIC_TOKENS,
+                    Integer.toString(commonTokenResult.getUniqueAlphabeticTokens()));
+            data.put(Cols.NUM_ALPHABETIC_TOKENS,
+                    Integer.toString(commonTokenResult.getAlphabeticTokens()));
+        }
+        TokenCounts tokenCounts = (TokenCounts) textStats.get(BasicTokenCountStatsCalculator.class);
+        if (tokenCounts != null) {
+
+            data.put(Cols.NUM_UNIQUE_TOKENS,
+                    Integer.toString(tokenCounts.getTotalUniqueTokens()));
+            data.put(Cols.NUM_TOKENS,
+                    Integer.toString(tokenCounts.getTotalTokens()));
+        }
+        if (textStats.get(TokenEntropy.class) != null) {
+            data.put(Cols.TOKEN_ENTROPY_RATE,
+                    Double.toString((Double) textStats.get(TokenEntropy.class)));
+        }
+
+        SummaryStatistics summStats = (SummaryStatistics) textStats.get(TokenLengths.class);
+        if (summStats != null) {
+            data.put(Cols.TOKEN_LENGTH_SUM,
+                    Integer.toString((int) summStats.getSum()));
+
+            data.put(Cols.TOKEN_LENGTH_MEAN,
+                    Double.toString(summStats.getMean()));
+
+            data.put(Cols.TOKEN_LENGTH_STD_DEV,
+                    Double.toString(summStats.getStandardDeviation()));
+        }
+        unicodeBlocks(textStats, data);
+        try {
+            writer.writeRow(contentsTable, data);
+        } catch (IOException e) {
+            throw new RuntimeException(e);
+        }
+    }
+
+    void writeTagData(String fileId, ContentTags contentTags, TableInfo tagsTable) {
+        Map<String, Integer> tags = contentTags.getTags();
+        if (tags.size() == 0 && !contentTags.getParseException()) {
+            return;
+        }
+        Map<Cols, String> data = new HashMap<>();
+        data.put(Cols.ID, fileId);
+
+        for (Map.Entry<String, Cols> e : UC_TAGS_OF_INTEREST.entrySet()) {
+            Integer count = tags.get(e.getKey());
+            if (count == null) {
+                data.put(e.getValue(), ZERO);
+            } else {
+                data.put(e.getValue(), Integer.toString(count));
+            }
+        }
+
+        if (contentTags.getParseException()) {
+            data.put(Cols.TAGS_PARSE_EXCEPTION, TRUE);
+        } else {
+            data.put(Cols.TAGS_PARSE_EXCEPTION, FALSE);
+        }
+        try {
+            writer.writeRow(tagsTable, data);
+        } catch (IOException e) {
+            throw new RuntimeException(e);
+        }
+    }
+
+
+    String getTime(Metadata m) {
+        String elapsed = "-1";
+
+        String v = m.get(AbstractRecursiveParserWrapperHandler.PARSE_TIME_MILLIS);
+        if (v != null) {
+            return v;
+        }
+        return elapsed;
+    }
+
+    int countMetadataValues(Metadata m) {
+        if (m == null) {
+            return 0;
+        }
+        int i = 0;
+        for (String n : m.names()) {
+            i += m.getValues(n).length;
+        }
+        return i;
+    }
+
+    void getExceptionStrings(Metadata metadata, Map<Cols, String> data) {
+
+        String fullTrace = metadata.get(RecursiveParserWrapperHandler.CONTAINER_EXCEPTION);
+
+        if (fullTrace == null) {
+            fullTrace = metadata.get(AbstractRecursiveParserWrapperHandler.EMBEDDED_EXCEPTION);
+        }
+
+        if (fullTrace != null) {
+            //check for "expected" exceptions...exceptions
+            //that can't be fixed.
+            //Do not store trace for "expected" exceptions
+
+            Matcher matcher = ACCESS_PERMISSION_EXCEPTION.matcher(fullTrace);
+            if (matcher.find()) {
+                data.put(Cols.PARSE_EXCEPTION_ID,
+                        Integer.toString(EXCEPTION_TYPE.ACCESS_PERMISSION.ordinal()));
+                return;
+            }
+            matcher = ENCRYPTION_EXCEPTION.matcher(fullTrace);
+            if (matcher.find()) {
+                data.put(Cols.PARSE_EXCEPTION_ID,
+                        Integer.toString(EXCEPTION_TYPE.ENCRYPTION.ordinal()));
+                return;
+            }
+
+            data.put(Cols.PARSE_EXCEPTION_ID,
+                    Integer.toString(EXCEPTION_TYPE.RUNTIME.ordinal()));
+
+            data.put(Cols.ORIG_STACK_TRACE, fullTrace);
+            //TikaExceptions can have object ids, as in the "@2b1ea6ee" in:
+            //org.apache.tika.exception.TikaException: TIKA-198: Illegal
+            //IOException from org.apache.tika.parser.microsoft.OfficeParser@2b1ea6ee
+            //For reporting purposes, let's snip off the object id so that we can more
+            //easily count exceptions.
+            String sortTrace = EvalExceptionUtils.normalize(fullTrace);
+            data.put(Cols.SORT_STACK_TRACE, sortTrace);
+        }
+    }
+
+    /**
+     * Get the content and record in the data {@link Cols#CONTENT_TRUNCATED_AT_MAX_LEN} whether the string was truncated
+     *
+     * @param contentTags
+     * @param maxLength
+     * @param data
+     * @return
+     */
+    protected static String truncateContent(ContentTags contentTags, int maxLength, Map<Cols, String> data) {
+        data.put(Cols.CONTENT_TRUNCATED_AT_MAX_LEN, "FALSE");
+        if (contentTags == null) {
+            return "";
+        }
+        String c = contentTags.getContent();
+        if (maxLength > -1 && c.length() > maxLength) {
+            c = c.substring(0, maxLength);
+            data.put(Cols.CONTENT_TRUNCATED_AT_MAX_LEN, "TRUE");
+        }
+        return c;
+
+    }
+
+    protected static ContentTags getContent(EvalFilePaths evalFilePaths, Metadata metadata) {
+        if (metadata == null) {
+            return ContentTags.EMPTY_CONTENT_TAGS;
+        }
+        return parseContentAndTags(evalFilePaths, metadata);
+    }
+
+    void unicodeBlocks(Map<Class, Object> tokenStats, Map<Cols, String> data) {
+
+        Map<String, MutableInt> blocks = (Map<String, MutableInt>) tokenStats.get(UnicodeBlockCounter.class);
+        List<Pair<String, Integer>> pairs = new ArrayList<>();
+        for (Map.Entry<String, MutableInt> e : blocks.entrySet()) {
+            pairs.add(Pair.of(e.getKey(), e.getValue().intValue()));
+        }
+        pairs.sort((o1, o2) -> o2.getValue().compareTo(o1.getValue()));
+        StringBuilder sb = new StringBuilder();
+
+        for (int i = 0; i < 20 && i < pairs.size(); i++) {
+            if (i > 0) {
+                sb.append(" | ");
+            }
+            sb.append(pairs.get(i).getKey() + ": " + pairs.get(i).getValue());
+        }
+        data.put(Cols.UNICODE_CHAR_BLOCKS, sb.toString());
+    }
+
+    void langid(Map<Class, Object> stats, Map<Cols, String> data) {
+        List<LanguageResult> probabilities = (List<LanguageResult>) stats.get(LanguageIDWrapper.class);
+
+        if (probabilities.size() > 0) {
+            data.put(Cols.LANG_ID_1, probabilities.get(0).getLanguage());
+            data.put(Cols.LANG_ID_PROB_1,
+                    Double.toString(probabilities.get(0).getRawScore()));
+        }
+        if (probabilities.size() > 1) {
+            data.put(Cols.LANG_ID_2, probabilities.get(1).getLanguage());
+            data.put(Cols.LANG_ID_PROB_2,
+                    Double.toString(probabilities.get(1).getRawScore()));
+        }
+    }
+
+    void getFileTypes(Metadata metadata, Map<Cols, String> output) {
+        if (metadata == null) {
+            return;
+        }
+        String type = metadata.get(Metadata.CONTENT_TYPE);
+        if (type == null) {
+            return;
+        }
+        int mimeId = writer.getMimeId(type);
+        output.put(Cols.MIME_ID, Integer.toString(mimeId));
+    }
+
+    void writeTokenCounts(Map<Class, Object> textStats, Map<Cols, String> data) {
+        TokenIntPair[] tokenIntPairs = (TokenIntPair[]) textStats.get(TopNTokens.class);
+        int i = 0;
+        StringBuilder sb = new StringBuilder();
+        for (TokenIntPair t : tokenIntPairs) {
+            if (i++ > 0) {
+                sb.append(" | ");
+            }
+            sb.append(t.getToken() + ": " + t.getValue());
+        }
+
+        data.put(Cols.TOP_N_TOKENS, sb.toString());
+    }
+
+
+    public void closeWriter() throws IOException {
+        writer.close();
+    }
+
+
+    /**
+     * @param metadata
+     * @param extracts
+     * @return evalfilepaths for files if crawling an extract directory
+     */
+    protected EvalFilePaths getPathsFromExtractCrawl(Metadata metadata,
+                                                     Path extracts) {
+        String relExtractFilePath = metadata.get(FSProperties.FS_REL_PATH);
+        Matcher m = FILE_NAME_CLEANER.matcher(relExtractFilePath);
+        Path relativeSourceFilePath = Paths.get(m.replaceAll(""));
+        //just try slapping the relextractfilepath on the extractdir
+        Path extractFile = extracts.resolve(relExtractFilePath);
+        if (!Files.isRegularFile(extractFile)) {
+            //if that doesn't work, try to find the right extract file.
+            //This is necessary if crawling extractsA and trying to find a file in
+            //extractsB that is not in the same format: json vs txt or compressed
+            extractFile = findFile(extracts, relativeSourceFilePath);
+        }
+        return new EvalFilePaths(relativeSourceFilePath, extractFile);
+    }
+
+    //call this if the crawler is crawling through the src directory
+    protected EvalFilePaths getPathsFromSrcCrawl(Metadata metadata, Path srcDir,
+                                                 Path extracts) {
+        Path relativeSourceFilePath = Paths.get(metadata.get(FSProperties.FS_REL_PATH));
+        Path extractFile = findFile(extracts, relativeSourceFilePath);
+        Path inputFile = srcDir.resolve(relativeSourceFilePath);
+        long srcLen = -1l;
+        //try to get the length of the source file in case there was an error
+        //in both extracts
+        try {
+            srcLen = Files.size(inputFile);
+        } catch (IOException e) {
+            LOG.warn("Couldn't get length for: {}", inputFile.toAbsolutePath());
+        }
+        return new EvalFilePaths(relativeSourceFilePath, extractFile, srcLen);
+    }
+
+    /**
+     * @param extractRootDir
+     * @param relativeSourceFilePath
+     * @return extractFile or null if one couldn't be found.
+     */
+    private Path findFile(Path extractRootDir, Path relativeSourceFilePath) {
+        String relSrcFilePathString = relativeSourceFilePath.toString();
+        if (lastExtractExtension != null) {
+            Path candidate = extractRootDir.resolve(relSrcFilePathString + lastExtractExtension);
+            if (Files.isRegularFile(candidate)) {
+                return candidate;
+            }
+        }
+        for (String ext : EXTRACT_EXTENSIONS) {
+            for (String compress : COMPRESSION_EXTENSIONS) {
+                Path candidate = extractRootDir.resolve(relSrcFilePathString + ext + compress);
+                if (Files.isRegularFile(candidate)) {
+                    lastExtractExtension = ext + compress;
+                    return candidate;
+                }
+            }
+        }
+        return null;
+    }
+
+    protected long getSourceFileLength(EvalFilePaths fps, List<Metadata> metadataList) {
+        if (fps.getSourceFileLength() > NON_EXISTENT_FILE_LENGTH) {
+            return fps.getSourceFileLength();
+        }
+        return getSourceFileLength(metadataList);
+    }
+
+    long getSourceFileLength(List<Metadata> metadataList) {
+        if (metadataList == null || metadataList.size() < 1) {
+            return NON_EXISTENT_FILE_LENGTH;
+        }
+        return getSourceFileLength(metadataList.get(0));
+    }
+
+    long getSourceFileLength(Metadata m) {
+        String lenString = m.get(Metadata.CONTENT_LENGTH);
+        if (lenString == null) {
+            return NON_EXISTENT_FILE_LENGTH;
+        }
+        try {
+            return Long.parseLong(lenString);
+        } catch (NumberFormatException e) {
+            //swallow
+        }
+        return NON_EXISTENT_FILE_LENGTH;
+    }
+
+    protected long getFileLength(Path p) {
+        if (p != null && Files.isRegularFile(p)) {
+            try {
+                return Files.size(p);
+            } catch (IOException e) {
+                //swallow
+            }
+        }
+        return NON_EXISTENT_FILE_LENGTH;
+    }
+
+    /**
+     * @param list
+     * @return empty list if input list is empty or null
+     */
+    static List<Integer> countAttachments(List<Metadata> list) {
+        List<Integer> ret = new ArrayList<>();
+        if (list == null || list.size() == 0) {
+            return ret;
+        }
+        //container document attachment count = list.size()-1
+        ret.add(list.size() - 1);
+
+        Map<String, Integer> counts = new HashMap<>();
+        for (int i = 1; i < list.size(); i++) {
+            String path = list.get(i).get(AbstractRecursiveParserWrapperHandler.EMBEDDED_RESOURCE_PATH);
+            if (path == null) {
+                //shouldn't ever happen
+                continue;
+            }
+            String[] parts = path.split("/");
+            StringBuilder parent = new StringBuilder();
+            for (int end = 1; end < parts.length - 1; end++) {
+                parent.setLength(0);
+                join("/", parent, parts, 1, end);
+                String parentPath = parent.toString();
+                Integer count = counts.get(parentPath);
+                if (count == null) {
+                    count = 1;
+                } else {
+                    count++;
+                }
+                counts.put(parentPath, count);
+            }
+        }
+
+        for (int i = 1; i < list.size(); i++) {
+            Integer count = counts.get(list.get(i).get(AbstractRecursiveParserWrapperHandler.EMBEDDED_RESOURCE_PATH));
+            if (count == null) {
+                count = 0;
+            }
+            ret.add(i, count);
+        }
+        return ret;
+
+
+    }
+
+    private static void join(String delimiter, StringBuilder sb, String[] parts, int start, int end) {
+        for (int i = start; i <= end; i++) {
+            sb.append(delimiter);
+            sb.append(parts[i]);
+        }
+    }
+
+    private static ContentTags parseContentAndTags(EvalFilePaths evalFilePaths, Metadata metadata) {
+        String s = metadata.get(RecursiveParserWrapperHandler.TIKA_CONTENT);
+        if (s == null || s.length() == 0) {
+            return ContentTags.EMPTY_CONTENT_TAGS;
+        }
+
+        String handlerClass = metadata.get(RecursiveParserWrapperHandler.TIKA_CONTENT_HANDLER);
+        if (evalFilePaths.getExtractFile().getFileName().toString().toLowerCase(Locale.ENGLISH).endsWith(".html")) {
+            try {
+                return ContentTagParser.parseHTML(s, UC_TAGS_OF_INTEREST.keySet());
+            } catch (IOException | SAXException e) {
+                LOG.warn("Problem parsing html in {}; backing off to treat string as text",
+                        evalFilePaths.getExtractFile().toAbsolutePath().toString(), e);
+
+                return new ContentTags(s, true);
+            }
+        } else if (
+                evalFilePaths.getExtractFile().getFileName().toString().toLowerCase(Locale.ENGLISH).endsWith(".xhtml") ||
+                        (handlerClass != null && handlerClass.equals(ToXMLContentHandler.class.getSimpleName()))) {
+            try {
+                return ContentTagParser.parseXML(s, UC_TAGS_OF_INTEREST.keySet());
+            } catch (TikaException | IOException | SAXException e) {
+                LOG.warn("Problem parsing xhtml in {}; backing off to html parser",
+                        evalFilePaths.getExtractFile().toAbsolutePath().toString(), e);
+                try {
+                    ContentTags contentTags = ContentTagParser.parseHTML(s, UC_TAGS_OF_INTEREST.keySet());
+                    contentTags.setParseException(true);
+                    return contentTags;
+                } catch (IOException | SAXException e2) {
+                    LOG.warn("Problem parsing html in {}; backing off to treat string as text",
+                            evalFilePaths.getExtractFile().toAbsolutePath().toString(), e2);
+                }
+                return new ContentTags(s, true);
+            }
+        }
+        return new ContentTags(s);
+    }
+
+
+}
+
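
For orientation while reading the diff above: a minimal, self-contained sketch of the candidate order that AbstractProfiler.findFile() tries when resolving an extract file for a source path. This is illustrative only and not part of the commit; the extract root and source path below are hypothetical.

    import java.nio.file.Path;
    import java.nio.file.Paths;

    // Illustrative only (not part of this commit): prints the candidate extract
    // paths in the same order that AbstractProfiler.findFile() tries them:
    // extract extension first (.json, .txt, none), then compression suffix
    // (none, .bz2, .gzip, .zip). "extractsA" and the source path are hypothetical.
    public class ExtractLookupOrderSketch {
        private static final String[] EXTRACT_EXTENSIONS = {".json", ".txt", ""};
        private static final String[] COMPRESSION_EXTENSIONS = {"", ".bz2", ".gzip", ".zip"};

        public static void main(String[] args) {
            Path extractRoot = Paths.get("extractsA");
            String relativeSourcePath = "subdir/document1.doc";
            for (String ext : EXTRACT_EXTENSIONS) {
                for (String compress : COMPRESSION_EXTENSIONS) {
                    System.out.println(extractRoot.resolve(relativeSourcePath + ext + compress));
                }
            }
        }
    }
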
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/EvalFilePaths.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/EvalFilePaths.java
new file mode 100644
index 0000000..e05de44
--- /dev/null
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/EvalFilePaths.java
@@ -0,0 +1,118 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.eval.app;
+
+import static org.apache.tika.eval.app.AbstractProfiler.NON_EXISTENT_FILE_LENGTH;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.Objects;
+
+/**
+ * Simple struct to keep track of the relative path of the source file
+ * (the original binary file, e.g. /subdir/document1.doc)
+ * and the extract file (e.g. /subdir/document1.doc.json).
+ */
+class EvalFilePaths {
+
+    private final Path relativeSourceFilePath;
+    private final Path extractFile;
+
+    private long sourceFileLength = NON_EXISTENT_FILE_LENGTH;
+    private long extractFileLength = NON_EXISTENT_FILE_LENGTH;
+
+
+    public EvalFilePaths(Path relativeSourceFilePath, Path extractFile, long srcFileLen) {
+        this(relativeSourceFilePath, extractFile);
+        this.sourceFileLength = srcFileLen;
+    }
+
+    public EvalFilePaths(Path relativeSourceFilePath, Path extractFile) {
+        if (extractFile != null && Files.isRegularFile(extractFile)) {
+            try {
+                extractFileLength = Files.size(extractFile);
+            } catch (IOException e) {
+                //swallow ?
+            }
+        }
+        this.relativeSourceFilePath = relativeSourceFilePath;
+        this.extractFile = extractFile;
+    }
+
+    public Path getRelativeSourceFilePath() {
+        return relativeSourceFilePath;
+    }
+
+    //this path may or may not exist and it could be null!
+    public Path getExtractFile() {
+        return extractFile;
+    }
+
+    //if it doesn't exist, it'll be -1l.
+    public long getSourceFileLength() {
+        return sourceFileLength;
+    }
+
+    public long getExtractFileLength() {
+        return extractFileLength;
+    }
+
+    @Override
+    public boolean equals(Object o) {
+        if (this == o) {
+            return true;
+        }
+        if (o == null || getClass() != o.getClass()) {
+            return false;
+        }
+
+        EvalFilePaths that = (EvalFilePaths) o;
+
+        if (sourceFileLength != that.sourceFileLength) {
+            return false;
+        }
+        if (extractFileLength != that.extractFileLength) {
+            return false;
+        }
+        if (!Objects.equals(relativeSourceFilePath, that.relativeSourceFilePath)) {
+            return false;
+        }
+        return Objects.equals(extractFile, that.extractFile);
+
+    }
+
+    @Override
+    public int hashCode() {
+        int result = relativeSourceFilePath != null ? relativeSourceFilePath.hashCode() : 0;
+        result = 31 * result + (extractFile != null ? extractFile.hashCode() : 0);
+        result = 31 * result + (int) (sourceFileLength ^ (sourceFileLength >>> 32));
+        result = 31 * result + (int) (extractFileLength ^ (extractFileLength >>> 32));
+        return result;
+    }
+
+    @Override
+    public String toString() {
+        return "EvalFilePaths{" +
+                "relativeSourceFilePath=" + relativeSourceFilePath +
+                ", extractFile=" + extractFile +
+                ", sourceFileLength=" + sourceFileLength +
+                ", extractFileLength=" + extractFileLength +
+                '}';
+    }
+}
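
A minimal usage sketch of the struct above, assuming it runs inside the org.apache.tika.eval.app package (the class is package-private). The paths echo the Javadoc example and are hypothetical; this is not part of the commit.

    package org.apache.tika.eval.app;

    import java.nio.file.Paths;

    // Illustrative only: builds an EvalFilePaths with the example paths from the
    // Javadoc above and prints it. If the extract file does not exist on disk,
    // getExtractFileLength() stays at NON_EXISTENT_FILE_LENGTH (-1).
    class EvalFilePathsSketch {
        public static void main(String[] args) {
            EvalFilePaths paths = new EvalFilePaths(
                    Paths.get("subdir/document1.doc"),
                    Paths.get("extractsA/subdir/document1.doc.json"));
            System.out.println(paths);
            System.out.println("extract file length: " + paths.getExtractFileLength());
        }
    }
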
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractComparer.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractComparer.java
new file mode 100644
index 0000000..4b827f7
--- /dev/null
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractComparer.java
@@ -0,0 +1,538 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval.app;
+
+import java.io.IOException;
+import java.nio.file.Path;
+import java.sql.Types;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.concurrent.ArrayBlockingQueue;
+
+import org.apache.commons.cli.HelpFormatter;
+import org.apache.commons.cli.Option;
+import org.apache.commons.cli.Options;
+import org.apache.commons.io.FilenameUtils;
+import org.apache.tika.batch.FileResource;
+import org.apache.tika.batch.fs.FSProperties;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.eval.app.db.ColInfo;
+import org.apache.tika.eval.app.db.Cols;
+import org.apache.tika.eval.app.db.TableInfo;
+import org.apache.tika.eval.app.io.ExtractReader;
+import org.apache.tika.eval.app.io.ExtractReaderException;
+import org.apache.tika.eval.app.io.IDBWriter;
+import org.apache.tika.eval.core.textstats.BasicTokenCountStatsCalculator;
+import org.apache.tika.eval.core.tokens.ContrastStatistics;
+import org.apache.tika.eval.core.tokens.TokenContraster;
+import org.apache.tika.eval.core.tokens.TokenCounts;
+import org.apache.tika.eval.core.tokens.TokenIntPair;
+import org.apache.tika.eval.core.util.ContentTags;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler;
+
+public class ExtractComparer extends AbstractProfiler {
+
+    static Options OPTIONS;
+    static {
+        Option extractsA = new Option("extractsA", true, "directory for extractsA files");
+        extractsA.setRequired(true);
+
+        Option extractsB = new Option("extractsB", true, "directory for extractsB files");
+        extractsB.setRequired(true);
+
+        Option inputDir = new Option("inputDir", true,
+                "optional: directory of original binary input files if it exists " +
+                        "or can be the same as -extractsA or -extractsB. If not specified, -inputDir=-extractsA");
+
+
+        OPTIONS = new Options()
+                .addOption(extractsA)
+                .addOption(extractsB)
+                .addOption(inputDir)
+                .addOption("bc", "optional: tika-batch config file")
+                .addOption("numConsumers", true, "optional: number of consumer threads")
+                .addOption(new Option("alterExtract", true,
+                        "for json-formatted extract files, " +
+                                "process full metadata list ('as_is'=default), " +
+                                "take just the first/container document ('first_only'), " +
+                                "concatenate all content into the first metadata item ('concatenate_content')"))
+                .addOption("minExtractLength", true, "minimum extract length to process (in bytes)")
+                .addOption("maxExtractLength", true, "maximum extract length to process (in bytes)")
+                .addOption("db", true, "db file to which to write results")
+                .addOption("jdbc", true, "EXPERT: full jdbc connection string. Must specify this or -db <h2db>")
+                .addOption("jdbcDriver", true, "EXPERT: jdbc driver, or specify via -Djdbc.driver")
+                .addOption("tablePrefixA", true, "EXPERT: optional prefix for table names for A")
+                .addOption("tablePrefixB", true, "EXPERT: optional prefix for table names for B")
+                .addOption("drop", false, "drop tables if they exist")
+                .addOption("maxFilesToAdd", true, "maximum number of files to add to the crawler")
+                .addOption("maxTokens", true, "maximum tokens to process, default=200000")
+                .addOption("maxContentLength", true, "truncate content beyond this length for calculating 'contents' stats, default=1000000")
+                .addOption("maxContentLengthForLangId", true, "truncate content beyond this length for language id, default=50000")
+                .addOption("defaultLangCode", true, "which language to use for common words if no 'common words' file exists for the langid result")
+        ;
+    }
+
+    private static final String DIGEST_KEY_PREFIX = TikaCoreProperties.TIKA_META_PREFIX+
+            "digest"+TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER;
+
+    public static void USAGE() {
+        HelpFormatter helpFormatter = new HelpFormatter();
+        helpFormatter.printHelp(
+                80,
+                "java -jar tika-eval-x.y.jar Compare -extractsA extractsA -extractsB extractsB -db mydb",
+                "Tool: Compare",
+                ExtractComparer.OPTIONS,
+                "Note: for the default h2 db, do not include the .mv.db at the end of the db name.");
+    }
+
+    private final static String FIELD_A = "fa";
+    private final static String FIELD_B = "fb";
+
+    public static TableInfo REF_PAIR_NAMES = new TableInfo("pair_names",
+            new ColInfo(Cols.DIR_NAME_A, Types.VARCHAR, 128),
+            new ColInfo(Cols.DIR_NAME_B, Types.VARCHAR, 128)
+    );
+
+    public static TableInfo COMPARISON_CONTAINERS = new TableInfo("containers",
+            new ColInfo(Cols.CONTAINER_ID, Types.INTEGER, "PRIMARY KEY"),
+            new ColInfo(Cols.FILE_PATH, Types.VARCHAR, FILE_PATH_MAX_LEN),
+            new ColInfo(Cols.FILE_EXTENSION, Types.VARCHAR, 12),
+            new ColInfo(Cols.LENGTH, Types.BIGINT),
+            new ColInfo(Cols.EXTRACT_FILE_LENGTH_A, Types.BIGINT),
+            new ColInfo(Cols.EXTRACT_FILE_LENGTH_B, Types.BIGINT)
+    );
+
+    public static TableInfo CONTENT_COMPARISONS = new TableInfo("content_comparisons",
+            new ColInfo(Cols.ID, Types.INTEGER, "PRIMARY KEY"),
+            new ColInfo(Cols.TOP_10_UNIQUE_TOKEN_DIFFS_A, Types.VARCHAR, 1024),
+            new ColInfo(Cols.TOP_10_UNIQUE_TOKEN_DIFFS_B, Types.VARCHAR, 1024),
+            new ColInfo(Cols.TOP_10_MORE_IN_A, Types.VARCHAR, 1024),
+            new ColInfo(Cols.TOP_10_MORE_IN_B, Types.VARCHAR, 1024),
+            new ColInfo(Cols.DICE_COEFFICIENT, Types.FLOAT),
+            new ColInfo(Cols.OVERLAP, Types.FLOAT)
+    );
+
+    public static TableInfo PROFILES_A = new TableInfo( "profiles_a",
+            ExtractProfiler.PROFILE_TABLE.getColInfos());
+
+    public static TableInfo PROFILES_B = new TableInfo( "profiles_b",
+            ExtractProfiler.PROFILE_TABLE.getColInfos());
+
+    public static TableInfo EMBEDDED_FILE_PATH_TABLE_A = new TableInfo( "emb_path_a",
+            ExtractProfiler.EMBEDDED_FILE_PATH_TABLE.getColInfos());
+
+    public static TableInfo EMBEDDED_FILE_PATH_TABLE_B = new TableInfo( "emb_path_b",
+            ExtractProfiler.EMBEDDED_FILE_PATH_TABLE.getColInfos());
+
+
+    public static TableInfo CONTENTS_TABLE_A = new TableInfo( "contents_a",
+            ExtractProfiler.CONTENTS_TABLE.getColInfos());
+
+    public static TableInfo CONTENTS_TABLE_B = new TableInfo( "contents_b",
+            ExtractProfiler.CONTENTS_TABLE.getColInfos());
+
+    public static TableInfo TAGS_TABLE_A = new TableInfo( "tags_a",
+            ExtractProfiler.TAGS_TABLE.getColInfos());
+
+    public static TableInfo TAGS_TABLE_B = new TableInfo( "tags_b",
+            ExtractProfiler.TAGS_TABLE.getColInfos());
+
+
+    public static TableInfo EXCEPTION_TABLE_A = new TableInfo ("exceptions_a",
+            ExtractProfiler.EXCEPTION_TABLE.getColInfos());
+
+    public static TableInfo EXCEPTION_TABLE_B = new TableInfo ("exceptions_b",
+            ExtractProfiler.EXCEPTION_TABLE.getColInfos());
+
+    public static TableInfo EXTRACT_EXCEPTION_TABLE_A = new TableInfo("extract_exceptions_a",
+            ExtractProfiler.EXTRACT_EXCEPTION_TABLE.getColInfos());
+    public static TableInfo EXTRACT_EXCEPTION_TABLE_B = new TableInfo("extract_exceptions_b",
+            ExtractProfiler.EXTRACT_EXCEPTION_TABLE.getColInfos());
+
+
+    //need to parameterize?
+    private final TikaConfig config = TikaConfig.getDefaultConfig();
+
+    private final Path inputDir;
+    private final Path extractsA;
+    private final Path extractsB;
+
+    private final TokenContraster tokenContraster = new TokenContraster();
+    private final ExtractReader extractReader;
+
+    public ExtractComparer(ArrayBlockingQueue<FileResource> queue,
+                           Path inputDir, Path extractsA, Path extractsB, ExtractReader extractReader,
+                           IDBWriter writer) {
+        super(queue, writer);
+        this.inputDir = inputDir;
+        this.extractsA = extractsA;
+        this.extractsB = extractsB;
+        this.extractReader = extractReader;
+    }
+
+    @Override
+    public boolean processFileResource(FileResource fileResource) {
+        Metadata metadata = fileResource.getMetadata();
+        EvalFilePaths fpsA = null;
+        EvalFilePaths fpsB = null;
+
+        if (inputDir != null && (inputDir.equals(extractsA) ||
+                inputDir.equals(extractsB))) {
+            //crawling an extract dir
+            fpsA = getPathsFromExtractCrawl(metadata, extractsA);
+            fpsB = getPathsFromExtractCrawl(metadata, extractsB);
+
+        } else {
+            fpsA = getPathsFromSrcCrawl(metadata, inputDir, extractsA);
+            fpsB = getPathsFromSrcCrawl(metadata, inputDir, extractsB);
+        }
+
+        try {
+            compareFiles(fpsA, fpsB);
+        } catch (Throwable e) {
+            //this should be cataclysmic...
+            throw new RuntimeException("Exception while working on: " +
+                    metadata.get(FSProperties.FS_REL_PATH), e);
+        }
+        return true;
+    }
+
+    //protected for testing; TODO: find a better way so that this can be private
+    protected void compareFiles(EvalFilePaths fpsA,
+                                EvalFilePaths fpsB) throws IOException {
+
+        ExtractReaderException.TYPE extractExceptionA = null;
+        ExtractReaderException.TYPE extractExceptionB = null;
+
+        List<Metadata> metadataListA = null;
+        try {
+            metadataListA = extractReader.loadExtract(fpsA.getExtractFile());
+        } catch (ExtractReaderException e) {
+            extractExceptionA = e.getType();
+        }
+
+        List<Metadata> metadataListB = null;
+        try {
+            metadataListB = extractReader.loadExtract(fpsB.getExtractFile());
+        } catch (ExtractReaderException e) {
+            extractExceptionB = e.getType();
+        }
+
+        //array indices for those metadata items handled in B
+        Set<Integer> handledB = new HashSet<>();
+        String containerID = Integer.toString(ID.getAndIncrement());
+        //container table
+        Map<Cols, String> contData = new HashMap<>();
+        contData.put(Cols.CONTAINER_ID, containerID);
+        contData.put(Cols.FILE_PATH, fpsA.getRelativeSourceFilePath().toString());
+        long srcFileLength = getSourceFileLength(metadataListA, metadataListB);
+        contData.put(Cols.LENGTH,
+                srcFileLength > NON_EXISTENT_FILE_LENGTH ?
+                    Long.toString(srcFileLength) : "");
+        contData.put(Cols.FILE_EXTENSION,
+                FilenameUtils.getExtension(fpsA.getRelativeSourceFilePath().getFileName().toString()));
+
+        long extractFileLengthA = getFileLength(fpsA.getExtractFile());
+        contData.put(Cols.EXTRACT_FILE_LENGTH_A, extractFileLengthA > NON_EXISTENT_FILE_LENGTH ?
+                Long.toString(extractFileLengthA) : "");
+
+        long extractFileLengthB = getFileLength(fpsB.getExtractFile());
+        contData.put(Cols.EXTRACT_FILE_LENGTH_B, extractFileLengthB > NON_EXISTENT_FILE_LENGTH ?
+                Long.toString(extractFileLengthB) : "");
+
+        writer.writeRow(COMPARISON_CONTAINERS, contData);
+
+        if (extractExceptionA != null) {
+            writeExtractException(EXTRACT_EXCEPTION_TABLE_A, containerID, fpsA.getRelativeSourceFilePath().toString(),
+                    extractExceptionA);
+        }
+        if (extractExceptionB != null) {
+            writeExtractException(EXTRACT_EXCEPTION_TABLE_B, containerID, fpsB.getRelativeSourceFilePath().toString(),
+                    extractExceptionB);
+        }
+
+        if (metadataListA == null && metadataListB == null) {
+            return;
+        }
+        List<Integer> numAttachmentsA = countAttachments(metadataListA);
+        List<Integer> numAttachmentsB = countAttachments(metadataListB);
+
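+        //determine whether both container files report the same digest key;
+        //if so, getMatch() pairs embedded files by digest before falling back to embedded paths or index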
+        String sharedDigestKey = findSharedDigestKey(metadataListA, metadataListB);
+        Map<Class, Object> tokenStatsA = null;
+        Map<Class, Object> tokenStatsB = null;
+        //now step through each metadata item from extract A
+        if (metadataListA != null) {
+            for (int i = 0; i < metadataListA.size(); i++) {
+                //the first file should have the same id as the container id
+                String fileId = (i == 0) ? containerID : Integer.toString(ID.getAndIncrement());
+                Metadata metadataA = metadataListA.get(i);
+                ContentTags contentTagsA = getContent(fpsA, metadataA);
+                ContentTags contentTagsB = ContentTags.EMPTY_CONTENT_TAGS;
+                Metadata metadataB = null;
+
+                //TODO: shouldn't be fileA!!!!
+                writeTagData(fileId, contentTagsA, TAGS_TABLE_A);
+
+                writeProfileData(fpsA, i, contentTagsA, metadataA, fileId, containerID, numAttachmentsA, PROFILES_A);
+                writeExceptionData(fileId, metadataA, EXCEPTION_TABLE_A);
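+                //find the corresponding metadata item in B: first by shared digest, then by embedded resource path, then by index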
+                int matchIndex = getMatch(i, sharedDigestKey,
+                        handledB, metadataListA, metadataListB);
+
+                if (matchIndex > -1 && ! handledB.contains(matchIndex)) {
+                    metadataB = metadataListB.get(matchIndex);
+                    handledB.add(matchIndex);
+                }
+                if (metadataB != null) {
+                    contentTagsB = getContent(fpsB, metadataB);
+                    writeTagData(fileId, contentTagsB, TAGS_TABLE_B);
+                    writeProfileData(fpsB, i, contentTagsB, metadataB, fileId, containerID, numAttachmentsB, PROFILES_B);
+                    writeExceptionData(fileId, metadataB, EXCEPTION_TABLE_B);
+                }
+                writeEmbeddedFilePathData(i, fileId, metadataA, metadataB);
+                //write content
+                try {
+                    tokenStatsA = calcTextStats(contentTagsA);
+                    writeContentData(fileId, tokenStatsA, CONTENTS_TABLE_A);
+                    tokenStatsB = calcTextStats(contentTagsB);
+                    if (metadataB != null) {
+                        writeContentData(fileId, tokenStatsB, CONTENTS_TABLE_B);
+                    }
+                } catch (IOException e) {
+                    throw new RuntimeException(e);
+                }
+                if (metadataB != null) {
+                    TokenCounts tokenCountsA = (TokenCounts) tokenStatsA.get(BasicTokenCountStatsCalculator.class);
+                    TokenCounts tokenCountsB = (TokenCounts) tokenStatsB.get(BasicTokenCountStatsCalculator.class);
+                    //arbitrary decision...only run the comparisons if there are > 10 tokens total
+                    //We may want to bump that value a bit higher?
+                    //now run comparisons
+                    if (tokenCountsA.getTotalTokens()
+                            + tokenCountsB.getTotalTokens() > 10) {
+                        Map<Cols, String> data = new HashMap<>();
+                        data.put(Cols.ID, fileId);
+
+                        ContrastStatistics contrastStatistics =
+                                tokenContraster.calculateContrastStatistics(
+                                        tokenCountsA,
+                                        tokenCountsB);
+
+                        writeContrasts(data, contrastStatistics);
+                        writer.writeRow(CONTENT_COMPARISONS, data);
+                    }
+                }
+            }
+        }
+        //now try to get any Metadata objects in B
+        //that haven't yet been handled.
+        if (metadataListB != null) {
+            for (int i = 0; i < metadataListB.size(); i++) {
+                if (handledB.contains(i)) {
+                    continue;
+                }
+                Metadata metadataB = metadataListB.get(i);
+                ContentTags contentTagsB = getContent(fpsB, metadataB);
+                //the first file should have the same id as the container id
+                String fileId = (i == 0) ? containerID : Integer.toString(ID.getAndIncrement());
+                writeTagData(fileId, contentTagsB, TAGS_TABLE_B);
+                writeProfileData(fpsB, i, contentTagsB, metadataB, fileId, containerID, numAttachmentsB, PROFILES_B);
+                writeEmbeddedFilePathData(i, fileId, null, metadataB);
+                writeExceptionData(fileId, metadataB, EXCEPTION_TABLE_B);
+
+                //write content
+                try {
+                    tokenStatsB = calcTextStats(contentTagsB);
+                    writeContentData(fileId, tokenStatsB, CONTENTS_TABLE_B);
+                } catch (IOException e) {
+                    throw new RuntimeException(e);
+                }
+            }
+        }
+    }
+
+    /**
+     * Checks only the first item in each list and returns the first
+     * digest key shared by both, if one exists; otherwise returns null.
+     * @param metadataListA metadata list from extracts A
+     * @param metadataListB metadata list from extracts B
+     * @return the first shared digest key, or null if there is none
+     */
+    private String findSharedDigestKey(List<Metadata> metadataListA, List<Metadata> metadataListB) {
+        if (metadataListB == null || metadataListB.size() == 0) {
+            return null;
+        }
+        Set<String> digestA = new HashSet<>();
+        if (metadataListA != null) {
+            for (String n : metadataListA.get(0).names()) {
+                if (n.startsWith(DIGEST_KEY_PREFIX)) {
+                    digestA.add(n);
+                }
+            }
+        }
+        Metadata bMain = metadataListB.get(0);
+        for (String n : bMain.names()) {
+            if (digestA.contains(n)) {
+                return n;
+            }
+        }
+        return null;
+    }
+
+    private void writeEmbeddedFilePathData(int i, String fileId, Metadata mA, Metadata mB) {
+        //container file, don't write anything
+        if (i == 0) {
+            return;
+        }
+        String pathA = null;
+        String pathB = null;
+        if (mA != null) {
+            pathA = mA.get(AbstractRecursiveParserWrapperHandler.EMBEDDED_RESOURCE_PATH);
+        }
+        if (mB != null) {
+            pathB = mB.get(AbstractRecursiveParserWrapperHandler.EMBEDDED_RESOURCE_PATH);
+        }
+        if (pathA != null) {
+            Map<Cols, String> d = new HashMap<>();
+            d.put(Cols.ID, fileId);
+            d.put(Cols.EMBEDDED_FILE_PATH, pathA);
+            try {
+                writer.writeRow(EMBEDDED_FILE_PATH_TABLE_A, d);
+            } catch (IOException e) {
+                throw new RuntimeException(e);
+            }
+        }
+        if (pathB != null &&
+                (pathA == null || ! pathA.equals(pathB))) {
+            Map<Cols, String> d = new HashMap<>();
+            d.put(Cols.ID, fileId);
+            d.put(Cols.EMBEDDED_FILE_PATH, pathB);
+            try {
+                writer.writeRow(EMBEDDED_FILE_PATH_TABLE_B, d);
+            } catch (IOException e) {
+                throw new RuntimeException(e);
+            }
+        }
+    }
+
+    private long getSourceFileLength(List<Metadata> metadataListA, List<Metadata> metadataListB) {
+        long len = getSourceFileLength(metadataListA);
+        if (len > NON_EXISTENT_FILE_LENGTH) {
+            return len;
+        }
+        return getSourceFileLength(metadataListB);
+    }
+
+
+    /**
+     * Tries to find the item in metadataListB that matches metadataListA.get(aIndex):
+     * first by a shared digest key, then by the
+     * AbstractRecursiveParserWrapperHandler.EMBEDDED_RESOURCE_PATH, and finally by
+     * index if the two lists are the same size.  Returns -1 if no match is found.
+     *
+     * @param aIndex          index of the item in metadataListA to match
+     * @param sharedDigestKey digest key shared by both extracts, or null if there is none
+     * @param handledB        indices in metadataListB that have already been matched
+     * @param metadataListA   metadata list from extracts A
+     * @param metadataListB   metadata list from extracts B
+     * @return the index of the matching item in metadataListB, or -1 if none is found
+     */
+    private int getMatch(int aIndex, String sharedDigestKey, Set<Integer> handledB,
+                         List<Metadata> metadataListA,
+                         List<Metadata> metadataListB) {
+        //TODO: could make this more robust
+        if (metadataListB == null || metadataListB.size() == 0) {
+            return -1;
+        }
+        //assume first is always the container file
+        if (aIndex == 0) {
+            return 0;
+        }
+
+        if (sharedDigestKey != null) {
+            //first try to find matching digests
+            //this does not elegantly handle multiple matching digests
+            return findMatchingDigests(sharedDigestKey, handledB,
+                    metadataListA.get(aIndex), metadataListB);
+        }
+
+        //assume same embedded resource path.  Not always true!
+        Metadata thisMetadata = metadataListA.get(aIndex);
+        String embeddedPath = thisMetadata.get(AbstractRecursiveParserWrapperHandler.EMBEDDED_RESOURCE_PATH);
+        if (embeddedPath != null) {
+            for (int j = 0; j < metadataListB.size(); j++) {
+                String thatEmbeddedPath = metadataListB.get(j).get(
+                        AbstractRecursiveParserWrapperHandler.EMBEDDED_RESOURCE_PATH);
+                if (embeddedPath.equals(thatEmbeddedPath)) {
+                    return j;
+                }
+            }
+        }
+
+        //last resort, if lists are same size, guess the same index
+        if (metadataListA.size() == metadataListB.size()) {
+            //assume no rearrangements if lists are the same size
+            return aIndex;
+        }
+        return -1;
+    }
+
+    private int findMatchingDigests(String sharedDigestKey,
+                                    Set<Integer> handledB,
+                                    Metadata metadata, List<Metadata> metadataListB) {
+        String digestA = metadata.get(sharedDigestKey);
+        if (digestA == null) {
+            return -1;
+        }
+
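+        //return the first not-yet-matched item in B whose digest matches A's (case-insensitive)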
+        for (int i = 0; i < metadataListB.size(); i++)  {
+            if (handledB.contains(i)) {
+                continue;
+            }
+            Metadata mB = metadataListB.get(i);
+            String digestB = mB.get(sharedDigestKey);
+            if (digestA.equalsIgnoreCase(digestB)) {
+                return i;
+            }
+        }
+        return -1;
+    }
+
+    private void writeContrasts(Map<Cols, String> data, ContrastStatistics contrastStatistics) {
+        writeContrastString(data, Cols.TOP_10_MORE_IN_A, contrastStatistics.getTopNMoreA());
+        writeContrastString(data, Cols.TOP_10_MORE_IN_B, contrastStatistics.getTopNMoreB());
+        writeContrastString(data, Cols.TOP_10_UNIQUE_TOKEN_DIFFS_A, contrastStatistics.getTopNUniqueA());
+        writeContrastString(data, Cols.TOP_10_UNIQUE_TOKEN_DIFFS_B, contrastStatistics.getTopNUniqueB());
+        data.put(Cols.OVERLAP, Double.toString(contrastStatistics.getOverlap()));
+        data.put(Cols.DICE_COEFFICIENT, Double.toString(contrastStatistics.getDiceCoefficient()));
+
+    }
+
+    private void writeContrastString(Map<Cols, String> data, Cols col, TokenIntPair[] tokenIntPairs) {
+
+        int i = 0;
+        StringBuilder sb = new StringBuilder();
+        for (TokenIntPair p : tokenIntPairs) {
+            if (i++ > 0) {
+                sb.append(" | ");
+            }
+            sb.append(p.getToken()).append(": ").append(p.getValue());
+        }
+        data.put(col, sb.toString());
+    }
+}
\ No newline at end of file
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractProfiler.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractProfiler.java
new file mode 100644
index 0000000..029162e
--- /dev/null
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractProfiler.java
@@ -0,0 +1,281 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval.app;
+
+import java.io.IOException;
+import java.nio.file.Path;
+import java.sql.Types;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.concurrent.ArrayBlockingQueue;
+
+import org.apache.commons.cli.HelpFormatter;
+import org.apache.commons.cli.Option;
+import org.apache.commons.cli.Options;
+import org.apache.tika.batch.FileResource;
+import org.apache.tika.eval.app.db.ColInfo;
+import org.apache.tika.eval.app.db.Cols;
+import org.apache.tika.eval.app.db.TableInfo;
+import org.apache.tika.eval.app.io.ExtractReader;
+import org.apache.tika.eval.app.io.ExtractReaderException;
+import org.apache.tika.eval.app.io.IDBWriter;
+import org.apache.tika.eval.core.util.ContentTags;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler;
+
+public class ExtractProfiler extends AbstractProfiler {
+
+    static Options OPTIONS;
+    static {
+        //By the time this commandline is parsed, there should be both an extracts and an inputDir
+        Option extracts = new Option("extracts", true, "directory for extract files");
+        extracts.setRequired(true);
+
+        Option inputDir = new Option("inputDir", true,
+                "optional: directory for original binary input documents."+
+        " If not specified, -extracts is crawled as is.");
+
+        OPTIONS = new Options()
+                .addOption(extracts)
+                .addOption(inputDir)
+                .addOption("bc", "optional: tika-batch config file")
+                .addOption("numConsumers", true, "optional: number of consumer threads")
+                .addOption(new Option("alterExtract", true,
+                        "for json-formatted extract files, " +
+                                "process full metadata list ('as_is'=default), " +
+                                "take just the first/container document ('first_only'), " +
+                                "concatenate all content into the first metadata item ('concatenate_content')"))
+                .addOption("minExtractLength", true, "minimum extract length to process (in bytes)")
+                .addOption("maxExtractLength", true, "maximum extract length to process (in bytes)")
+                .addOption("db", true, "db file to which to write results")
+                .addOption("jdbc", true, "EXPERT: full jdbc connection string. Must specify this or -db <h2db>")
+                .addOption("jdbcDriver", true, "EXPERT: jdbc driver, or specify via -Djdbc.driver")
+                .addOption("tablePrefix", true, "EXPERT: optional prefix for table names")
+                .addOption("drop", false, "drop tables if they exist")
+                .addOption("maxFilesToAdd", true, "maximum number of files to add to the crawler")
+                .addOption("maxTokens", true, "maximum tokens to process, default=200000")
+                .addOption("maxContentLength", true, "truncate content beyond this length for calculating 'contents' stats, default=1000000")
+                .addOption("maxContentLengthForLangId", true, "truncate content beyond this length for language id, default=50000")
+                .addOption("defaultLangCode", true, "which language to use for common words if no 'common words' file exists for the langid result")
+
+        ;
+
+    }
+
+    public static void USAGE() {
+        HelpFormatter helpFormatter = new HelpFormatter();
+        helpFormatter.printHelp(
+                80,
+                "java -jar tika-eval-x.y.jar Profile -extracts extracts -db mydb [-inputDir input]",
+                "Tool: Profile",
+                ExtractProfiler.OPTIONS,
+                "Note: for the default h2 db, do not include the .mv.db at the end of the db name.");
+    }
+
+    private final static String FIELD = "f";
+
+    public static TableInfo EXTRACT_EXCEPTION_TABLE = new TableInfo("extract_exceptions",
+            new ColInfo(Cols.CONTAINER_ID, Types.INTEGER),
+            new ColInfo(Cols.FILE_PATH, Types.VARCHAR, FILE_PATH_MAX_LEN),
+            new ColInfo(Cols.EXTRACT_EXCEPTION_ID, Types.INTEGER),
+            new ColInfo(Cols.PARSE_ERROR_ID, Types.INTEGER)
+    );
+
+    public static TableInfo EXCEPTION_TABLE = new TableInfo("parse_exceptions",
+            new ColInfo(Cols.ID, Types.INTEGER, "PRIMARY KEY"),
+            new ColInfo(Cols.ORIG_STACK_TRACE, Types.VARCHAR, 8192),
+            new ColInfo(Cols.SORT_STACK_TRACE, Types.VARCHAR, 8192),
+            new ColInfo(Cols.PARSE_EXCEPTION_ID, Types.INTEGER)
+    );
+
+
+    public static TableInfo CONTAINER_TABLE = new TableInfo("containers",
+            new ColInfo(Cols.CONTAINER_ID, Types.INTEGER, "PRIMARY KEY"),
+            new ColInfo(Cols.FILE_PATH, Types.VARCHAR, FILE_PATH_MAX_LEN),
+            new ColInfo(Cols.LENGTH, Types.BIGINT),
+            new ColInfo(Cols.EXTRACT_FILE_LENGTH, Types.BIGINT)
+    );
+
+    public static TableInfo PROFILE_TABLE = new TableInfo("profiles",
+            new ColInfo(Cols.ID, Types.INTEGER, "PRIMARY KEY"),
+            new ColInfo(Cols.CONTAINER_ID, Types.INTEGER),
+            new ColInfo(Cols.FILE_NAME, Types.VARCHAR, 256),
+            new ColInfo(Cols.MD5, Types.CHAR, 32),
+            new ColInfo(Cols.LENGTH, Types.BIGINT),
+            new ColInfo(Cols.IS_EMBEDDED, Types.BOOLEAN),
+            new ColInfo(Cols.FILE_EXTENSION, Types.VARCHAR, 12),
+            new ColInfo(Cols.MIME_ID, Types.INTEGER),
+            new ColInfo(Cols.ELAPSED_TIME_MILLIS, Types.INTEGER),
+            new ColInfo(Cols.NUM_ATTACHMENTS, Types.INTEGER),
+            new ColInfo(Cols.NUM_METADATA_VALUES, Types.INTEGER),
+            new ColInfo(Cols.NUM_PAGES, Types.INTEGER),
+            new ColInfo(Cols.HAS_CONTENT, Types.BOOLEAN)
+    );
+
+    public static TableInfo EMBEDDED_FILE_PATH_TABLE = new TableInfo("emb_file_names",
+            new ColInfo(Cols.ID, Types.INTEGER, "PRIMARY KEY"),
+            new ColInfo(Cols.EMBEDDED_FILE_PATH, Types.VARCHAR, 1024)
+    );
+
+    public static TableInfo CONTENTS_TABLE = new TableInfo("contents",
+            new ColInfo(Cols.ID, Types.INTEGER, "PRIMARY KEY"),
+            new ColInfo(Cols.CONTENT_LENGTH, Types.INTEGER),
+            new ColInfo(Cols.NUM_UNIQUE_TOKENS, Types.INTEGER),
+            new ColInfo(Cols.NUM_TOKENS, Types.INTEGER),
+            new ColInfo(Cols.COMMON_TOKENS_LANG, Types.VARCHAR, 12),
+            new ColInfo(Cols.NUM_UNIQUE_COMMON_TOKENS, Types.INTEGER),
+            new ColInfo(Cols.NUM_COMMON_TOKENS, Types.INTEGER),
+            new ColInfo(Cols.NUM_UNIQUE_ALPHABETIC_TOKENS, Types.INTEGER),
+            new ColInfo(Cols.NUM_ALPHABETIC_TOKENS, Types.INTEGER),
+            new ColInfo(Cols.TOP_N_TOKENS, Types.VARCHAR, 1024),
+            new ColInfo(Cols.LANG_ID_1, Types.VARCHAR, 12),
+            new ColInfo(Cols.LANG_ID_PROB_1, Types.FLOAT),
+            new ColInfo(Cols.LANG_ID_2, Types.VARCHAR, 12),
+            new ColInfo(Cols.LANG_ID_PROB_2, Types.FLOAT),
+            new ColInfo(Cols.UNICODE_CHAR_BLOCKS, Types.VARCHAR, 1024),
+            new ColInfo(Cols.TOKEN_ENTROPY_RATE, Types.FLOAT),
+            new ColInfo(Cols.TOKEN_LENGTH_SUM, Types.INTEGER),
+            new ColInfo(Cols.TOKEN_LENGTH_MEAN, Types.FLOAT),
+            new ColInfo(Cols.TOKEN_LENGTH_STD_DEV, Types.FLOAT),
+            new ColInfo(Cols.CONTENT_TRUNCATED_AT_MAX_LEN, Types.BOOLEAN)
+    );
+
+    public static TableInfo TAGS_TABLE = new TableInfo("tags",
+            new ColInfo(Cols.ID, Types.INTEGER, "PRIMARY KEY"),
+            new ColInfo(Cols.TAGS_A, Types.INTEGER),
+            new ColInfo(Cols.TAGS_B, Types.INTEGER),
+            new ColInfo(Cols.TAGS_DIV, Types.INTEGER),
+            new ColInfo(Cols.TAGS_I, Types.INTEGER),
+            new ColInfo(Cols.TAGS_IMG, Types.INTEGER),
+            new ColInfo(Cols.TAGS_LI, Types.INTEGER),
+            new ColInfo(Cols.TAGS_OL, Types.INTEGER),
+            new ColInfo(Cols.TAGS_P, Types.INTEGER),
+            new ColInfo(Cols.TAGS_TABLE, Types.INTEGER),
+            new ColInfo(Cols.TAGS_TD, Types.INTEGER),
+            new ColInfo(Cols.TAGS_TITLE, Types.INTEGER),
+            new ColInfo(Cols.TAGS_TR, Types.INTEGER),
+            new ColInfo(Cols.TAGS_U, Types.INTEGER),
+            new ColInfo(Cols.TAGS_UL, Types.INTEGER),
+            new ColInfo(Cols.TAGS_PARSE_EXCEPTION, Types.BOOLEAN)
+    );
+
+    private final Path inputDir;
+    private final Path extracts;
+    private final ExtractReader extractReader;
+
+    public ExtractProfiler(ArrayBlockingQueue<FileResource> queue,
+                           Path inputDir, Path extracts,
+                           ExtractReader extractReader, IDBWriter dbWriter) {
+        super(queue, dbWriter);
+        this.inputDir = inputDir;
+        this.extracts = extracts;
+        this.extractReader = extractReader;
+    }
+
+    @Override
+    public boolean processFileResource(FileResource fileResource) {
+        Metadata metadata = fileResource.getMetadata();
+        EvalFilePaths fps = null;
+
+        if (inputDir != null && inputDir.equals(extracts)) {
+            //crawling an extract dir
+            fps = getPathsFromExtractCrawl(metadata, extracts);
+        } else {
+            fps = getPathsFromSrcCrawl(metadata, inputDir, extracts);
+        }
+        int containerId = ID.incrementAndGet();
+        String containerIdString = Integer.toString(containerId);
+
+        ExtractReaderException.TYPE extractExceptionType = null;
+
+        List<Metadata> metadataList = null;
+        try {
+            metadataList = extractReader.loadExtract(fps.getExtractFile());
+        } catch (ExtractReaderException e) {
+            extractExceptionType = e.getType();
+        }
+
+        Map<Cols, String> contOutput = new HashMap<>();
+        Long srcFileLen = getSourceFileLength(fps, metadataList);
+        contOutput.put(Cols.LENGTH,
+                srcFileLen > NON_EXISTENT_FILE_LENGTH ?
+                        Long.toString(srcFileLen): "");
+        contOutput.put(Cols.CONTAINER_ID, containerIdString);
+        contOutput.put(Cols.FILE_PATH, fps.getRelativeSourceFilePath().toString());
+
+        if (fps.getExtractFileLength() > 0) {
+            contOutput.put(Cols.EXTRACT_FILE_LENGTH,
+                    (fps.getExtractFile() == null) ?
+                            "" :
+                    Long.toString(fps.getExtractFileLength()));
+        }
+        try {
+            writer.writeRow(CONTAINER_TABLE, contOutput);
+        } catch (IOException e) {
+            throw new RuntimeException(e);
+        }
+
+
+        if (extractExceptionType != null) {
+            try {
+                writeExtractException(EXTRACT_EXCEPTION_TABLE, containerIdString,
+                        fps.getRelativeSourceFilePath().toString(), extractExceptionType);
+            } catch (IOException e) {
+                throw new RuntimeException(e);
+            }
+            return true;
+        }
+
+        List<Integer> numAttachments = countAttachments(metadataList);
+        int i = 0;
+        for (Metadata m : metadataList) {
+            ContentTags contentTags = getContent(fps, m);
+            //the first file should have the same id as the container id
+            String fileId = (i == 0) ? containerIdString : Integer.toString(ID.incrementAndGet());
+            writeTagData(fileId, contentTags, TAGS_TABLE);
+            writeProfileData(fps, i, contentTags, m, fileId, containerIdString, numAttachments, PROFILE_TABLE);
+            writeEmbeddedPathData(i, fileId, m, EMBEDDED_FILE_PATH_TABLE);
+            writeExceptionData(fileId, m, EXCEPTION_TABLE);
+            try {
+                Map<Class, Object> textStats = calcTextStats(contentTags);
+                writeContentData(fileId, textStats, CONTENTS_TABLE);
+            } catch (IOException e) {
+                throw new RuntimeException(e);
+            }
+            i++;
+        }
+        return true;
+    }
+
+
+    private void writeEmbeddedPathData(int i, String fileId, Metadata m,
+                                       TableInfo embeddedFilePathTable) {
+        if (i == 0) {
+            return;
+        }
+        Map<Cols, String> data = new HashMap<>();
+        data.put(Cols.ID, fileId);
+        data.put(Cols.EMBEDDED_FILE_PATH,
+                m.get(AbstractRecursiveParserWrapperHandler.EMBEDDED_RESOURCE_PATH));
+        try {
+            writer.writeRow(embeddedFilePathTable, data);
+        } catch (IOException e) {
+            throw new RuntimeException(e);
+        }
+    }
+}
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/FileProfiler.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/FileProfiler.java
new file mode 100644
index 0000000..c9e367e
--- /dev/null
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/FileProfiler.java
@@ -0,0 +1,195 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval.app;
+
+import org.apache.commons.cli.HelpFormatter;
+import org.apache.commons.cli.Option;
+import org.apache.commons.cli.Options;
+import org.apache.commons.codec.digest.DigestUtils;
+import org.apache.commons.io.FilenameUtils;
+import org.apache.tika.Tika;
+import org.apache.tika.batch.FileResource;
+import org.apache.tika.batch.fs.FSProperties;
+import org.apache.tika.detect.FileCommandDetector;
+import org.apache.tika.eval.app.db.ColInfo;
+import org.apache.tika.eval.app.db.Cols;
+import org.apache.tika.eval.app.db.TableInfo;
+import org.apache.tika.eval.app.io.IDBWriter;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.sql.Types;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.concurrent.ArrayBlockingQueue;
+
+/**
+ * This class profiles actual files, as opposed to extracts (see e.g. {@link ExtractProfiler}).
+ * This does _not_ parse files, but does run file type identification and digests the
+ * raw bytes.
+ *
+ * If the 'file' command is available on the command line, this will also run the
+ * FileCommandDetector.
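+ *
+ * Typical invocation via {@link TikaEvalCLI}: java -jar tika-eval-x.y.jar FileProfile -inputDir docs -db mydb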
+ */
+
+public class FileProfiler extends AbstractProfiler {
+//TODO: we should allow users to select digest type/encoding and file detector(s).
+
+    private static final boolean HAS_FILE = FileCommandDetector.checkHasFile();
+    private static final Logger LOG = LoggerFactory.getLogger(FileProfiler.class);
+
+    static Options OPTIONS;
+    static {
+
+        Option inputDir = new Option("inputDir", true,
+                "optional: directory for original binary input documents."+
+                        " If not specified, -extracts is crawled as is.");
+
+        OPTIONS = new Options()
+                .addOption(inputDir)
+                .addOption("bc", "optional: tika-batch config file")
+                .addOption("numConsumers", true, "optional: number of consumer threads")
+                .addOption("db", true, "db file to which to write results")
+                .addOption("jdbc", true, "EXPERT: full jdbc connection string. Must specify this or -db <h2db>")
+                .addOption("jdbcDriver", true, "EXPERT: jdbc driver, or specify via -Djdbc.driver")
+                .addOption("tablePrefix", true, "EXPERT: optional prefix for table names")
+                .addOption("drop", false, "drop tables if they exist")
+                .addOption("maxFilesToAdd", true, "maximum number of files to add to the crawler")
+
+        ;
+
+    }
+
+    public static void USAGE() {
+        HelpFormatter helpFormatter = new HelpFormatter();
+        helpFormatter.printHelp(
+                80,
+                "java -jar tika-eval-x.y.jar FileProfile -inputDir docs -db mydb",
+                "Tool: FileProfile",
+                FileProfiler.OPTIONS,
+                "Note: for the default h2 db, do not include the .mv.db at the end of the db name.");
+    }
+
+
+
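+    //when the 'file' command is available, the profile table also records the mime id from the FileCommandDetector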
+    public static TableInfo FILE_PROFILES = HAS_FILE ?
+            new TableInfo("file_profiles",
+                new ColInfo(Cols.FILE_PATH, Types.VARCHAR, 2048, "PRIMARY KEY"),
+                new ColInfo(Cols.FILE_NAME, Types.VARCHAR, 2048),
+                new ColInfo(Cols.FILE_EXTENSION, Types.VARCHAR, 24),
+                new ColInfo(Cols.LENGTH, Types.BIGINT),
+                new ColInfo(Cols.SHA256, Types.VARCHAR, 64),
+                new ColInfo(Cols.TIKA_MIME_ID, Types.INTEGER),
+                new ColInfo(Cols.FILE_MIME_ID, Types.INTEGER))
+            :
+            new TableInfo("file_profiles",
+                    new ColInfo(Cols.FILE_PATH, Types.VARCHAR, 2048, "PRIMARY KEY"),
+                    new ColInfo(Cols.FILE_NAME, Types.VARCHAR, 2048),
+                    new ColInfo(Cols.FILE_EXTENSION, Types.VARCHAR, 24),
+                    new ColInfo(Cols.LENGTH, Types.BIGINT),
+                    new ColInfo(Cols.SHA256, Types.VARCHAR, 64),
+                    new ColInfo(Cols.TIKA_MIME_ID, Types.INTEGER));
+
+
+    public static TableInfo FILE_MIME_TABLE = new TableInfo("file_mimes",
+            new ColInfo(Cols.MIME_ID, Types.INTEGER, "PRIMARY KEY"),
+            new ColInfo(Cols.MIME_STRING, Types.VARCHAR, 256),
+            new ColInfo(Cols.FILE_EXTENSION, Types.VARCHAR, 12)
+    );
+
+    public static final String DETECT_EXCEPTION = "detect-exception";
+    private static final Tika TIKA = new Tika();
+
+    private static final FileCommandDetector FILE_COMMAND_DETECTOR = new FileCommandDetector();
+    private final Path inputDir;
+
+    public FileProfiler(ArrayBlockingQueue<FileResource> fileQueue, Path inputDir, IDBWriter dbWriter) {
+        super(fileQueue, dbWriter);
+        this.inputDir = inputDir;
+    }
+
+
+    @Override
+    public boolean processFileResource(FileResource fileResource) {
+        String relPath = fileResource.getMetadata().get(FSProperties.FS_REL_PATH);
+        try (InputStream is = fileResource.openInputStream()) {
+            try (TikaInputStream tis = TikaInputStream.get(is)) {
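+                //TikaInputStream provides a Path (spooling to a temp file if needed) for sizing and for the 'file' command detector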
+                Path path = tis.getPath();
+                Map<Cols, String> data = new HashMap<>();
+                int tikaMimeId = writer.getMimeId(detectTika(tis));
+                String fileName = "";
+                String extension = "";
+                long length = -1;
+                try {
+                    fileName = FilenameUtils.getName(relPath);
+                } catch (IllegalArgumentException e) {
+                    LOG.warn("bad file name: "+relPath, e);
+                }
+
+                try {
+                    extension = FilenameUtils.getExtension(relPath);
+                } catch (IllegalArgumentException e) {
+                    LOG.warn("bad extension: "+relPath, e);
+                }
+
+                try {
+                    length = Files.size(path);
+                } catch (IOException e) {
+                    LOG.warn("problem getting size: "+relPath, e);
+                }
+
+                data.put(Cols.FILE_PATH, relPath);
+                data.put(Cols.FILE_NAME, fileName);
+                data.put(Cols.FILE_EXTENSION, extension);
+                data.put(Cols.LENGTH, Long.toString(length));
+                data.put(Cols.TIKA_MIME_ID, Integer.toString(tikaMimeId));
+                data.put(Cols.SHA256, DigestUtils.sha256Hex(tis));
+                if (HAS_FILE) {
+                    int fileMimeId = writer.getMimeId(detectFile(tis));
+                    data.put(Cols.FILE_MIME_ID, Integer.toString(fileMimeId));
+                }
+                writer.writeRow(FILE_PROFILES, data);
+            }
+        } catch (IOException e) {
+            //log at least!
+            return false;
+        }
+        return true;
+    }
+
+    private String detectFile(TikaInputStream tis) {
+        try {
+            return FILE_COMMAND_DETECTOR.detect(tis, new Metadata()).toString();
+        } catch (IOException e) {
+            return DETECT_EXCEPTION;
+        }
+    }
+
+    private String detectTika(TikaInputStream tis) {
+        try {
+            return TIKA.detect(tis);
+        } catch (IOException e) {
+            return DETECT_EXCEPTION;
+        }
+    }
+}
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/TikaEvalCLI.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/TikaEvalCLI.java
new file mode 100644
index 0000000..7e70d44
--- /dev/null
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/TikaEvalCLI.java
@@ -0,0 +1,326 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval.app;
+
+import java.io.InputStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.StandardCopyOption;
+import java.sql.SQLException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.DefaultParser;
+import org.apache.commons.cli.ParseException;
+import org.apache.tika.batch.fs.FSBatchProcessCLI;
+import org.apache.tika.eval.app.reports.ResultsReporter;
+import org.h2.tools.Console;
+
+public class TikaEvalCLI {
+    static final String[] tools = {"Profile", "FileProfile",
+            "Compare", "Report", "StartDB"};
+
+    private static String specifyTools() {
+        StringBuilder sb = new StringBuilder();
+        sb.append("Must specify one of the following tools in the first parameter:\n");
+        for (String s : tools) {
+            sb.append(s+"\n");
+        }
+        return sb.toString();
+
+    }
+
+    private void execute(String[] args) throws Exception {
+        String tool = args[0];
+        String[] subsetArgs = new String[args.length-1];
+        System.arraycopy(args, 1, subsetArgs, 0, args.length - 1);
+        if (tool.equals("Report")) {
+            handleReport(subsetArgs);
+        } else if (tool.equals("Compare")) {
+            handleCompare(subsetArgs);
+        } else if (tool.equals("Profile")) {
+            handleProfile(subsetArgs);
+        } else if (tool.equals("StartDB")) {
+            handleStartDB(subsetArgs);
+        } else if (tool.equals("FileProfile")){
+            handleProfileFiles(subsetArgs);
+        } else {
+            System.out.println(specifyTools());
+        }
+    }
+
+    private void handleProfileFiles(String[] subsetArgs) throws Exception {
+        List<String> argList = new ArrayList<>(Arrays.asList(subsetArgs));
+
+        boolean containsBC = false;
+        //confirm there's a batch-config file
+        for (int i = 0; i < argList.size(); i++) {
+            String arg = argList.get(i);
+            if (arg.equals("-bc")) {
+                containsBC = true;
+            }
+        }
+
+        Path tmpBCConfig = null;
+        try {
+            tmpBCConfig = Files.createTempFile("tika-eval-profiler", ".xml");
+            if (! containsBC) {
+                try (InputStream is = this.getClass().getResourceAsStream("/tika-eval-file-profiler-config.xml")) {
+                    Files.copy(is, tmpBCConfig, StandardCopyOption.REPLACE_EXISTING);
+                }
+                argList.add("-bc");
+                argList.add(tmpBCConfig.toAbsolutePath().toString());
+            }
+
+            String[] updatedArgs = argList.toArray(new String[argList.size()]);
+            DefaultParser defaultCLIParser = new DefaultParser();
+            try {
+                CommandLine commandLine = defaultCLIParser.parse(FileProfiler.OPTIONS, updatedArgs);
+                if (commandLine.hasOption("db") && commandLine.hasOption("jdbc")) {
+                    System.out.println("Please specify either the default -db or the full -jdbc, not both");
+                    ExtractProfiler.USAGE();
+                    return;
+                }
+            } catch (ParseException e) {
+                System.out.println(e.getMessage()+"\n");
+                FileProfiler.USAGE();
+                return;
+            }
+
+            FSBatchProcessCLI.main(updatedArgs);
+        } finally {
+            if (tmpBCConfig != null && Files.isRegularFile(tmpBCConfig)) {
+                Files.delete(tmpBCConfig);
+            }
+        }
+    }
+
+    private void handleStartDB(String[] args) throws SQLException {
+        List<String> argList = new ArrayList<>();
+        argList.add("-web");
+        Console.main(argList.toArray(new String[0]));
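+        //block here so that the H2 web console started above stays up until the process is killed or interrupted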
+        while(true) {
+            try {
+                Thread.sleep(1000);
+            } catch (InterruptedException e){
+                break;
+            }
+        }
+    }
+
+    private void handleProfile(String[] subsetArgs) throws Exception {
+        List<String> argList = new ArrayList<>(Arrays.asList(subsetArgs));
+
+        boolean containsBC = false;
+        String inputDir = null;
+        String extracts = null;
+        String alterExtract = null;
+        //confirm there's a batch-config file
+        for (int i = 0; i < argList.size(); i++) {
+            String arg = argList.get(i);
+            if (arg.equals("-bc")) {
+                containsBC = true;
+            } else if (arg.equals("-inputDir")) {
+                if (i+1 >= argList.size()) {
+                    System.err.println("Must specify directory after -inputDir");
+                    ExtractProfiler.USAGE();
+                    return;
+                }
+                inputDir = argList.get(i+1);
+                i++;
+            } else if (arg.equals("-extracts")) {
+                if (i+1 >= argList.size()) {
+                    System.err.println("Must specify directory after -extracts");
+                    ExtractProfiler.USAGE();
+                    return;
+                }
+                extracts = argList.get(i+1);
+                i++;
+            } else if (arg.equals("-alterExtract")) {
+                if (i+1 >= argList.size()) {
+                    System.err.println("Must specify type 'as_is', 'first_only' or " +
+                            "'concatenate_content' after -alterExtract");
+                    ExtractComparer.USAGE();
+                    return;
+                }
+                alterExtract = argList.get(i+1);
+                i++;
+            }
+        }
+
+        if (alterExtract != null && !alterExtract.equals("as_is") &&
+                !alterExtract.equals("concatenate_content") &&
+                !alterExtract.equals("first_only")) {
+            System.out.println("Sorry, I don't understand: "+alterExtract+
+                    ". The values must be one of: as_is, first_only, concatenate_content");
+            ExtractProfiler.USAGE();
+            return;
+        }
+
+        //need to specify each in this commandline
+        //if only extracts is passed to tika-batch,
+        //the crawler will see no inputDir and start crawling "input".
+        //this allows the user to specify either extracts or inputDir
+        if (extracts == null && inputDir != null) {
+            argList.add("-extracts");
+            argList.add(inputDir);
+        } else if (inputDir == null && extracts != null) {
+            argList.add("-inputDir");
+            argList.add(extracts);
+        }
+
+        Path tmpBCConfig = null;
+        try {
+            tmpBCConfig = Files.createTempFile("tika-eval-profiler", ".xml");
+            if (! containsBC) {
+                try (InputStream is = this.getClass().getResourceAsStream("/tika-eval-profiler-config.xml")) {
+                    Files.copy(is, tmpBCConfig, StandardCopyOption.REPLACE_EXISTING);
+                }
+                argList.add("-bc");
+                argList.add(tmpBCConfig.toAbsolutePath().toString());
+            }
+
+            String[] updatedArgs = argList.toArray(new String[0]);
+            DefaultParser defaultCLIParser = new DefaultParser();
+            try {
+                CommandLine commandLine = defaultCLIParser.parse(ExtractProfiler.OPTIONS, updatedArgs);
+                if (commandLine.hasOption("db") && commandLine.hasOption("jdbc")) {
+                    System.out.println("Please specify either the default -db or the full -jdbc, not both");
+                    ExtractProfiler.USAGE();
+                    return;
+                }
+            } catch (ParseException e) {
+                System.out.println(e.getMessage()+"\n");
+                ExtractProfiler.USAGE();
+                return;
+            }
+
+            FSBatchProcessCLI.main(updatedArgs);
+        } finally {
+            if (tmpBCConfig != null && Files.isRegularFile(tmpBCConfig)) {
+                Files.delete(tmpBCConfig);
+            }
+        }
+    }
+
+    private void handleCompare(String[] subsetArgs) throws Exception{
+        List<String> argList = new ArrayList<>(Arrays.asList(subsetArgs));
+
+        boolean containsBC = false;
+        String inputDir = null;
+        String extractsA = null;
+        String alterExtract = null;
+        //confirm there's a batch-config file
+        for (int i = 0; i < argList.size(); i++) {
+            String arg = argList.get(i);
+            if (arg.equals("-bc")) {
+                containsBC = true;
+            } else if (arg.equals("-inputDir")) {
+                if (i+1 >= argList.size()) {
+                    System.err.println("Must specify directory after -inputDir");
+                    ExtractComparer.USAGE();
+                    return;
+                }
+                inputDir = argList.get(i+1);
+                i++;
+            } else if (arg.equals("-extractsA")) {
+                if (i+1 >= argList.size()) {
+                    System.err.println("Must specify directory after -extractsA");
+                    ExtractComparer.USAGE();
+                    return;
+                }
+                extractsA = argList.get(i+1);
+                i++;
+            } else if (arg.equals("-alterExtract")) {
+                if (i+1 >= argList.size()) {
+                    System.err.println("Must specify type 'as_is', 'first_only' or " +
+                            "'concatenate_content' after -alterExtract");
+                    ExtractComparer.USAGE();
+                    return;
+                }
+                alterExtract = argList.get(i+1);
+                i++;
+            }
+        }
+        if (alterExtract != null && !alterExtract.equals("as_is") &&
+                !alterExtract.equals("concatenate_content") &&
+                !alterExtract.equals("first_only")) {
+            System.out.println("Sorry, I don't understand: "+alterExtract+
+            ". The values must be one of: as_is, first_only, concatenate_content");
+            ExtractComparer.USAGE();
+            return;
+        }
+
+        //need to specify each in the commandline that goes into tika-batch
+        //if only extracts is passed to tika-batch,
+        //the crawler will see no inputDir and start crawling "input".
+        //if the user doesn't specify inputDir, crawl extractsA
+        if (inputDir == null && extractsA != null) {
+            argList.add("-inputDir");
+            argList.add(extractsA);
+        }
+
+        Path tmpBCConfig = null;
+        try {
+            tmpBCConfig = Files.createTempFile("tika-eval", ".xml");
+            if (! containsBC) {
+                try (InputStream is = this.getClass().getResourceAsStream("/tika-eval-comparison-config.xml")) {
+                    Files.copy(is, tmpBCConfig, StandardCopyOption.REPLACE_EXISTING);
+                }
+                argList.add("-bc");
+                argList.add(tmpBCConfig.toAbsolutePath().toString());
+
+            }
+            String[] updatedArgs = argList.toArray(new String[0]);
+            DefaultParser defaultCLIParser = new DefaultParser();
+            try {
+                CommandLine commandLine = defaultCLIParser.parse(ExtractComparer.OPTIONS, updatedArgs);
+                if (commandLine.hasOption("db") && commandLine.hasOption("jdbc")) {
+                    System.out.println("Please specify either the default -db or the full -jdbc, not both");
+                    ExtractComparer.USAGE();
+                    return;
+                }
+            } catch (ParseException e) {
+                System.out.println(e.getMessage()+"\n");
+                ExtractComparer.USAGE();
+                return;
+            }
+
+            FSBatchProcessCLI.main(updatedArgs);
+        } finally {
+            if (tmpBCConfig != null && Files.isRegularFile(tmpBCConfig)) {
+                Files.delete(tmpBCConfig);
+            }
+        }
+    }
+
+    private void handleReport(String[] subsetArgs) throws Exception {
+        ResultsReporter.main(subsetArgs);
+    }
+
+    public static void main(String[] args) throws Exception {
+        TikaEvalCLI cli = new TikaEvalCLI();
+        if (args.length == 0) {
+            System.err.println(specifyTools());
+            return;
+        }
+        cli.execute(args);
+    }
+}
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/XMLErrorLogUpdater.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/XMLErrorLogUpdater.java
new file mode 100644
index 0000000..9a8862a
--- /dev/null
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/XMLErrorLogUpdater.java
@@ -0,0 +1,223 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval.app;
+
+
+import javax.xml.stream.XMLInputFactory;
+import javax.xml.stream.XMLStreamConstants;
+import javax.xml.stream.XMLStreamException;
+import javax.xml.stream.XMLStreamReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.StringReader;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.sql.Connection;
+import java.sql.ResultSet;
+import java.sql.SQLException;
+import java.sql.Statement;
+
+import org.apache.commons.io.IOExceptionWithCause;
+import org.apache.log4j.Level;
+import org.apache.tika.eval.app.db.Cols;
+import org.apache.tika.eval.app.db.H2Util;
+import org.apache.tika.eval.app.db.JDBCUtil;
+import org.apache.tika.eval.app.db.TableInfo;
+import org.apache.tika.eval.app.io.XMLLogMsgHandler;
+import org.apache.tika.eval.app.io.XMLLogReader;
+import org.apache.tika.eval.app.reports.ResultsReporter;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * This is a very task-specific class that reads a log file and updates
+ * the "comparisons" table.  It should not be run in a multithreaded environment.
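+ *
+ * Arguments to {@link #main(String[])}: the xml error log for extracts A, the xml error log for extracts B, and the db path.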
+ */
+public class XMLErrorLogUpdater {
+    private static final Logger LOG = LoggerFactory.getLogger(XMLErrorLogUpdater.class);
+
+    private Statement statement;
+
+    public static void main(String[] args) throws Exception {
+        XMLErrorLogUpdater writer = new XMLErrorLogUpdater();
+        Path xmlLogFileA = Paths.get(args[0]);
+        Path xmlLogFileB = Paths.get(args[1]);
+        Path db = Paths.get(args[2]);
+        JDBCUtil dbUtil = new H2Util(db);
+        Connection connection = dbUtil.getConnection();
+        writer.update(connection, ExtractComparer.EXTRACT_EXCEPTION_TABLE_A, xmlLogFileA);
+        writer.update(connection, ExtractComparer.EXTRACT_EXCEPTION_TABLE_B, xmlLogFileB);
+        connection.commit();
+    }
+
+    public void update(Connection connection, TableInfo tableInfo, Path xmlLogFile) throws Exception {
+        statement = connection.createStatement();
+        XMLLogReader reader = new XMLLogReader();
+        try (InputStream is = Files.newInputStream(xmlLogFile)) {
+            reader.read(is, new ErrorMsgUpdater(tableInfo.getName()));
+        } catch (IOException e) {
+            throw new RuntimeException("Problem reading: "+xmlLogFile.toAbsolutePath().toString(), e);
+        } finally {
+            try {
+                connection.commit();
+                statement.close();
+            } catch (SQLException e) {
+                throw new RuntimeException("Failed to close db connection!", e);
+            }
+        }
+    }
+
+    private class ErrorMsgUpdater implements XMLLogMsgHandler {
+        private final String errorTablename;
+
+        private ErrorMsgUpdater(String errorTablename) {
+            this.errorTablename = errorTablename;
+        }
+
+        @Override
+        public void handleMsg(Level level, String xml) throws SQLException, IOException {
+            if (! level.equals(Level.ERROR)) {
+                return;
+            }
+            XMLStreamReader reader = null;
+            try {
+                reader = XMLInputFactory.newInstance().createXMLStreamReader(new StringReader(xml));
+            } catch (XMLStreamException e) {
+                throw new IOExceptionWithCause(e);
+            }
+            String type = null;
+            String resourceId = null;
+            try {
+                while (reader.hasNext() && type == null && resourceId == null) {
+                    reader.next();
+                    switch (reader.getEventType()) {
+                        case XMLStreamConstants.START_ELEMENT:
+                            if ("timed_out".equals(reader.getLocalName())) {
+                                resourceId = reader.getAttributeValue("", "resourceId");
+                                update(errorTablename, resourceId,
+                                        AbstractProfiler.PARSE_ERROR_TYPE.TIMEOUT);
+
+                            } else if ("oom".equals(reader.getLocalName())) {
+                                resourceId = reader.getAttributeValue("", "resourceId");
+                                update(errorTablename, resourceId, AbstractProfiler.PARSE_ERROR_TYPE.OOM);
+                            }
+                            break;
+                    }
+                }
+                reader.close();
+            } catch (XMLStreamException e) {
+                throw new IOExceptionWithCause(e);
+            }
+        }
+
+        private void update(String errorTableName,
+                            String filePath, AbstractProfiler.PARSE_ERROR_TYPE type) throws SQLException {
+            int containerId = getContainerId(filePath);
+            String sql = "SELECT count(1) from "+errorTableName +
+                    " where "+ Cols.CONTAINER_ID +
+                    " = "+containerId + " or "+
+                    Cols.FILE_PATH + "='"+filePath+"'";
+            ResultSet rs = statement.executeQuery(sql);
+
+            //now try to figure out if that file already exists
+            //in parse errors
+            int hitCount = 0;
+            while (rs.next()) {
+                hitCount = rs.getInt(1);
+            }
+
+            //if it does, update all records matching that path or container id
+            if (hitCount > 0) {
+                sql = "UPDATE " + errorTableName +
+                        " SET " + Cols.PARSE_ERROR_ID +
+                        " = " + type.ordinal() + ","+
+                        Cols.FILE_PATH + "='" +filePath+"'"+
+                        " where "+Cols.CONTAINER_ID +
+                        "="+containerId + " or "+
+                        Cols.FILE_PATH + "='"+filePath+"'";;
+
+            } else {
+                //if not and container id > -1
+                //insert full record
+                if (containerId > -1) {
+                    sql = "INSERT INTO " + errorTableName +
+                            " ("+Cols.CONTAINER_ID+","+Cols.FILE_PATH +","+Cols.PARSE_ERROR_ID +")"+
+                            " values (" + containerId + ", '" + filePath + "'," +
+                            type.ordinal() + ");";
+                } else {
+                    //if container id == -1, insert only file path and parse error type id
+                    sql = "INSERT INTO " + errorTableName +
+                            " ("+Cols.FILE_PATH.name()+","+Cols.PARSE_ERROR_ID +")"+
+                            "values ('" + filePath + "'," +
+                            type.ordinal() + ");";
+                }
+
+            }
+            int updated = statement.executeUpdate(sql);
+            if (updated == 0) {
+                LOG.warn("Made no updates in XMLErrorLogUpdater for: {}", filePath);
+            } else if (updated > 1) {
+                LOG.warn("Made more than one update ({}) in XMLErrorLogUpdater for: {}", updated, filePath);
+            }
+        }
+
+        private int getContainerId(String resourceId) throws SQLException {
+            int containerId = -1;
+            String sql = "SELECT " + Cols.CONTAINER_ID.name() +
+                    " from " + ExtractProfiler.CONTAINER_TABLE.getName()+
+                    " where " + Cols.FILE_PATH +
+                    " ='"+resourceId+"'";
+            ResultSet rs = statement.executeQuery(sql);
+            int resultCount = 0;
+            while (rs.next()) {
+                containerId = rs.getInt(1);
+                resultCount++;
+            }
+            rs.close();
+
+            if (resultCount == 0) {
+                LOG.warn("Should have found a container for: {}", resourceId);
+            } else if (resultCount > 1) {
+                LOG.error("Records ids should be unique: {}", resourceId);
+            }
+/*
+            if (containerId < 0) {
+                System.err.println("CONTAINER ID < 0!!!");
+                sql = "SELECT MAX("+ Cols.CONTAINER_ID.name() +
+                        ") from "+ExtractProfiler.CONTAINER_TABLE.getName();
+                rs = statement.executeQuery(sql);
+                while (rs.next()) {
+                    containerId = rs.getInt(1);
+                }
+                rs.close();
+                if (containerId < 0) {
+                    //log and abort
+                    //return -1?
+                } else {
+                    containerId++;
+                }
+
+            }*/
+            return containerId;
+        }
+
+
+    }
+
+}
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/batch/DBConsumersManager.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/batch/DBConsumersManager.java
new file mode 100644
index 0000000..7819893
--- /dev/null
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/batch/DBConsumersManager.java
@@ -0,0 +1,103 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval.app.batch;
+
+import java.io.IOException;
+import java.nio.file.Path;
+import java.sql.Connection;
+import java.sql.SQLException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.tika.batch.ConsumersManager;
+import org.apache.tika.batch.FileResourceConsumer;
+import org.apache.tika.eval.app.AbstractProfiler;
+import org.apache.tika.eval.app.XMLErrorLogUpdater;
+import org.apache.tika.eval.app.db.JDBCUtil;
+import org.apache.tika.eval.app.db.MimeBuffer;
+import org.apache.tika.eval.app.db.TableInfo;
+
+
+public class DBConsumersManager extends ConsumersManager {
+
+    private final Connection conn;
+    private final MimeBuffer mimeBuffer;
+    private final List<LogTablePair> errorLogs = new ArrayList<>();
+
+    public DBConsumersManager(JDBCUtil dbUtil, MimeBuffer mimeBuffer, List<FileResourceConsumer> consumers)
+            throws SQLException {
+        super(consumers);
+        this.conn = dbUtil.getConnection();
+        this.mimeBuffer = mimeBuffer;
+    }
+
+
+    @Override
+    public void shutdown() {
+
+        for (FileResourceConsumer consumer : getConsumers()) {
+            if (consumer instanceof AbstractProfiler) {
+                try{
+                    ((AbstractProfiler)consumer).closeWriter();
+                } catch (IOException e) {
+                    throw new RuntimeException(e);
+                }
+            }
+        }
+
+        try {
+            mimeBuffer.close();
+        } catch (SQLException e) {
+            throw new RuntimeException(e);
+        }
+
+        //MUST HAPPEN AFTER consumers have closed and
+        //committed container information!!!
+        XMLErrorLogUpdater up = new XMLErrorLogUpdater();
+        for (LogTablePair p : errorLogs) {
+            try {
+                up.update(conn, p.tableInfo, p.log);
+            } catch (Exception e) {
+                throw new RuntimeException(e);
+            }
+        }
+
+
+        try {
+            conn.commit();
+        } catch (SQLException e) {
+            throw new RuntimeException(e);
+        }
+        try {
+            conn.close();
+        } catch (SQLException e) {
+            throw new RuntimeException(e);
+        }
+    }
+
+    public void addErrorLogTablePair(Path log, TableInfo tableInfo) {
+        LogTablePair p = new LogTablePair();
+        p.log = log;
+        p.tableInfo = tableInfo;
+        errorLogs.add(p);
+    }
+
+    class LogTablePair {
+        Path log;
+        TableInfo tableInfo;
+    }
+}
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/batch/EvalConsumerBuilder.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/batch/EvalConsumerBuilder.java
new file mode 100644
index 0000000..9db92a7
--- /dev/null
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/batch/EvalConsumerBuilder.java
@@ -0,0 +1,240 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.eval.app.batch;
+
+import java.io.IOException;
+import java.sql.Connection;
+import java.sql.ResultSet;
+import java.sql.SQLException;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.concurrent.ArrayBlockingQueue;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import org.apache.tika.batch.FileResource;
+import org.apache.tika.batch.FileResourceConsumer;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.eval.app.AbstractProfiler;
+import org.apache.tika.eval.app.db.Cols;
+import org.apache.tika.eval.app.db.JDBCUtil;
+import org.apache.tika.eval.app.db.MimeBuffer;
+import org.apache.tika.eval.app.db.TableInfo;
+import org.apache.tika.eval.app.io.DBWriter;
+import org.apache.tika.eval.app.io.ExtractReader;
+import org.apache.tika.eval.app.io.ExtractReaderException;
+import org.apache.tika.eval.app.io.IDBWriter;
+import org.apache.tika.util.PropsUtil;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public abstract class EvalConsumerBuilder {
+
+    private static final Logger LOG = LoggerFactory.getLogger(EvalConsumerBuilder.class);
+
+    protected ArrayBlockingQueue<FileResource> queue;
+    Map<String, String> localAttrs;
+    JDBCUtil dbUtil;
+    private MimeBuffer mimeBuffer;
+    AtomicInteger initialized = new AtomicInteger(0);
+
+    public MimeBuffer init(ArrayBlockingQueue<FileResource> queue, Map<String, String> localAttrs,
+                     JDBCUtil dbUtil, boolean forceDrop) throws IOException, SQLException {
+        if (initialized.getAndIncrement() > 0) {
+            throw new RuntimeException("Can only init a consumer builder once!");
+        }
+        this.queue = queue;
+        this.localAttrs = localAttrs;
+        this.dbUtil = dbUtil;
+        //the order of the following is critical
+        //step 1. update the table names with prefixes
+        updateTableInfosWithPrefixes(localAttrs);
+
+        JDBCUtil.CREATE_TABLE createRegularTable = (forceDrop) ? JDBCUtil.CREATE_TABLE.DROP_IF_EXISTS :
+                JDBCUtil.CREATE_TABLE.THROW_EX_IF_EXISTS;
+
+        JDBCUtil.CREATE_TABLE createRefTable = (forceDrop) ? JDBCUtil.CREATE_TABLE.DROP_IF_EXISTS :
+                JDBCUtil.CREATE_TABLE.SKIP_IF_EXISTS;
+
+        //step 2. create the tables
+        dbUtil.createTables(getNonRefTableInfos(), createRegularTable);
+        dbUtil.createTables(getRefTableInfos(), createRefTable);
+
+        //step 3. create mime buffer
+        this.mimeBuffer = new MimeBuffer(dbUtil.getConnection(), getMimeTable(),
+                TikaConfig.getDefaultConfig());
+
+        //step 4. populate the reference tables
+        populateRefTables();
+
+        return mimeBuffer;
+    }
+
+    public abstract FileResourceConsumer build() throws IOException, SQLException;
+
+    protected abstract void updateTableInfosWithPrefixes(Map<String, String> attrs);
+
+    /**
+     *
+     * @return only the ref tables
+     */
+    protected abstract List<TableInfo> getRefTableInfos();
+
+    /**
+     *
+     * @return the main tables, not including the ref tables
+     */
+    protected abstract List<TableInfo> getNonRefTableInfos();
+
+    protected abstract TableInfo getMimeTable();
+
+    protected abstract void addErrorLogTablePairs(DBConsumersManager manager);
+
+    public void populateRefTables() throws IOException, SQLException {
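+        //if every ref table already has at least one row, assume they are populated;
+        //otherwise (re)write the parse error, parse exception and extract exception
+        //reference rows from the corresponding enums (ordinal -> id, name -> description)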
+        boolean refTablesPopulated = true;
+        try{
+            Connection connection = dbUtil.getConnection();
+            for (TableInfo tableInfo : getRefTableInfos()) {
+                int rows = 0;
+                try (ResultSet rs = connection.createStatement().executeQuery("select * from "+
+                        tableInfo.getName())) {
+                    while (rs.next()) {
+                        rows++;
+                    }
+                }
+                if (rows == 0) {
+                    refTablesPopulated = false;
+                    break;
+                }
+
+            }
+        } catch (SQLException e) {
+            //swallow
+        }
+        if (refTablesPopulated) {
+            LOG.info("ref tables are already populated");
+            return;
+        }
+
+        IDBWriter writer = getDBWriter(getRefTableInfos());
+        Map<Cols, String> m = new HashMap<>();
+        for (AbstractProfiler.PARSE_ERROR_TYPE t : AbstractProfiler.PARSE_ERROR_TYPE.values()) {
+            m.clear();
+            m.put(Cols.PARSE_ERROR_ID, Integer.toString(t.ordinal()));
+            m.put(Cols.PARSE_ERROR_DESCRIPTION, t.name());
+            writer.writeRow(AbstractProfiler.REF_PARSE_ERROR_TYPES, m);
+        }
+
+        for (AbstractProfiler.EXCEPTION_TYPE t : AbstractProfiler.EXCEPTION_TYPE.values()) {
+            m.clear();
+            m.put(Cols.PARSE_EXCEPTION_ID, Integer.toString(t.ordinal()));
+            m.put(Cols.PARSE_EXCEPTION_DESCRIPTION, t.name());
+            writer.writeRow(AbstractProfiler.REF_PARSE_EXCEPTION_TYPES, m);
+        }
+
+        for (ExtractReaderException.TYPE t :
+                ExtractReaderException.TYPE.values()) {
+            m.clear();
+            m.put(Cols.EXTRACT_EXCEPTION_ID, Integer.toString(t.ordinal()));
+            m.put(Cols.EXTRACT_EXCEPTION_DESCRIPTION, t.name());
+            writer.writeRow(AbstractProfiler.REF_EXTRACT_EXCEPTION_TYPES, m);
+        }
+        writer.close();
+    }
+
+    protected IDBWriter getDBWriter(List<TableInfo> tableInfos) throws IOException, SQLException {
+        Connection conn = dbUtil.getConnection();
+        return new DBWriter(conn, tableInfos, dbUtil, mimeBuffer);
+    }
+
+    ExtractReader.ALTER_METADATA_LIST getAlterMetadata(Map<String, String> localAttrs) {
+
+        String alterExtractString = localAttrs.get("alterExtract");
+        ExtractReader.ALTER_METADATA_LIST alterExtractList;
+        if (alterExtractString == null || alterExtractString.equalsIgnoreCase("as_is")) {
+            alterExtractList = ExtractReader.ALTER_METADATA_LIST.AS_IS;
+        } else if (alterExtractString.equalsIgnoreCase("first_only")) {
+            alterExtractList = ExtractReader.ALTER_METADATA_LIST.FIRST_ONLY;
+        } else if (alterExtractString.equalsIgnoreCase("concatenate_content")) {
+            alterExtractList = ExtractReader.ALTER_METADATA_LIST.CONCATENATE_CONTENT_INTO_FIRST;
+        } else {
+            throw new RuntimeException("options for alterExtract: as_is, first_only, concatenate_content." +
+                    " I don't understand:" + alterExtractString);
+        }
+        return alterExtractList;
+    }
+
+    protected ExtractReader buildExtractReader(Map<String, String> localAttrs) {
+        long minExtractLength = PropsUtil.getLong(localAttrs.get("minExtractLength"), -1L);
+        long maxExtractLength = PropsUtil.getLong(localAttrs.get("maxExtractLength"), -1L);
+
+        ExtractReader.ALTER_METADATA_LIST alterExtractList = getAlterMetadata(localAttrs);
+        return new ExtractReader(alterExtractList, minExtractLength, maxExtractLength);
+    }
+
+    FileResourceConsumer parameterizeProfiler(AbstractProfiler abstractProfiler) {
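+        //-2 is used here as a "not configured" sentinel; only override the profiler's
+        //defaults when the corresponding attribute was explicitly supplied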
+
+        int maxContentLength = PropsUtil.getInt(localAttrs.get("maxContentLength"), -2);
+        if (maxContentLength > -2) {
+            abstractProfiler.setMaxContentLength(maxContentLength);
+        }
+
+        int maxContentLengthForLangId = PropsUtil.getInt(localAttrs.get("maxContentLengthForLangId"), -2);
+        if (maxContentLengthForLangId > -2) {
+            abstractProfiler.setMaxContentLengthForLangId(maxContentLengthForLangId);
+        }
+
+        int maxTokens = PropsUtil.getInt(localAttrs.get("maxTokens"), -2);
+        if (maxTokens > -2) {
+            abstractProfiler.setMaxTokens(maxTokens);
+        }
+
+
+        return abstractProfiler;
+    }
+
+
+/*
+    public abstract Map<String, String> getIndexInfo();
+
+    class ValueComparator implements Comparator<String> {
+
+        Map<String, ColInfo> map;
+
+        public ValueComparator(Map<String, ColInfo> base) {
+            this.map = base;
+        }
+
+        public int compare(String a, String b) {
+            Integer aVal = map.get(a).getDBColOffset();
+            Integer bVal = map.get(b).getDBColOffset();
+            if (aVal == null || bVal == null) {
+                throw new IllegalArgumentException("Column offset must be specified!");
+            }
+            if (aVal == bVal && ! map.get(a).equals(map.get(b))) {
+                throw new IllegalArgumentException("Column offsets must be unique: " + a + " and " + b + " both have: "+aVal);
+            }
+            if (aVal < bVal) {
+                return -1;
+            } else {
+                return 1;
+            }
+        }
+    }
+*/
+}
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/batch/EvalConsumersBuilder.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/batch/EvalConsumersBuilder.java
new file mode 100644
index 0000000..32c4bf4
--- /dev/null
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/batch/EvalConsumersBuilder.java
@@ -0,0 +1,130 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval.app.batch;
+
+
+import java.io.IOException;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.sql.SQLException;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+import java.util.concurrent.ArrayBlockingQueue;
+
+import org.apache.tika.batch.ConsumersManager;
+import org.apache.tika.batch.FileResource;
+import org.apache.tika.batch.FileResourceConsumer;
+import org.apache.tika.batch.builders.AbstractConsumersBuilder;
+import org.apache.tika.batch.builders.BatchProcessBuilder;
+import org.apache.tika.eval.app.AbstractProfiler;
+import org.apache.tika.eval.app.db.H2Util;
+import org.apache.tika.eval.app.db.JDBCUtil;
+import org.apache.tika.eval.app.db.MimeBuffer;
+import org.apache.tika.util.ClassLoaderUtil;
+import org.apache.tika.util.PropsUtil;
+import org.apache.tika.util.XMLDOMUtil;
+import org.w3c.dom.Node;
+
+public class EvalConsumersBuilder extends AbstractConsumersBuilder {
+
+    @Override
+    public ConsumersManager build(Node node, Map<String, String> runtimeAttributes,
+                                  ArrayBlockingQueue<FileResource> queue) {
+
+        List<FileResourceConsumer> consumers = new LinkedList<>();
+        int numConsumers = BatchProcessBuilder.getNumConsumers(runtimeAttributes);
+
+        Map<String, String> localAttrs = XMLDOMUtil.mapifyAttrs(node, runtimeAttributes);
+
+
+        Path db = getPath(localAttrs, "db");
+        String jdbcConnectionString = localAttrs.get("jdbc");
+
+
+        Path commonTokens = getPath(localAttrs, "commonTokens");
+        String defaultLangCode = localAttrs.get("defaultLangCode");
+        if (defaultLangCode == null) {
+            defaultLangCode = "";
+        }
+        //can be null, in which case will load from memory
+        try {
+            AbstractProfiler.loadCommonTokens(commonTokens, defaultLangCode);
+        } catch (IOException e) {
+            throw new RuntimeException(e);
+        }
+
+        JDBCUtil jdbcUtil = null;
+        if (db != null) {
+            jdbcUtil = new H2Util(db);
+        } else if (jdbcConnectionString != null) {
+            jdbcUtil = new JDBCUtil(jdbcConnectionString, localAttrs.get("jdbcDriver"));
+        } else {
+            throw new RuntimeException("Must specify: -db or -jdbc");
+        }
+        EvalConsumerBuilder consumerBuilder = ClassLoaderUtil.buildClass(EvalConsumerBuilder.class,
+                PropsUtil.getString(localAttrs.get("consumerBuilderClass"), null));
+        if (consumerBuilder == null) {
+            throw new RuntimeException("Must specify consumerBuilderClass in config file");
+        }
+
+        boolean forceDrop = PropsUtil.getBoolean(localAttrs.get("drop"), false);
+        MimeBuffer mimeBuffer = null;
+        try {
+            mimeBuffer = consumerBuilder.init(queue, localAttrs, jdbcUtil, forceDrop);
+        } catch (IOException | SQLException e) {
+            throw new RuntimeException(e);
+        }
+
+        for (int i = 0; i < numConsumers; i++) {
+            try {
+                consumers.add(consumerBuilder.build());
+            } catch (IOException | SQLException e) {
+                throw new RuntimeException(e);
+            }
+        }
+
+        DBConsumersManager manager;
+        try {
+            manager = new DBConsumersManager(jdbcUtil, mimeBuffer, consumers);
+        } catch (SQLException e) {
+            throw new RuntimeException(e);
+        }
+        consumerBuilder.addErrorLogTablePairs(manager);
+
+        return manager;
+    }
+
+    private Path getNonNullPath(Map<String, String> attrs, String key) {
+        Path p = getPath(attrs, key);
+        if (p == null) {
+            throw new RuntimeException("Must specify a file for this attribute: "+key);
+        }
+        return p;
+    }
+
+
+    protected Path getPath(Map<String, String> attrs, String key) {
+        String filePath = attrs.get(key);
+        if (filePath == null) {
+            return null;
+        }
+        return Paths.get(filePath);
+    }
+
+
+}
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/batch/ExtractComparerBuilder.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/batch/ExtractComparerBuilder.java
new file mode 100644
index 0000000..b0252a9
--- /dev/null
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/batch/ExtractComparerBuilder.java
@@ -0,0 +1,165 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval.app.batch;
+
+
+import java.io.IOException;
+import java.nio.file.Path;
+import java.sql.SQLException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.tika.batch.FileResourceConsumer;
+import org.apache.tika.eval.app.AbstractProfiler;
+import org.apache.tika.eval.app.ExtractComparer;
+import org.apache.tika.eval.app.db.TableInfo;
+import org.apache.tika.util.PropsUtil;
+
+public class ExtractComparerBuilder extends EvalConsumerBuilder {
+    public final static String TABLE_PREFIX_A_KEY = "tablePrefixA";
+    public final static String TABLE_PREFIX_B_KEY = "tablePrefixB";
+
+    private final List<TableInfo> tableInfosA;
+    private final List<TableInfo> tableInfosB;
+    private final List<TableInfo> tableInfosAandB;
+    private final List<TableInfo> refTableInfos;
+
+    public ExtractComparerBuilder() {
+        List<TableInfo> tableInfosA = new ArrayList<>();
+        List<TableInfo> tableInfosB = new ArrayList<>();
+        List<TableInfo> tableInfosAandB = new ArrayList<>();
+        tableInfosA.add(ExtractComparer.PROFILES_A);
+        tableInfosA.add(ExtractComparer.EXCEPTION_TABLE_A);
+        tableInfosA.add(ExtractComparer.TAGS_TABLE_A);
+        tableInfosA.add(ExtractComparer.CONTENTS_TABLE_A);
+        tableInfosA.add(ExtractComparer.EXTRACT_EXCEPTION_TABLE_A);
+        tableInfosA.add(ExtractComparer.EMBEDDED_FILE_PATH_TABLE_A);
+
+        tableInfosB.add(ExtractComparer.PROFILES_B);
+        tableInfosB.add(ExtractComparer.EXCEPTION_TABLE_B);
+        tableInfosB.add(ExtractComparer.EXTRACT_EXCEPTION_TABLE_B);
+        tableInfosB.add(ExtractComparer.TAGS_TABLE_B);
+        tableInfosB.add(ExtractComparer.CONTENTS_TABLE_B);
+        tableInfosB.add(ExtractComparer.EMBEDDED_FILE_PATH_TABLE_B);
+
+        tableInfosAandB.add(ExtractComparer.COMPARISON_CONTAINERS);
+        tableInfosAandB.add(ExtractComparer.CONTENT_COMPARISONS);
+        tableInfosAandB.add(AbstractProfiler.MIME_TABLE);
+
+        List<TableInfo> refTableInfos = new ArrayList<>();
+        refTableInfos.add(ExtractComparer.REF_PAIR_NAMES);
+        refTableInfos.add(AbstractProfiler.REF_PARSE_ERROR_TYPES);
+        refTableInfos.add(AbstractProfiler.REF_PARSE_EXCEPTION_TYPES);
+        refTableInfos.add(AbstractProfiler.REF_EXTRACT_EXCEPTION_TYPES);
+
+        this.tableInfosA = Collections.unmodifiableList(tableInfosA);
+        this.tableInfosB = Collections.unmodifiableList(tableInfosB);
+        this.tableInfosAandB = Collections.unmodifiableList(tableInfosAandB);
+        this.refTableInfos = Collections.unmodifiableList(refTableInfos);
+    }
+
+    @Override
+    public FileResourceConsumer build() throws IOException, SQLException {
+        Path extractsA = PropsUtil.getPath(localAttrs.get("extractsA"), null);
+        if (extractsA == null) {
+            throw new RuntimeException("Must specify \"extractsA\" -- directory for 'A' extracts");
+        }
+        Path extractsB = PropsUtil.getPath(localAttrs.get("extractsB"), null);
+        if (extractsB == null) {
+            throw new RuntimeException("Must specify \"extractsB\" -- directory for 'B' extracts");
+        }
+
+        Path inputRootDir = PropsUtil.getPath(localAttrs.get("inputDir"), null);
+
+        if (inputRootDir == null) {
+            //this is for the sake of the crawler
+            throw new RuntimeException("Must specify an -inputDir");
+        }
+
+        return parameterizeProfiler(new ExtractComparer(queue, inputRootDir, extractsA, extractsB,
+                buildExtractReader(localAttrs),
+                getDBWriter(getNonRefTableInfos())));
+    }
+
+
+    @Override
+    protected void updateTableInfosWithPrefixes(Map<String, String> attrs) {
+        String tablePrefixA = localAttrs.get(TABLE_PREFIX_A_KEY);
+
+        String tablePrefixB = localAttrs.get(TABLE_PREFIX_B_KEY);
+
+        tablePrefixA = (tablePrefixA == null || tablePrefixA.endsWith("_")) ? tablePrefixA : tablePrefixA+"_";
+        tablePrefixB = (tablePrefixB == null || tablePrefixB.endsWith("_")) ? tablePrefixB : tablePrefixB+"_";
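+        //e.g. (illustrative): tablePrefixA="run1" and tablePrefixB="run2" become "run1_"
+        //and "run2_"; tables shared by A and B get the combined prefix "run1_run2_"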
+
+        if (tablePrefixA != null) {
+            for (TableInfo tableInfo : tableInfosA) {
+                tableInfo.setNamePrefix(tablePrefixA);
+            }
+        }
+
+        if (tablePrefixB != null) {
+            for (TableInfo tableInfo : tableInfosB) {
+                tableInfo.setNamePrefix(tablePrefixB);
+            }
+        }
+
+        if (tablePrefixA != null || tablePrefixB != null) {
+            String aAndB = (tablePrefixA == null) ? "" : tablePrefixA;
+            aAndB = (tablePrefixB == null) ? aAndB : aAndB+tablePrefixB;
+            for (TableInfo tableInfo : tableInfosAandB) {
+                tableInfo.setNamePrefix(aAndB);
+            }
+        }
+    }
+
+    @Override
+    protected List<TableInfo> getRefTableInfos() {
+        return refTableInfos;
+    }
+
+    @Override
+    protected List<TableInfo> getNonRefTableInfos() {
+        List<TableInfo> allNonRefTables = new ArrayList<>();
+        allNonRefTables.addAll(tableInfosA);
+        allNonRefTables.addAll(tableInfosB);
+        allNonRefTables.addAll(tableInfosAandB);
+        return Collections.unmodifiableList(allNonRefTables);
+    }
+
+    @Override
+    protected TableInfo getMimeTable() {
+        return AbstractProfiler.MIME_TABLE;
+    }
+
+    @Override
+    protected void addErrorLogTablePairs(DBConsumersManager manager) {
+        Path errorLogA = PropsUtil.getPath(localAttrs.get("errorLogFileA"), null);
+        if (errorLogA == null) {
+            return;
+        }
+        manager.addErrorLogTablePair(errorLogA, ExtractComparer.EXTRACT_EXCEPTION_TABLE_A);
+        Path errorLogB = PropsUtil.getPath(localAttrs.get("errorLogFileB"), null);
+        if (errorLogB == null) {
+            return;
+        }
+        manager.addErrorLogTablePair(errorLogB, ExtractComparer.EXTRACT_EXCEPTION_TABLE_B);
+
+    }
+
+}
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/batch/ExtractProfilerBuilder.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/batch/ExtractProfilerBuilder.java
new file mode 100644
index 0000000..69c34e1
--- /dev/null
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/batch/ExtractProfilerBuilder.java
@@ -0,0 +1,122 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval.app.batch;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.sql.SQLException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.tika.batch.FileResourceConsumer;
+import org.apache.tika.eval.app.AbstractProfiler;
+import org.apache.tika.eval.app.ExtractProfiler;
+import org.apache.tika.eval.app.db.TableInfo;
+import org.apache.tika.util.PropsUtil;
+
+
+public class ExtractProfilerBuilder extends EvalConsumerBuilder {
+
+    public final static String TABLE_PREFIX_KEY = "tablePrefix";
+
+    private final List<TableInfo> tableInfos;
+    private final List<TableInfo> refTableInfos;
+    public ExtractProfilerBuilder() {
+        List<TableInfo> tableInfos = new ArrayList<>();
+        tableInfos.add(AbstractProfiler.MIME_TABLE);
+        tableInfos.add(ExtractProfiler.CONTAINER_TABLE);
+        tableInfos.add(ExtractProfiler.PROFILE_TABLE);
+        tableInfos.add(ExtractProfiler.EXTRACT_EXCEPTION_TABLE);
+        tableInfos.add(ExtractProfiler.EXCEPTION_TABLE);
+        tableInfos.add(ExtractProfiler.CONTENTS_TABLE);
+        tableInfos.add(ExtractProfiler.TAGS_TABLE);
+        tableInfos.add(ExtractProfiler.EMBEDDED_FILE_PATH_TABLE);
+        this.tableInfos = Collections.unmodifiableList(tableInfos);
+
+        List<TableInfo> refTableInfos = new ArrayList<>();
+        refTableInfos.add(AbstractProfiler.REF_PARSE_ERROR_TYPES);
+        refTableInfos.add(AbstractProfiler.REF_PARSE_EXCEPTION_TYPES);
+        refTableInfos.add(AbstractProfiler.REF_EXTRACT_EXCEPTION_TYPES);
+        this.refTableInfos = Collections.unmodifiableList(refTableInfos);
+    }
+
+    @Override
+    public FileResourceConsumer build() throws IOException, SQLException {
+        Path extracts = PropsUtil.getPath(localAttrs.get("extracts"), null);
+        if (extracts == null) {
+            throw new RuntimeException("Must specify \"extracts\" -- directory to crawl");
+        }
+        if (!Files.isDirectory(extracts)) {
+            throw new RuntimeException("ROOT DIRECTORY DOES NOT EXIST: " +
+                    extracts.toAbsolutePath());
+        }
+
+        Path inputDir = PropsUtil.getPath(localAttrs.get("inputDir"), null);
+
+        //we _could_ set this to extracts (if not null)
+        //here, but the Crawler defaults to "input" if nothing is passed
+        //so this won't work
+        if (inputDir == null) {
+            throw new RuntimeException("Must specify -inputDir");
+        }
+        if (extracts == null && inputDir != null) {
+            extracts = inputDir;
+        }
+        return parameterizeProfiler(new ExtractProfiler(queue, inputDir, extracts,
+                buildExtractReader(localAttrs),
+                getDBWriter(tableInfos)));
+    }
+
+
+    @Override
+    protected void updateTableInfosWithPrefixes(Map<String, String> attrs) {
+        String tableNamePrefix = attrs.get(TABLE_PREFIX_KEY);
+        if (tableNamePrefix != null && !tableNamePrefix.equals("null")) {
+            for (TableInfo tableInfo : tableInfos) {
+                tableInfo.setNamePrefix(tableNamePrefix);
+            }
+        }
+    }
+
+
+    @Override
+    protected List<TableInfo> getRefTableInfos() {
+        return refTableInfos;
+    }
+
+    @Override
+    protected List<TableInfo> getNonRefTableInfos() {
+        return tableInfos;
+    }
+
+    @Override
+    protected TableInfo getMimeTable() {
+        return AbstractProfiler.MIME_TABLE;
+    }
+
+    @Override
+    protected void addErrorLogTablePairs(DBConsumersManager manager) {
+        Path errorLog = PropsUtil.getPath(localAttrs.get("errorLogFile"), null);
+        if (errorLog == null) {
+            return;
+        }
+        manager.addErrorLogTablePair(errorLog, ExtractProfiler.EXTRACT_EXCEPTION_TABLE);
+    }
+}
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/batch/FileProfilerBuilder.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/batch/FileProfilerBuilder.java
new file mode 100644
index 0000000..29412d3
--- /dev/null
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/batch/FileProfilerBuilder.java
@@ -0,0 +1,98 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval.app.batch;
+
+import java.io.IOException;
+import java.nio.file.Path;
+import java.sql.SQLException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.tika.batch.FileResourceConsumer;
+import org.apache.tika.eval.app.ExtractProfiler;
+import org.apache.tika.eval.app.FileProfiler;
+import org.apache.tika.eval.app.db.TableInfo;
+import org.apache.tika.util.PropsUtil;
+
+
+public class FileProfilerBuilder extends EvalConsumerBuilder {
+
+    public final static String TABLE_PREFIX_KEY = "tablePrefix";
+
+    private final List<TableInfo> tableInfos;
+
+    public FileProfilerBuilder() {
+        List<TableInfo> tableInfos = new ArrayList<>();
+        tableInfos.add(FileProfiler.FILE_MIME_TABLE);
+        tableInfos.add(FileProfiler.FILE_PROFILES);
+        this.tableInfos = Collections.unmodifiableList(tableInfos);
+
+    }
+
+    @Override
+    public FileResourceConsumer build() throws IOException, SQLException {
+
+        Path inputDir = PropsUtil.getPath(localAttrs.get("inputDir"), null);
+
+        //we _could_ set this to extracts (if not null)
+        //here, but the Crawler defaults to "input" if nothing is passed
+        //so this won't work
+        if (inputDir == null) {
+            throw new RuntimeException("Must specify -inputDir");
+        }
+        return parameterizeProfiler(new FileProfiler(queue, inputDir,
+                getDBWriter(tableInfos)));
+    }
+
+
+    @Override
+    protected void updateTableInfosWithPrefixes(Map<String, String> attrs) {
+        String tableNamePrefix = attrs.get(TABLE_PREFIX_KEY);
+        if (tableNamePrefix != null && !tableNamePrefix.equals("null")) {
+            for (TableInfo tableInfo : tableInfos) {
+                tableInfo.setNamePrefix(tableNamePrefix);
+            }
+        }
+    }
+
+    @Override
+    protected List<TableInfo> getRefTableInfos() {
+        return Collections.emptyList();
+    }
+
+    @Override
+    protected List<TableInfo> getNonRefTableInfos() {
+        return tableInfos;
+    }
+
+    @Override
+    protected TableInfo getMimeTable() {
+        return FileProfiler.FILE_MIME_TABLE;
+    }
+
+    @Override
+    protected void addErrorLogTablePairs(DBConsumersManager manager) {
+        Path errorLog = PropsUtil.getPath(localAttrs.get("errorLogFile"), null);
+        if (errorLog == null) {
+            return;
+        }
+        manager.addErrorLogTablePair(errorLog, ExtractProfiler.EXTRACT_EXCEPTION_TABLE);
+    }
+}
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/db/AbstractDBBuffer.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/db/AbstractDBBuffer.java
new file mode 100644
index 0000000..03e122c
--- /dev/null
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/db/AbstractDBBuffer.java
@@ -0,0 +1,77 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval.app.db;
+
+import java.sql.SQLException;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.concurrent.locks.Lock;
+import java.util.concurrent.locks.ReentrantReadWriteLock;
+
+
+/**
+ * Abstract buffer for map of values and unique ids.
+ * <p>
+ * Use this for fast in memory lookups of smallish sets of values.
+ *
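+ * <p>
+ * Rough usage sketch (values are illustrative; see {@link DBBuffer} for the JDBC-backed
+ * implementation):
+ * <pre>{@code
+ * AbstractDBBuffer buffer = ...;
+ * int id = buffer.getId("application/pdf");   //assigns and writes a new id
+ * int same = buffer.getId("application/pdf"); //returns the cached id, no second write
+ * }</pre>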
+ */
+abstract class AbstractDBBuffer {
+
+    private final Map<String, Integer> m = new HashMap<>();
+    private final ReentrantReadWriteLock rwl = new ReentrantReadWriteLock();
+    private final Lock r = rwl.readLock();
+    private final Lock w = rwl.writeLock();
+
+    private int numWrites = 0;
+
+    public int getId(String key) {
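+        //fast path: return the existing id under the read lock; otherwise take the
+        //write lock, re-check the map (another thread may have added the key), and
+        //assign and persist the next id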
+        r.lock();
+        try {
+            Integer v = m.get(key);
+            if (v != null) {
+                return v;
+            }
+        } finally {
+            r.unlock();
+        }
+
+        try {
+            w.lock();
+            Integer v = m.get(key);
+            if (v != null) {
+                return v;
+            }
+            v = m.size()+1;
+            m.put(key, v);
+            write(v, key);
+            numWrites++;
+            return v;
+        } finally {
+            w.unlock();
+        }
+    }
+
+    public int getNumWrites() {
+        return numWrites;
+    }
+
+    //Odd to throw RuntimeException, I know.  It should be
+    //catastrophic if this buffer can't write to the db.
+    public abstract void write(int id, String value) throws RuntimeException;
+
+    public abstract void close() throws SQLException;
+}
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/db/ColInfo.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/db/ColInfo.java
new file mode 100644
index 0000000..a251e25
--- /dev/null
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/db/ColInfo.java
@@ -0,0 +1,127 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval.app.db;
+
+
+import java.sql.Types;
+import java.util.Objects;
+
+public class ColInfo {
+    private final Cols name;
+    private final int type;
+    private final Integer precision;
+    private final String constraints;
+
+    public ColInfo(Cols name, int type) {
+        this(name, type, null, null);
+    }
+
+    public ColInfo(Cols name, int type, String constraints) {
+        this(name, type, null, constraints);
+    }
+
+    public ColInfo(Cols name, int type, Integer precision) {
+        this(name, type, precision, null);
+    }
+
+
+    public ColInfo(Cols name, int type, Integer precision, String constraints) {
+        this.name = name;
+        this.type = type;
+        this.precision = precision;
+        this.constraints = constraints;
+    }
+
+    public int getType() {
+        return type;
+    }
+
+    public Cols getName() {
+        return name;
+    }
+    /**
+     *
+     * @return constraints string or null
+     */
+    public String getConstraints() {
+        return constraints;
+    }
+
+    /**
+     * Gets the precision.  This can be null!
+     * @return precision or null
+     */
+    public Integer getPrecision() {
+        return precision;
+    }
+
+    public String getSqlDef() {
+        if (type == Types.VARCHAR){
+            return "VARCHAR("+precision+")";
+        } else if (type == Types.CHAR) {
+            return "CHAR("+precision+")";
+        }
+        switch (type) {
+            case Types.FLOAT :
+                return "FLOAT";
+            case Types.DOUBLE :
+                return "DOUBLE";
+            case Types.BLOB :
+                return "BLOB";
+            case Types.INTEGER :
+                return "INTEGER";
+            case Types.BIGINT :
+                return "BIGINT";
+            case Types.BOOLEAN :
+                return "BOOLEAN";
+        }
+        throw new UnsupportedOperationException("Don't yet recognize a type for: "+type);
+    }
+
+    @Override
+    public boolean equals(Object o) {
+        if (this == o) {
+            return true;
+        }
+        if (o == null || getClass() != o.getClass()) {
+            return false;
+        }
+
+        ColInfo colInfo = (ColInfo) o;
+
+        if (type != colInfo.type) {
+            return false;
+        }
+        if (name != colInfo.name) {
+            return false;
+        }
+        if (!Objects.equals(precision, colInfo.precision)) {
+            return false;
+        }
+        return Objects.equals(constraints, colInfo.constraints);
+
+    }
+
+    @Override
+    public int hashCode() {
+        int result = name != null ? name.hashCode() : 0;
+        result = 31 * result + type;
+        result = 31 * result + (precision != null ? precision.hashCode() : 0);
+        result = 31 * result + (constraints != null ? constraints.hashCode() : 0);
+        return result;
+    }
+}
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/db/Cols.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/db/Cols.java
new file mode 100644
index 0000000..883210f
--- /dev/null
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/db/Cols.java
@@ -0,0 +1,115 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval.app.db;
+
+public enum Cols {
+    //container table
+    CONTAINER_ID,
+    FILE_PATH,
+    EXTRACT_FILE_LENGTH,
+
+    EXTRACT_FILE_LENGTH_A, //for comparisons
+    EXTRACT_FILE_LENGTH_B,
+
+    //profile table
+    ID,
+    LENGTH,
+    FILE_NAME,
+    FILE_EXTENSION,
+    ELAPSED_TIME_MILLIS,
+    NUM_METADATA_VALUES,
+    IS_EMBEDDED,
+    EMBEDDED_FILE_PATH,
+    MIME_ID,
+    TIKA_MIME_ID,
+    FILE_MIME_ID,
+    SHA256,
+    MD5,
+    NUM_ATTACHMENTS,
+    HAS_CONTENT,
+
+    //content
+    CONTENT_LENGTH,
+    NUM_UNIQUE_TOKENS,
+    NUM_TOKENS,
+    NUM_UNIQUE_ALPHABETIC_TOKENS,
+    NUM_ALPHABETIC_TOKENS, //alphabetic or ideographic tokens
+    COMMON_TOKENS_LANG, //which language was used for the common tokens metric?
+    NUM_UNIQUE_COMMON_TOKENS,
+    NUM_COMMON_TOKENS,
+    TOP_N_TOKENS,
+    LANG_ID_1,
+    LANG_ID_PROB_1,
+    LANG_ID_2,
+    LANG_ID_PROB_2,
+    TOKEN_ENTROPY_RATE,
+    TOKEN_LENGTH_SUM,
+    TOKEN_LENGTH_MEAN,
+    TOKEN_LENGTH_STD_DEV,
+    UNICODE_CHAR_BLOCKS,
+    NUM_PAGES, //number of pages a document alleges it has
+    CONTENT_TRUNCATED_AT_MAX_LEN, // was the string truncated at AbstractProfiler.MAX_STRING_LENGTH
+
+    //content comparisons
+    TOP_10_UNIQUE_TOKEN_DIFFS_A,
+    TOP_10_UNIQUE_TOKEN_DIFFS_B,
+    TOP_10_MORE_IN_A,
+    TOP_10_MORE_IN_B,
+    OVERLAP,
+    DICE_COEFFICIENT,
+
+    //errors
+    PARSE_ERROR_ID,
+
+    PARSE_ERROR_DESCRIPTION,
+    PARSE_EXCEPTION_DESCRIPTION,
+
+    EXTRACT_EXCEPTION_ID,
+    EXTRACT_EXCEPTION_DESCRIPTION,
+
+
+    //exceptions
+    ORIG_STACK_TRACE,
+    SORT_STACK_TRACE,
+    PARSE_EXCEPTION_ID,
+
+
+    MIME_STRING,//string representation of mime type
+
+    DIR_NAME_A,//for comparisons in REF_PAIR_NAMES
+    DIR_NAME_B,
+    
+    //structure tags
+    TAGS_A,
+    TAGS_B,
+    TAGS_DIV,
+    TAGS_I,
+    TAGS_IMG,
+    TAGS_LI,
+    TAGS_P,
+    TAGS_OL,
+    TAGS_TABLE,
+    TAGS_TD,
+    TAGS_TITLE,
+    TAGS_TR,
+    TAGS_UL,
+    TAGS_U,
+    TAGS_PARSE_EXCEPTION, //if there was a SAX|IO|TikaException while parsing the html or xhtml
+    ;
+
+}
+
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/db/DBBuffer.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/db/DBBuffer.java
new file mode 100644
index 0000000..2cc8cf7
--- /dev/null
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/db/DBBuffer.java
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval.app.db;
+
+
+import java.sql.Connection;
+import java.sql.PreparedStatement;
+import java.sql.SQLException;
+
+public class DBBuffer extends AbstractDBBuffer {
+
+    private final PreparedStatement st;
+
+    public DBBuffer(Connection connection, String tableName,
+                    String idColumnName, String valueColumnName) throws SQLException {
+        st = connection.prepareStatement("insert into "+tableName+ "( "+
+                idColumnName + ", " + valueColumnName+") values (?,?);");
+    }
+
+    @Override
+    public void write(int id, String value) throws RuntimeException {
+        try {
+            st.clearParameters();
+            st.setInt(1, id);
+            st.setString(2, value);
+            st.execute();
+
+        } catch (SQLException e) {
+            throw new RuntimeException(e);
+        }
+    }
+
+    @Override
+    public void close() throws SQLException {
+        st.close();
+
+    }
+
+
+}
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/db/H2Util.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/db/H2Util.java
new file mode 100644
index 0000000..87501c6
--- /dev/null
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/db/H2Util.java
@@ -0,0 +1,89 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval.app.db;
+
+import java.nio.file.Path;
+import java.sql.Connection;
+import java.sql.DriverManager;
+import java.sql.ResultSet;
+import java.sql.SQLException;
+import java.sql.Statement;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.commons.io.FilenameUtils;
+
+
+public class H2Util extends JDBCUtil {
+
+    private final static String DRIVER_CLASS = "org.h2.Driver";
+
+    private final Path db;
+
+    public H2Util(Path db) {
+        super(getConnectionString(db, true), DRIVER_CLASS);
+        this.db = db;
+    }
+
+    public static boolean databaseExists(Path db) {
+        String connString = getConnectionString(db, false);
+        try (Connection conn = DriverManager.getConnection(connString)) {
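+            //intentionally empty: we only care whether the connection can be opened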
+        } catch (SQLException e) {
+            return false;
+        }
+        return true;
+    }
+    @Override
+    public String getJDBCDriverClass() {
+        return "org.h2.Driver";
+    }
+
+    @Override
+    public boolean dropTableIfExists(Connection conn, String tableName) throws SQLException {
+        Statement st = conn.createStatement();
+        String sql = "drop table if exists "+tableName;
+        boolean success = st.execute(sql);
+        st.close();
+        return success;
+    }
+
+    @Override
+    public String getConnectionString() {
+        return getConnectionString(db, true);
+    }
+
+    private static String getConnectionString(Path db, boolean createDBIfItDoesntExist) {
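+        //e.g. (illustrative): for db = /data/tika-eval this yields "jdbc:h2:/data/tika-eval",
+        //with ";IFEXISTS=TRUE" appended when the database must already exist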
+        String s = "jdbc:h2:"+ FilenameUtils.separatorsToUnix(db.toAbsolutePath().toString());
+        if (! createDBIfItDoesntExist) {
+            s += ";IFEXISTS=TRUE";
+        }
+        return s;
+    }
+
+    @Override
+    public Set<String> getTables(Connection connection) throws SQLException {
+        String sql = "SHOW TABLES";
+        Statement st = connection.createStatement();
+        ResultSet rs = st.executeQuery(sql);
+        Set<String> tables = new HashSet<>();
+        while (rs.next()) {
+            String table = rs.getString(1);
+            tables.add(table);
+        }
+        return tables;
+    }
+}
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/db/JDBCUtil.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/db/JDBCUtil.java
new file mode 100644
index 0000000..0211b53
--- /dev/null
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/db/JDBCUtil.java
@@ -0,0 +1,316 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.eval.app.db;
+
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.sql.Connection;
+import java.sql.DatabaseMetaData;
+import java.sql.DriverManager;
+import java.sql.PreparedStatement;
+import java.sql.ResultSet;
+import java.sql.SQLException;
+import java.sql.Statement;
+import java.sql.Types;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+import java.util.Properties;
+import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public class JDBCUtil {
+    private static final Logger LOG = LoggerFactory.getLogger(JDBCUtil.class);
+
+    public enum CREATE_TABLE {
+        DROP_IF_EXISTS,
+        SKIP_IF_EXISTS,
+        THROW_EX_IF_EXISTS,
+    }
+
+    private final String connectionString;
+    private String driverClass;
+    private Connection connection = null;
+
+    public JDBCUtil(String connectionString, String driverClass) {
+        this.connectionString = connectionString;
+        this.driverClass = driverClass;
+        if (driverClass == null || driverClass.length() == 0) {
+            if (System.getProperty("jdbc.drivers") != null) {
+                //user has specified it on the command line
+                //stop now
+            } else {
+                //try to use the mappings in db.properties to determine the class
+                try (InputStream is = JDBCUtil.class.getResourceAsStream("/db.properties")) {
+                    Properties properties = new Properties();
+                    properties.load(is);
+                    for (String k : properties.stringPropertyNames()) {
+                        Matcher m = Pattern.compile("(?i)jdbc:" + k).matcher(connectionString);
+                        if (m.find()) {
+                            this.driverClass = properties.getProperty(k);
+                        }
+                    }
+
+                } catch (IOException e) {
+                    //no db.properties on the classpath, or it couldn't be read;
+                    //fall back to whatever DriverManager can resolve on its own
+                }
+            }
+        }
+    }
+
+    /**
+     * Override this to apply any optimizations to the db
+     * before writing/reading.
+     *
+     * @return an open connection with auto-commit disabled
+     * @throws SQLException if a connection cannot be opened
+     */
+    public Connection getConnection() throws SQLException {
+        if (connection != null) {
+            return connection;
+        }
+        String connectionString = getConnectionString();
+        String jdbcDriver = getJDBCDriverClass();
+        if (jdbcDriver != null) {
+            try {
+                Class.forName(getJDBCDriverClass());
+            } catch (ClassNotFoundException e) {
+                throw new RuntimeException(e);
+            }
+        }
+        connection = DriverManager.getConnection(connectionString);
+        connection.setAutoCommit(false);
+
+        return connection;
+    }
+
+    /**
+     * JDBC driver class.  Override as necessary.
+     *
+     * @return the JDBC driver class name, or null if none was set or resolved
+     */
+    public String getJDBCDriverClass() {
+        return driverClass;
+    }
+
+
+    public boolean dropTableIfExists(Connection conn, String tableName) throws SQLException {
+        if (containsTable(tableName)) {
+            try (Statement st = conn.createStatement()) {
+                String sql = "drop table " + tableName;
+                return st.execute(sql);
+            }
+        }
+        return true;
+    }
+
+
+    public String getConnectionString() {
+        return connectionString;
+    }
+
+
+    public Set<String> getTables(Connection connection) throws SQLException {
+        Set<String> tables = new HashSet<>();
+
+        DatabaseMetaData dbMeta = connection.getMetaData();
+
+        try (ResultSet rs = dbMeta.getTables(null, null, "%", null)) {
+            while (rs.next()) {
+                tables.add(rs.getString(3).toLowerCase(Locale.US));
+            }
+        }
+        return tables;
+    }
+
+    /**
+     * @deprecated use {@link #batchInsert(PreparedStatement, TableInfo, Map)}
+     */
+    @Deprecated
+    public static int insert(PreparedStatement insertStatement,
+                             TableInfo table,
+                             Map<Cols, String> data) throws SQLException {
+
+        //clear parameters before setting
+        insertStatement.clearParameters();
+        try {
+            int i = 1;
+            for (ColInfo colInfo : table.getColInfos()) {
+                updateInsertStatement(i, insertStatement, colInfo, data.get(colInfo.getName()));
+                i++;
+            }
+            for (Cols c : data.keySet()) {
+                if (!table.containsColumn(c)) {
+                    throw new IllegalArgumentException("Can't add data to " + c +
+                            " because it doesn't exist in the table: " + table.getName());
+                }
+            }
+            return insertStatement.executeUpdate();
+        } catch (SQLException e) {
+            LOG.warn("couldn't insert data for this row: {}", e.getMessage());
+            return -1;
+        }
+    }
+
+    public static void batchInsert(PreparedStatement insertStatement,
+                                   TableInfo table,
+                                   Map<Cols, String> data) throws SQLException {
+
+        try {
+            int i = 1;
+            for (ColInfo colInfo : table.getColInfos()) {
+                updateInsertStatement(i, insertStatement, colInfo, data.get(colInfo.getName()));
+                i++;
+            }
+            for (Cols c : data.keySet()) {
+                if (!table.containsColumn(c)) {
+                    throw new IllegalArgumentException("Can't add data to " + c +
+                            " because it doesn't exist in the table: " + table.getName());
+                }
+            }
+            insertStatement.addBatch();
+        } catch (SQLException e) {
+            LOG.warn("couldn't insert data for this row: {}", e.getMessage());
+        }
+    }
+
+    public static void updateInsertStatement(int dbColOffset, PreparedStatement st,
+                                             ColInfo colInfo, String value) throws SQLException {
+        if (value == null) {
+            st.setNull(dbColOffset, colInfo.getType());
+            return;
+        }
+        try {
+            switch (colInfo.getType()) {
+                case Types.VARCHAR:
+                    if (value.length() > colInfo.getPrecision()) {
+                        value = value.substring(0, colInfo.getPrecision());
+                        LOG.warn("truncated varchar value in {} : {}", colInfo.getName(), value);
+                    }
+                    //postgres doesn't allow \0000
+                    value = value.replaceAll("\u0000", " ");
+                    st.setString(dbColOffset, value);
+                    break;
+                case Types.CHAR:
+                    //postgres doesn't allow \0000
+                    value = value.replaceAll("\u0000", " ");
+                    st.setString(dbColOffset, value);
+                    break;
+                case Types.DOUBLE:
+                    st.setDouble(dbColOffset, Double.parseDouble(value));
+                    break;
+                case Types.FLOAT:
+                    st.setDouble(dbColOffset, Float.parseFloat(value));
+                    break;
+                case Types.INTEGER:
+                    st.setInt(dbColOffset, Integer.parseInt(value));
+                    break;
+                case Types.BIGINT:
+                    st.setLong(dbColOffset, Long.parseLong(value));
+                    break;
+                case Types.BOOLEAN:
+                    st.setBoolean(dbColOffset, Boolean.parseBoolean(value));
+                    break;
+                default:
+                    throw new UnsupportedOperationException("Don't yet support type: " + colInfo.getType());
+            }
+        } catch (NumberFormatException e) {
+            if (!"".equals(value)) {
+                LOG.warn("number format exception: {} : {}", colInfo.getName(), value);
+            }
+            st.setNull(dbColOffset, colInfo.getType());
+        } catch (SQLException e) {
+            LOG.warn("sqlexception: {} : {}", colInfo, value);
+            st.setNull(dbColOffset, colInfo.getType());
+        }
+    }
+
+    public void createTables(List<TableInfo> tableInfos, CREATE_TABLE createTable) throws SQLException, IOException {
+
+        Connection conn = getConnection();
+        for (TableInfo tableInfo : tableInfos) {
+
+            if (createTable.equals(CREATE_TABLE.DROP_IF_EXISTS)) {
+                dropTableIfExists(conn, tableInfo.getName());
+            } else if (createTable.equals(CREATE_TABLE.SKIP_IF_EXISTS)) {
+                if (containsTable(tableInfo.getName())) {
+                    continue;
+                }
+            }
+            createTable(conn, tableInfo);
+        }
+        conn.commit();
+
+    }
+
+    public boolean containsTable(String tableName) throws SQLException {
+        Connection connection = getConnection();
+        Set<String> tables = getTables(connection);
+        if (tables.contains(normalizeTableName(tableName))) {
+            return true;
+        }
+
+        return false;
+    }
+
+    /**
+     * Override for custom behavior.
+     *
+     * @param tableName table name to normalize
+     * @return the normalized (lowercased) table name
+     */
+    String normalizeTableName(String tableName) {
+        tableName = tableName.toLowerCase(Locale.US);
+        return tableName;
+    }
+
+    //does not close the connection
+    private void createTable(Connection conn, TableInfo tableInfo) throws SQLException {
+        StringBuilder createSql = new StringBuilder();
+        createSql.append("CREATE TABLE " + tableInfo.getName());
+        createSql.append("(");
+
+        int last = 0;
+        for (ColInfo col : tableInfo.getColInfos()) {
+            last++;
+            if (last > 1) {
+                createSql.append(", ");
+            }
+            createSql.append(col.getName());
+            createSql.append(" ");
+            createSql.append(col.getSqlDef());
+            String constraints = col.getConstraints();
+            if (constraints != null) {
+                createSql.append(" ");
+                createSql.append(constraints);
+            }
+        }
+        createSql.append(")");
+        try (Statement st = conn.createStatement()) {
+            st.execute(createSql.toString());
+        }
+        conn.commit();
+    }
+}
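A short sketch of JDBCUtil's driver resolution and table helpers; the PostgreSQL connection string and the "profiles" table are hypothetical, and the empty driver-class argument relies on db.properties mapping the url prefix to a driver class as in the constructor above:

    import java.sql.Connection;

    import org.apache.tika.eval.app.db.JDBCUtil;

    public class JDBCUtilSketch {
        public static void main(String[] args) throws Exception {
            //hypothetical connection string; driver class looked up via db.properties
            JDBCUtil db = new JDBCUtil("jdbc:postgresql://localhost/tika_eval", "");
            try (Connection conn = db.getConnection()) {
                System.out.println("tables: " + db.getTables(conn));
                if (db.containsTable("profiles")) {        // hypothetical table name
                    db.dropTableIfExists(conn, "profiles");
                    conn.commit();
                }
            }
        }
    }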
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/db/MimeBuffer.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/db/MimeBuffer.java
new file mode 100644
index 0000000..235f282
--- /dev/null
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/db/MimeBuffer.java
@@ -0,0 +1,147 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval.app.db;
+
+import java.sql.Connection;
+import java.sql.PreparedStatement;
+import java.sql.SQLException;
+import java.sql.Types;
+
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.mime.MimeType;
+import org.apache.tika.mime.MimeTypeException;
+import org.apache.tika.mime.MimeTypes;
+
+
+public class MimeBuffer extends AbstractDBBuffer {
+
+    private final PreparedStatement st;
+    private final TikaConfig config;
+    private final Connection connection;
+
+
+    public MimeBuffer(Connection connection, TableInfo mimeTable, TikaConfig config) throws SQLException {
+        st = connection.prepareStatement("insert into " + mimeTable.getName() + "( " +
+                Cols.MIME_ID.name() + ", " +
+                Cols.MIME_STRING.name() + ", " +
+                Cols.FILE_EXTENSION.name() + ") values (?,?,?)");
+        this.config = config;
+        this.connection = connection;
+    }
+
+    @Override
+    public void write(int id, String value) throws RuntimeException {
+        try {
+            st.clearParameters();
+            st.setInt(1, id);
+            st.setString(2, value);
+            try {
+                String ext = MimeUtil.getExtension(value, config);
+                if (ext == null || ext.length() == 0) {
+                    st.setNull(3, Types.VARCHAR);
+                } else {
+                    st.setString(3, ext);
+                }
+            } catch (MimeTypeException e) {
+                st.setNull(3, Types.VARCHAR);
+            }
+            st.execute();
+
+        } catch (SQLException e) {
+            throw new RuntimeException(e);
+        }
+    }
+
+    @Override
+    public void close() throws SQLException {
+        st.close();
+        connection.commit();
+    }
+
+    private static class MimeUtil {
+        //TODO: see if MimeType now works for these
+        private static final String APPLICATION = "application";
+        private static final String TEXT = "text";
+        private static final String HTML = "html";
+        private static final String XML = "xml";
+        private static final String XHTML_XML = "xhtml+xml";
+        private static final String CSS = "css";
+        private static final String CSV = "csv";
+        private static final String PLAIN = "plain";
+        private static final String EMPTY_STRING = "";
+
+        /**
+         * Utility method to convert from a string value representing a content type
+         * (e.g. "application/pdf") into the most common extension for that file type
+         * (e.g. "pdf").
+         * <p>
+         * This has special handling for text-y file types whose MimeTypes
+         * don't currently return anything for {@link MimeType#getExtension()}.
+         *
+         * @param contentType string representing a content type, for example: "application/pdf"
+         * @param config      config from which to get MimeRepository
+         * @return extension or empty string
+         * @throws MimeTypeException thrown if MimeTypes can't parse the contentType
+         */
+        public static String getExtension(String contentType, TikaConfig config)
+                throws MimeTypeException {
+            MimeTypes types = config.getMimeRepository();
+            MimeType mime = types.forName(contentType);
+            return getExtension(mime);
+        }
+
+        public static String getExtension(MimeType mime) {
+
+            String ext = mime.getExtension();
+            if (ext.startsWith(".")) {
+                ext = ext.substring(1);
+            }
+
+            //special handling for text/html/xml
+            if (ext.length() == 0) {
+                ext = tryTextyTypes(mime.getType());
+            }
+            return ext;
+        }
+
+        private static String tryTextyTypes(MediaType mediaType) {
+
+            String type = mediaType.getType();
+            String subtype = mediaType.getSubtype();
+            if (type.equals(TEXT)) {
+                if (subtype.equals(HTML)) {
+                    return HTML;
+                } else if (subtype.equals(PLAIN)) {
+                    return "txt";
+                } else if (subtype.equals(CSS)) {
+                    return CSS;
+                } else if (subtype.equals(CSV)) {
+                    return CSV;
+                }
+            } else if (type.equals(APPLICATION)) {
+                if (subtype.equals(XML)) {
+                    return XML;
+                } else if (subtype.equals(XHTML_XML)) {
+                    return "html";
+                }
+            }
+            return EMPTY_STRING;
+        }
+    }
+
+}
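MimeUtil is private to MimeBuffer, but the lookup it wraps is plain Tika API; a minimal sketch of the underlying call:

    import org.apache.tika.config.TikaConfig;
    import org.apache.tika.mime.MimeType;

    public class ExtensionSketch {
        public static void main(String[] args) throws Exception {
            TikaConfig config = TikaConfig.getDefaultConfig();
            MimeType mime = config.getMimeRepository().forName("application/pdf");
            System.out.println(mime.getExtension());   // ".pdf"; MimeUtil strips the leading dot
        }
    }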
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/db/TableInfo.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/db/TableInfo.java
new file mode 100644
index 0000000..a99cea4
--- /dev/null
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/db/TableInfo.java
@@ -0,0 +1,73 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval.app.db;
+
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+public class TableInfo {
+
+    private final String name;
+    private final List<ColInfo> colInfos = new ArrayList<>();
+    private final Set<Cols> colNames = new HashSet<>();
+
+    private String prefix;
+
+    public TableInfo(String name, ColInfo... cols) {
+        Collections.addAll(colInfos, cols);
+        Collections.unmodifiableList(colInfos);
+        this.name = name;
+        for (ColInfo c : colInfos) {
+            assert (!colNames.contains(c.getName()));
+            colNames.add(c.getName());
+        }
+    }
+
+    public TableInfo(String name, List<ColInfo> cols) {
+        colInfos.addAll(cols);
+        Collections.unmodifiableList(colInfos);
+        this.name = name;
+        for (ColInfo c : colInfos) {
+            assert (!colNames.contains(c.getName()));
+            colNames.add(c.getName());
+        }
+    }
+
+    public String getName() {
+        if (prefix == null) {
+            return name;
+        }
+        return prefix+name;
+    }
+
+    public void setNamePrefix(String prefix) {
+        this.prefix = prefix;
+    }
+
+    public List<ColInfo> getColInfos() {
+        return colInfos;
+    }
+
+    public boolean containsColumn(Cols cols) {
+        return colNames.contains(cols);
+    }
+}
+
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/io/DBWriter.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/io/DBWriter.java
new file mode 100644
index 0000000..4a304c2
--- /dev/null
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/io/DBWriter.java
@@ -0,0 +1,162 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval.app.io;
+
+import java.io.IOException;
+import java.sql.Connection;
+import java.sql.PreparedStatement;
+import java.sql.SQLException;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import org.apache.commons.io.IOExceptionWithCause;
+import org.apache.tika.eval.app.db.ColInfo;
+import org.apache.tika.eval.app.db.Cols;
+import org.apache.tika.eval.app.db.JDBCUtil;
+import org.apache.tika.eval.app.db.MimeBuffer;
+import org.apache.tika.eval.app.db.TableInfo;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * This is still in its early stages.  The idea is to
+ * get something working with h2 and then add to that
+ * as necessary.
+ *
+ * Beware, this deletes the db file with each initialization.
+ *
+ * Each thread must construct its own DBWriter because each
+ * DBWriter creates its own PreparedStatements at initialization.
+ */
+public class DBWriter implements IDBWriter {
+
+    private static final Logger LOG = LoggerFactory.getLogger(DBWriter.class);
+
+    private static final AtomicInteger WRITER_ID = new AtomicInteger();
+    private final Long commitEveryXRows = 10000L;
+    //private final Long commitEveryXMS = 60000L;
+
+    private final Connection conn;
+    private final JDBCUtil dbUtil;
+    private final MimeBuffer mimeBuffer;
+    private final int myId = WRITER_ID.getAndIncrement();
+
+    //<tableName, preparedStatement>
+    private final Map<String, PreparedStatement> inserts = new HashMap<>();
+    private final Map<String, LastInsert> lastInsertMap = new HashMap<>();
+    public DBWriter(Connection connection, List<TableInfo> tableInfos, JDBCUtil dbUtil, MimeBuffer mimeBuffer)
+            throws IOException, SQLException {
+
+        this.conn = connection;
+        this.mimeBuffer = mimeBuffer;
+        this.dbUtil = dbUtil;
+        for (TableInfo tableInfo : tableInfos) {
+            try {
+                PreparedStatement st = createPreparedInsert(tableInfo);
+                inserts.put(tableInfo.getName(), st);
+                lastInsertMap.put(tableInfo.getName(), new LastInsert());
+            } catch (SQLException e) {
+                throw new RuntimeException(e);
+            }
+        }
+    }
+
+    public int getMimeId(String mimeString) {
+        return mimeBuffer.getId(mimeString);
+    }
+
+    private PreparedStatement createPreparedInsert(TableInfo tableInfo) throws SQLException {
+        StringBuilder sb = new StringBuilder();
+        sb.append("INSERT INTO ").append(tableInfo.getName());
+        sb.append("(");
+        int i = 0;
+        for (ColInfo c : tableInfo.getColInfos()) {
+            if (i++ > 0) {
+                sb.append(", ");
+            }
+            sb.append(c.getName());
+        }
+        sb.append(") ");
+
+        sb.append("VALUES");
+        sb.append("(");
+        for (int j = 0; j < i; j++) {
+            if (j > 0) {
+                sb.append(", ");
+            }
+            sb.append("?");
+        }
+        sb.append(")");
+
+        return conn.prepareStatement(sb.toString());
+    }
+
+
+    @Override
+    public void writeRow(TableInfo table, Map<Cols, String> data) throws IOException {
+        try {
+            PreparedStatement p = inserts.get(table.getName());
+            if (p == null) {
+                throw new RuntimeException("Failed to create prepared statement for: "+
+                        table.getName());
+            }
+            JDBCUtil.batchInsert(p, table, data);
+            LastInsert lastInsert = lastInsertMap.get(table.getName());
+            lastInsert.rowCount++;
+            long elapsed = System.currentTimeMillis()-lastInsert.lastInsert;
+            if (
+                    //elapsed > commitEveryXMS ||
+                lastInsert.rowCount % commitEveryXRows == 0) {
+                LOG.info("writer ({}) on table ({}) is committing after {} rows and {} ms", myId,
+                        table.getName(),
+                        lastInsert.rowCount, elapsed);
+                p.executeBatch();
+                conn.commit();
+                lastInsert.lastInsert = System.currentTimeMillis();
+            }
+        } catch (SQLException e) {
+            throw new IOException(e);
+        }
+    }
+
+    /**
+     * This closes the writer by executing the batch and
+     * committing changes.  This DOES NOT close the connection.
+     *
+     * @throws IOException if executing the batch or committing fails
+     */
+    public void close() throws IOException {
+        for (PreparedStatement p : inserts.values()) {
+            try {
+                p.executeBatch();
+            } catch (SQLException e) {
+                throw new IOExceptionWithCause(e);
+            }
+        }
+        try {
+            conn.commit();
+        } catch (SQLException e){
+            throw new IOExceptionWithCause(e);
+        }
+    }
+
+    private class LastInsert {
+        private long lastInsert = System.currentTimeMillis();
+        private long rowCount = 0;
+    }
+}
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/io/ExtractReader.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/io/ExtractReader.java
new file mode 100644
index 0000000..eb6be14
--- /dev/null
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/io/ExtractReader.java
@@ -0,0 +1,243 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval.app.io;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.Reader;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Locale;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream;
+import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
+import org.apache.commons.compress.compressors.z.ZCompressorInputStream;
+import org.apache.commons.io.IOUtils;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.metadata.serialization.JsonMetadataList;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler;
+import org.apache.tika.sax.RecursiveParserWrapperHandler;
+import org.apache.tika.sax.ToTextContentHandler;
+import org.apache.tika.sax.ToXMLContentHandler;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+
+public class ExtractReader {
+    private static final Logger LOG = LoggerFactory.getLogger(ExtractReader.class);
+
+    public static final long IGNORE_LENGTH = -1L;
+
+    public enum ALTER_METADATA_LIST {
+        AS_IS,  //leave the metadata list as is
+        FIRST_ONLY, //take only the metadata list for the "container" document
+        CONCATENATE_CONTENT_INTO_FIRST // concatenate all of the content into the first
+    }
+
+    private TikaConfig tikaConfig = TikaConfig.getDefaultConfig();
+
+    private final ALTER_METADATA_LIST alterMetadataList;
+    private final long minExtractLength;
+    private final long maxExtractLength;
+
+    /**
+     * Reads full extract, no modification of metadata list, no min or max extract length checking
+     */
+    public ExtractReader() {
+        this(ALTER_METADATA_LIST.AS_IS, IGNORE_LENGTH, IGNORE_LENGTH);
+    }
+
+    public ExtractReader(ALTER_METADATA_LIST alterMetadataList) {
+        this(alterMetadataList, IGNORE_LENGTH, IGNORE_LENGTH);
+    }
+
+    public ExtractReader(ALTER_METADATA_LIST alterMetadataList, long minExtractLength, long maxExtractLength) {
+        this.alterMetadataList = alterMetadataList;
+        this.minExtractLength = minExtractLength;
+        this.maxExtractLength = maxExtractLength;
+        if (maxExtractLength > IGNORE_LENGTH && minExtractLength >= maxExtractLength) {
+            throw new IllegalArgumentException("minExtractLength("+minExtractLength+
+                    ") must be < maxExtractLength("+maxExtractLength+")");
+        }
+    }
+    public List<Metadata> loadExtract(Path extractFile) throws ExtractReaderException {
+
+        List<Metadata> metadataList = null;
+        if (extractFile == null || !Files.isRegularFile(extractFile)) {
+            throw new ExtractReaderException(ExtractReaderException.TYPE.NO_EXTRACT_FILE);
+        }
+
+        FileSuffixes fileSuffixes = parseSuffixes(extractFile.getFileName().toString());
+        if (fileSuffixes.format == null) {
+            throw new ExtractReaderException(ExtractReaderException.TYPE.INCORRECT_EXTRACT_FILE_SUFFIX);
+        }
+        if (! Files.isRegularFile(extractFile)) {
+            throw new ExtractReaderException(ExtractReaderException.TYPE.NO_EXTRACT_FILE);
+        }
+
+        long length = -1L;
+        try {
+            length = Files.size(extractFile);
+        } catch (IOException e) {
+            throw new ExtractReaderException(ExtractReaderException.TYPE.IO_EXCEPTION);
+        }
+
+        if (length == 0L) {
+            throw new ExtractReaderException(ExtractReaderException.TYPE.ZERO_BYTE_EXTRACT_FILE);
+        }
+
+        if (minExtractLength > IGNORE_LENGTH && length < minExtractLength) {
+            throw new ExtractReaderException(ExtractReaderException.TYPE.EXTRACT_FILE_TOO_SHORT);
+        }
+        if (maxExtractLength > IGNORE_LENGTH && length > maxExtractLength) {
+            throw new ExtractReaderException(ExtractReaderException.TYPE.EXTRACT_FILE_TOO_LONG);
+        }
+
+        Reader reader = null;
+        InputStream is = null;
+        try {
+            is = Files.newInputStream(extractFile);
+            if (fileSuffixes.compression != null) {
+                if (fileSuffixes.compression.equals("bz2")) {
+                    is = new BZip2CompressorInputStream(is);
+                } else if (fileSuffixes.compression.equals("gz")
+                        || fileSuffixes.compression.equals("gzip")) {
+                    is = new GzipCompressorInputStream(is);
+                } else if (fileSuffixes.compression.equals("zip")) {
+                    is = new ZCompressorInputStream(is);
+                } else {
+                    LOG.warn("Can't yet process compression of type: {}", fileSuffixes.compression);
+                    return metadataList;
+                }
+            }
+            reader = new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8));
+        } catch (IOException e) {
+            throw new ExtractReaderException(ExtractReaderException.TYPE.IO_EXCEPTION);
+        }
+
+        try {
+            if (fileSuffixes.format == FileSuffixes.FORMAT.JSON) {
+                metadataList = JsonMetadataList.fromJson(reader);
+                if (alterMetadataList.equals(ALTER_METADATA_LIST.FIRST_ONLY) && metadataList.size() > 1) {
+                    while (metadataList.size() > 1) {
+                        metadataList.remove(metadataList.size()-1);
+                    }
+                } else if (alterMetadataList.equals(ALTER_METADATA_LIST.CONCATENATE_CONTENT_INTO_FIRST) &&
+                        metadataList.size() > 1) {
+                    StringBuilder sb = new StringBuilder();
+                    Metadata containerMetadata = metadataList.get(0);
+                    for (Metadata m : metadataList) {
+                        String c = m.get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT);
+                        if (c != null) {
+                            sb.append(c);
+                            sb.append(" ");
+                        }
+                    }
+                    containerMetadata.set(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT, sb.toString());
+                    while (metadataList.size() > 1) {
+                        metadataList.remove(metadataList.size()-1);
+                    }
+                }
+            } else {
+                metadataList = generateListFromTextFile(reader, fileSuffixes);
+            }
+        } catch (IOException e) {
+            throw new ExtractReaderException(ExtractReaderException.TYPE.IO_EXCEPTION);
+        } catch (TikaException e) {
+            throw new ExtractReaderException(ExtractReaderException.TYPE.EXTRACT_PARSE_EXCEPTION);
+        } finally {
+            IOUtils.closeQuietly(reader);
+            IOUtils.closeQuietly(is);
+        }
+        return metadataList;
+    }
+
+    private List<Metadata> generateListFromTextFile(Reader reader,
+                                                           FileSuffixes fileSuffixes) throws IOException {
+        List<Metadata> metadataList = new ArrayList<>();
+        String content = IOUtils.toString(reader);
+        Metadata m = new Metadata();
+        m.set(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT, content);
+        if (fileSuffixes.format == FileSuffixes.FORMAT.HTML) {
+            m.set(RecursiveParserWrapperHandler.TIKA_CONTENT_HANDLER, ToXMLContentHandler.class.getSimpleName());
+        } else if (fileSuffixes.format == FileSuffixes.FORMAT.TXT) {
+            m.set(RecursiveParserWrapperHandler.TIKA_CONTENT_HANDLER, ToTextContentHandler.class.getSimpleName());
+        }
+        //Let's hope the file name has a suffix that can
+        //be used to determine the mime.  Could be wrong or missing,
+        //but better than nothing.
+        m.set(TikaCoreProperties.RESOURCE_NAME_KEY, fileSuffixes.originalFileName);
+
+        MediaType mimeType = tikaConfig.getMimeRepository().detect(null, m);
+        if (mimeType != null) {
+            m.set(Metadata.CONTENT_TYPE, mimeType.toString());
+        }
+        metadataList.add(m);
+        return metadataList;
+
+    }
+
+    protected static FileSuffixes parseSuffixes(String fName) {
+        FileSuffixes fileSuffixes = new FileSuffixes();
+        if (fName == null) {
+            return fileSuffixes;
+        }
+        Matcher m = Pattern.compile("(?i)^(.*?)\\.(json|txt|x?html)(?:\\.(bz2|gz(?:ip)?|zip))?$").matcher(fName);
+        if (m.find()) {
+            fileSuffixes.originalFileName = m.group(1);
+            fileSuffixes.setFormat(m.group(2));
+            fileSuffixes.compression = m.group(3);
+        }
+        return fileSuffixes;
+    }
+
+    private static class FileSuffixes {
+
+        enum FORMAT {
+            TXT,
+            HTML,
+            JSON
+        }
+        String compression;
+        FORMAT format;
+        String originalFileName;
+
+        public void setFormat(String fmt) {
+            String lc = fmt.toLowerCase(Locale.ENGLISH);
+            if (lc.equals("json")) {
+                format = FORMAT.JSON;
+            } else if (lc.equals("txt")) {
+                format = FORMAT.TXT;
+            } else if (lc.contains("html")) {
+                format = FORMAT.HTML;
+            } else {
+                throw new IllegalArgumentException("extract must end in .json, .txt or .xhtml");
+            }
+        }
+    }
+}
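A minimal sketch of loading a json extract and collapsing attachment content into the container document; the extract path is hypothetical, and ExtractReaderException (next file) carries the reason when the read fails:

    import java.nio.file.Paths;
    import java.util.List;

    import org.apache.tika.eval.app.io.ExtractReader;
    import org.apache.tika.eval.app.io.ExtractReaderException;
    import org.apache.tika.metadata.Metadata;

    public class ExtractReaderSketch {
        public static void main(String[] args) {
            ExtractReader reader = new ExtractReader(
                    ExtractReader.ALTER_METADATA_LIST.CONCATENATE_CONTENT_INTO_FIRST);
            try {
                //hypothetical extract file written by the recursive parser wrapper
                List<Metadata> metadataList = reader.loadExtract(Paths.get("extracts/test.pdf.json"));
                System.out.println("metadata objects: " + metadataList.size());
            } catch (ExtractReaderException e) {
                System.err.println("couldn't read extract: " + e.getType());
            }
        }
    }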
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/io/ExtractReaderException.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/io/ExtractReaderException.java
new file mode 100644
index 0000000..61e1f35
--- /dev/null
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/io/ExtractReaderException.java
@@ -0,0 +1,48 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval.app.io;
+
+import java.io.IOException;
+
+/**
+ * Exception when trying to read extract
+ */
+public class ExtractReaderException extends IOException {
+
+    public enum TYPE {
+        //what do you see when you look at the extract file
+        NO_EXTRACT_FILE,
+        ZERO_BYTE_EXTRACT_FILE,
+        IO_EXCEPTION,
+        EXTRACT_PARSE_EXCEPTION,
+        EXTRACT_FILE_TOO_SHORT,
+        EXTRACT_FILE_TOO_LONG,
+        INCORRECT_EXTRACT_FILE_SUFFIX;//extract file must have a suffix of .json, .txt, .html or .xhtml,
+        // optionally followed by .gz, .gzip, .zip or .bz2
+    }
+
+    private final TYPE type;
+
+    public ExtractReaderException(TYPE exceptionType) {
+        this.type = exceptionType;
+    }
+
+    public TYPE getType() {
+        return type;
+    }
+
+}
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/io/IDBWriter.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/io/IDBWriter.java
new file mode 100644
index 0000000..bb1d4ae
--- /dev/null
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/io/IDBWriter.java
@@ -0,0 +1,31 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.eval.app.io;
+
+
+import org.apache.tika.eval.app.db.Cols;
+import org.apache.tika.eval.app.db.TableInfo;
+
+import java.io.IOException;
+import java.util.Map;
+
+public interface IDBWriter {
+    public void writeRow(TableInfo table, Map<Cols, String> data) throws IOException;
+    public void close() throws IOException;
+    public int getMimeId(String mimeString);
+}
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/io/XMLLogMsgHandler.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/io/XMLLogMsgHandler.java
new file mode 100644
index 0000000..547784b
--- /dev/null
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/io/XMLLogMsgHandler.java
@@ -0,0 +1,26 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval.app.io;
+
+import java.io.IOException;
+import java.sql.SQLException;
+
+import org.apache.log4j.Level;
+
+public interface XMLLogMsgHandler {
+    public void handleMsg(Level level, String xml) throws IOException, SQLException;
+}
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/io/XMLLogReader.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/io/XMLLogReader.java
new file mode 100644
index 0000000..408ed97
--- /dev/null
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/io/XMLLogReader.java
@@ -0,0 +1,115 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval.app.io;
+
+import javax.xml.stream.XMLInputFactory;
+import javax.xml.stream.XMLStreamConstants;
+import javax.xml.stream.XMLStreamException;
+import javax.xml.stream.XMLStreamReader;
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.StandardCharsets;
+import java.sql.SQLException;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.log4j.Level;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.utils.XMLReaderUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+
+/**
+ * Wraps a logger's xml output into a single parseable
+ * xml input stream and passes each logged message to a handler.
+ */
+public class XMLLogReader {
+    private static final Logger LOG = LoggerFactory.getLogger(XMLLogReader.class);
+
+    public void read(InputStream xmlLogFileIs, XMLLogMsgHandler handler) throws XMLStreamException {
+        InputStream is = new LogXMLWrappingInputStream(xmlLogFileIs);
+        XMLInputFactory factory = XMLReaderUtils.getXMLInputFactory();
+        XMLStreamReader reader = factory.createXMLStreamReader(is);
+
+        Level level = null;
+        while (reader.hasNext()) {
+            reader.next();
+            switch (reader.getEventType()) {
+                case XMLStreamConstants.START_ELEMENT :
+                    if ("event".equals(reader.getLocalName())) {
+                        level = Level.toLevel(reader.getAttributeValue("", "level"), Level.DEBUG);
+                    } else if ("message".equals(reader.getLocalName())) {
+                        try {
+                            handler.handleMsg(level, reader.getElementText());
+                        } catch (IOException e) {
+                            LOG.warn("Error parsing: {}", reader.getElementText());
+                        } catch (SQLException e) {
+                            LOG.warn("SQLException: {}", e.getMessage());
+                        }
+                    }
+                    break;
+                case XMLStreamConstants.END_ELEMENT :
+                    if ("event".equals(reader.getLocalName())) {
+                        level = null;
+                    } else if ("message".equals(reader.getLocalName())) {
+                        //do we care any more?
+                    }
+                    break;
+            }
+        }
+    }
+
+
+
+    class LogXMLWrappingInputStream extends InputStream {
+        //plagiarized from log4j's chainsaw
+        private final static String HEADER =
+                "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>"
+                        + "<log4j:eventSet version=\"1.2\" "
+                        + "xmlns:log4j=\"http://jakarta.apache.org/log4j/\">";
+        private static final String FOOTER = "</log4j:eventSet>";
+
+        private InputStream[] streams;
+        int currentStreamIndex = 0;
+
+        private LogXMLWrappingInputStream(InputStream xmlLogFileIs){
+            streams = new InputStream[3];
+            streams[0] = new ByteArrayInputStream(HEADER.getBytes(StandardCharsets.UTF_8));
+            streams[1] = xmlLogFileIs;
+            streams[2] = new ByteArrayInputStream(FOOTER.getBytes(StandardCharsets.UTF_8));
+
+        }
+
+        @Override
+        public int read() throws IOException {
+            int c = streams[currentStreamIndex].read();
+            if (c < 0) {
+                IOUtils.closeQuietly(streams[currentStreamIndex]);
+                while (currentStreamIndex < streams.length-1) {
+                    currentStreamIndex++;
+                    int tmpC = streams[currentStreamIndex].read();
+                    if (tmpC < 0) {
+                        IOUtils.closeQuietly(streams[currentStreamIndex]);
+                    } else {
+                        return tmpC;
+                    }
+                }
+                return -1;
+            }
+            return c;
+        }
+    }
+}
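A minimal sketch of feeding a log4j 1.2 xml log file through XMLLogReader; the log path is hypothetical, and because XMLLogMsgHandler has a single method it can be supplied as a lambda:

    import java.io.InputStream;
    import java.nio.file.Files;
    import java.nio.file.Paths;

    import org.apache.log4j.Level;
    import org.apache.tika.eval.app.io.XMLLogMsgHandler;
    import org.apache.tika.eval.app.io.XMLLogReader;

    public class XMLLogReaderSketch {
        public static void main(String[] args) throws Exception {
            XMLLogMsgHandler handler = (Level level, String xml) ->
                    System.out.println(level + " -> " + xml);
            //hypothetical path to a batch run's xml log
            try (InputStream is = Files.newInputStream(Paths.get("logs/batch-process.xml"))) {
                new XMLLogReader().read(is, handler);
            }
        }
    }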
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/reports/Report.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/reports/Report.java
new file mode 100644
index 0000000..58d28a8
--- /dev/null
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/reports/Report.java
@@ -0,0 +1,198 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval.app.reports;
+
+import java.io.IOException;
+import java.io.OutputStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.sql.Connection;
+import java.sql.ResultSet;
+import java.sql.ResultSetMetaData;
+import java.sql.SQLException;
+import java.sql.Statement;
+import java.sql.Types;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.poi.ss.usermodel.Cell;
+import org.apache.poi.ss.usermodel.CellStyle;
+import org.apache.poi.ss.usermodel.Row;
+import org.apache.poi.ss.usermodel.VerticalAlignment;
+import org.apache.poi.xssf.streaming.SXSSFSheet;
+import org.apache.poi.xssf.streaming.SXSSFWorkbook;
+import org.apache.poi.xssf.usermodel.XSSFWorkbook;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * This class represents a single report.
+ */
+public class Report {
+    private static final Logger LOG = LoggerFactory.getLogger(Report.class);
+
+    final String NULL_VALUE = "";//TODO: make this configurable!!!
+    Map<String, XSLXCellFormatter> cellFormatters = new HashMap<>();
+    private XLSXNumFormatter defaultDoubleFormatter = new XLSXNumFormatter("0.000");
+    private XLSXNumFormatter defaultIntegerFormatter = new XLSXNumFormatter("0");
+    private CellStyle sqlCellStyle;
+
+    String sql;
+    String reportFilename;
+    boolean includeSql = true;
+
+    String reportName;
+
+    public void writeReport(Connection c, Path reportsRoot) throws SQLException, IOException {
+        LOG.info("Writing report: {} to {}", reportName, reportFilename);
+        dumpXLSX(c, reportsRoot);
+    }
+
+    private void dumpXLSX(Connection c, Path reportsRoot) throws IOException, SQLException {
+        Statement st = c.createStatement();
+        Path out = reportsRoot.resolve(reportFilename);
+        Files.createDirectories(out.getParent());
+
+        SXSSFWorkbook wb = new SXSSFWorkbook(new XSSFWorkbook(), 100, true, true);
+        wb.setCompressTempFiles(true);
+        defaultIntegerFormatter.reset(wb.getXSSFWorkbook());
+        defaultDoubleFormatter.reset(wb.getXSSFWorkbook());
+        sqlCellStyle = wb.createCellStyle();
+        sqlCellStyle.setVerticalAlignment(VerticalAlignment.TOP);
+        sqlCellStyle.setWrapText(true);
+
+
+        try {
+            dumpReportToWorkbook(st, wb);
+        } finally {
+            try (OutputStream os = Files.newOutputStream(out)) {
+                wb.write(os);
+            } finally {
+                wb.dispose();
+            }
+        }
+    }
+
+    private void dumpReportToWorkbook(Statement st, SXSSFWorkbook wb) throws IOException, SQLException {
+        ResultSet rs = st.executeQuery(sql);
+
+        SXSSFSheet sheet = wb.createSheet("tika-eval Report");
+        sheet.trackColumnForAutoSizing(0);
+
+        int rowCount = 0;
+        ResultSetMetaData meta = rs.getMetaData();
+        Set<String> colNames = new HashSet<>();
+
+        Row xssfRow = sheet.createRow(rowCount++);
+        //write headers and cache them to check against styles
+        for (int i = 1; i <= meta.getColumnCount(); i++) {
+            Cell cell = xssfRow.createCell(i-1);
+            cell.setCellValue(meta.getColumnLabel(i));
+            colNames.add(meta.getColumnLabel(i));
+        }
+
+        ResultSetMetaData resultSetMetaData = rs.getMetaData();
+        while (rs.next()) {
+            xssfRow = sheet.createRow(rowCount++);
+            for (int i = 1; i <= meta.getColumnCount(); i++) {
+                Cell cell = xssfRow.createCell(i-1);
+                XSLXCellFormatter formatter = cellFormatters.get(meta.getColumnLabel(i));
+                if (formatter == null) {
+                    formatter = getDefaultFormatter(resultSetMetaData.getColumnType(i));
+                }
+                if (formatter != null) {
+                    formatter.applyStyleAndValue(i, rs, cell);
+                } else {
+                    writeCell(meta, i, rs, cell);
+                }
+            }
+        }
+        sheet.autoSizeColumn(0);
+
+        if (!includeSql) {
+            return;
+        }
+
+        SXSSFSheet sqlSheet = wb.createSheet("tika-eval SQL");
+        sqlSheet.setColumnWidth(0, 100*250);
+        Row sqlRow = sqlSheet.createRow(0);
+        short height = 5000;
+        sqlRow.setHeight(height);
+        Cell cell = sqlRow.createCell(0);
+        cell.setCellStyle(sqlCellStyle);
+
+        cell.setCellValue(sql.trim());//.replaceAll("[\r\n]+", "\r\n"));
+    }
+
+    private XSLXCellFormatter getDefaultFormatter(int columnType) {
+        switch (columnType) {
+            case Types.INTEGER :
+                return defaultIntegerFormatter;
+            case Types.DOUBLE:
+            case Types.FLOAT:
+            case Types.DECIMAL:
+                return defaultDoubleFormatter;
+            default:
+                return null;
+        }
+    }
+
+    private void writeCell(ResultSetMetaData meta, int colIndex, ResultSet rs,
+                           Cell cell) throws SQLException {
+
+        switch(meta.getColumnType(colIndex)) {
+            //fall through on numerics
+            case Types.BIGINT:
+            case Types.SMALLINT:
+            case Types.INTEGER:
+            case Types.DOUBLE:
+            case Types.FLOAT:
+            case Types.DECIMAL:
+            case Types.REAL:
+            case Types.NUMERIC:
+                double dbl = rs.getDouble(colIndex);
+                if (rs.wasNull()) {
+                    cell.setCellValue(NULL_VALUE);
+                } else {
+                    cell.setCellValue(dbl);
+                }
+                break;
+            //fall through strings
+            case Types.BOOLEAN:
+            case Types.CHAR:
+            case Types.VARCHAR:
+            case Types.LONGNVARCHAR:
+                String val = rs.getString(colIndex);
+                if (rs.wasNull()) {
+                    cell.setCellValue(NULL_VALUE);
+                } else {
+                    cell.setCellValue(val);
+                }
+                break;
+            default:
+                if (rs.wasNull()) {
+                    cell.setCellValue(NULL_VALUE);
+                } else {
+                    cell.setCellValue(rs.getString(colIndex));
+                }
+                LOG.warn("Couldn't find type for: {}. Defaulting to String", meta.getColumnType(colIndex));
+        }
+    }
+
+}
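Report's fields are package-private and are normally populated by ResultsReporter (next file) from a reports xml config; a minimal same-package sketch, with a hypothetical H2 database and table name:

    package org.apache.tika.eval.app.reports;

    import java.nio.file.Paths;
    import java.sql.Connection;

    import org.apache.tika.eval.app.db.H2Util;

    public class ReportSketch {
        public static void main(String[] args) throws Exception {
            Report report = new Report();
            report.reportName = "mime counts";                      // used only for logging
            report.reportFilename = "reports/mime_counts.xlsx";     // resolved against reportsRoot
            report.sql = "select mime_string, count(1) cnt from mimes group by mime_string"; // hypothetical table
            try (Connection conn = new H2Util(Paths.get("my-tika-eval-db")).getConnection()) {
                report.writeReport(conn, Paths.get("."));
            }
        }
    }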
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/reports/ResultsReporter.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/reports/ResultsReporter.java
new file mode 100644
index 0000000..4f765d1
--- /dev/null
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/reports/ResultsReporter.java
@@ -0,0 +1,313 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval.app.reports;
+
+
+import javax.xml.parsers.DocumentBuilder;
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.nio.file.StandardCopyOption;
+import java.sql.Connection;
+import java.sql.DatabaseMetaData;
+import java.sql.ResultSet;
+import java.sql.SQLException;
+import java.sql.Statement;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.DefaultParser;
+import org.apache.commons.cli.HelpFormatter;
+import org.apache.commons.cli.Options;
+import org.apache.commons.cli.ParseException;
+import org.apache.poi.common.usermodel.HyperlinkType;
+import org.apache.tika.eval.app.ExtractComparer;
+import org.apache.tika.eval.app.ExtractProfiler;
+import org.apache.tika.eval.app.db.H2Util;
+import org.apache.tika.eval.app.db.JDBCUtil;
+import org.apache.tika.utils.XMLReaderUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.w3c.dom.Document;
+import org.w3c.dom.NamedNodeMap;
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
+
+public class ResultsReporter {
+    private static final Logger LOG = LoggerFactory.getLogger(ResultsReporter.class);
+
+    private static Options OPTIONS;
+
+    static {
+        OPTIONS = new Options();
+        OPTIONS.addOption("rd", "reportsDir", true, "directory for the reports. " +
+                "If not specified, will write to 'reports'" +
+                "BEWARE: Will overwrite existing reports without warning!")
+                .addOption("rf", "reportsFile", true, "xml specifying sql to call for the reports." +
+                        "If not specified, will use default reports in resources/tika-eval-*-config.xml")
+                .addOption("db", true, "default database (in memory H2). Specify a file name for the H2 database.")
+                .addOption("jdbc", true, "EXPERT: full jdbc connection string. Specify this or use -db <h2db_name>")
+                .addOption("jdbcdriver", true, "EXPERT: specify the jdbc driver class if all else fails")
+                .addOption("tablePrefix", true, "EXPERT: if not using the default tables, specify your table name prefix");
+
+    }
+
+    public static void USAGE() {
+        HelpFormatter helpFormatter = new HelpFormatter();
+        helpFormatter.printHelp(
+                80,
+                "java -jar tika-eval-x.y.jar Report -db mydb [-rd myreports] [-rf myreports.xml]",
+                "Tool: Report",
+                ResultsReporter.OPTIONS,
+                "Note: for h2 db, do not include the .mv.db at the end of the db name.");
+
+    }
+
+
+    List<String> before = new ArrayList<>();
+    List<String> after = new ArrayList<>();
+    List<Report> reports = new ArrayList<>();
+
+
+    private void addBefore(String b) {
+        before.add(b);
+    }
+
+    private void addAfter(String a) {
+        after.add(a);
+    }
+
+    private void addReport(Report r) {
+        reports.add(r);
+    }
+
+    public static ResultsReporter build(Path p) throws Exception {
+
+        ResultsReporter r = new ResultsReporter();
+
+        DocumentBuilder docBuilder = XMLReaderUtils.getDocumentBuilder();
+        Document doc;
+        try (InputStream is = Files.newInputStream(p)) {
+            doc = docBuilder.parse(is);
+        }
+        Node docElement = doc.getDocumentElement();
+        assert (docElement.getNodeName().equals("reports"));
+        NodeList children = docElement.getChildNodes();
+        for (int i = 0; i < children.getLength(); i++) {
+            Node n = children.item(i);
+            if ("before".equals(n.getNodeName())) {
+                for (String before : getSql(n)) {
+                    r.addBefore(before);
+                }
+            } else if ("after".equals(n.getNodeName())) {
+                for (String after : getSql(n)) {
+                    r.addAfter(after);
+                }
+            } else if ("report".equals(n.getNodeName())) {
+                Report report = buildReport(n);
+                r.addReport(report);
+            }
+        }
+
+        return r;
+    }
+
+    private static Report buildReport(Node n) {
+        NodeList children = n.getChildNodes();
+        Report r = new Report();
+        NamedNodeMap attrs = n.getAttributes();
+
+        r.includeSql = Boolean.parseBoolean(attrs.getNamedItem("includeSql").getNodeValue());
+        r.reportFilename = attrs.getNamedItem("reportFilename").getNodeValue();
+        r.reportName = attrs.getNamedItem("reportName").getNodeValue();
+
+        for (int i = 0; i < children.getLength(); i++) {
+            Node child = children.item(i);
+            if (child.getNodeType() != Node.ELEMENT_NODE) {
+                continue;
+            }
+            if ("sql".equals(child.getNodeName())) {
+                if (r.sql != null) {
+                    throw new IllegalArgumentException("Can only have one sql statement per report");
+                }
+                r.sql = child.getTextContent();
+            } else if ("colformats".equals(child.getNodeName())) {
+                r.cellFormatters = getCellFormatters(child);
+            } else {
+                throw new IllegalArgumentException("Not expecting to see:" + child.getNodeName());
+            }
+        }
+        return r;
+    }
+
+    private static Map<String, XSLXCellFormatter> getCellFormatters(Node n) {
+        NodeList children = n.getChildNodes();
+        Map<String, XSLXCellFormatter> ret = new HashMap<>();
+        for (int i = 0; i < children.getLength(); i++) {
+            Node child = children.item(i);
+            if (child.getNodeType() != Node.ELEMENT_NODE) {
+                continue;
+            }
+            NamedNodeMap attrs = child.getAttributes();
+            String columnName = attrs.getNamedItem("name").getNodeValue();
+            assert (!ret.containsKey(columnName));
+            String type = attrs.getNamedItem("type").getNodeValue();
+            if ("numberFormatter".equals(type)) {
+                String format = attrs.getNamedItem("format").getNodeValue();
+                XSLXCellFormatter f = new XLSXNumFormatter(format);
+                ret.put(columnName, f);
+            } else if ("urlLink".equals(type)) {
+                String base = "";
+                Node baseNode = attrs.getNamedItem("base");
+                if (baseNode != null) {
+                    base = baseNode.getNodeValue();
+                }
+                XLSXHREFFormatter f = new XLSXHREFFormatter(base, HyperlinkType.URL);
+                ret.put(columnName, f);
+            } else if ("fileLink".equals(type)) {
+                String base = "";
+                Node baseNode = attrs.getNamedItem("base");
+                if (baseNode != null) {
+                    base = baseNode.getNodeValue();
+                }
+                XLSXHREFFormatter f = new XLSXHREFFormatter(base, HyperlinkType.FILE);
+                ret.put(columnName, f);
+            }
+        }
+        return ret;
+    }
+
+    private static List<String> getSql(Node n) {
+        List<String> ret = new ArrayList<>();
+
+        NodeList children = n.getChildNodes();
+
+        for (int i = 0; i < children.getLength(); i++) {
+            Node child = children.item(i);
+            if (child.getNodeType() != Node.ELEMENT_NODE) {
+                continue;
+            }
+            ret.add(child.getTextContent());
+        }
+        return ret;
+    }
+
+    public static void main(String[] args) throws Exception {
+
+        DefaultParser defaultCLIParser = new DefaultParser();
+        CommandLine commandLine = null;
+        try {
+            commandLine = defaultCLIParser.parse(OPTIONS, args);
+        } catch (ParseException e) {
+            System.out.println(e.getMessage());
+            USAGE();
+            return;
+        }
+        JDBCUtil dbUtil = null;
+        if (commandLine.hasOption("db")) {
+            String dbString = commandLine.getOptionValue("db");
+            if (dbString.endsWith(".mv.db")) {
+                dbString = dbString.substring(0, dbString.length()-6);
+                LOG.debug("trimming .mv.db from db name");
+            }
+            Path db = Paths.get(dbString);
+            if (!H2Util.databaseExists(db)) {
+                throw new RuntimeException("I'm sorry, but I couldn't find this h2 database: " + db);
+            }
+            dbUtil = new H2Util(db);
+        } else if (commandLine.hasOption("jdbc")) {
+            String driverClass = null;
+            if (commandLine.hasOption("jdbcdriver")) {
+                driverClass = commandLine.getOptionValue("jdbcdriver");
+            }
+            dbUtil = new JDBCUtil(commandLine.getOptionValue("jdbc"), driverClass);
+        } else {
+            System.err.println("Must specify either -db for the default in-memory h2 database\n" +
+                    "or -jdbc for a full jdbc connection string");
+            USAGE();
+            return;
+        }
+        try (Connection c = dbUtil.getConnection()) {
+            Path tmpReportsFile = null;
+            try {
+                ResultsReporter resultsReporter = null;
+                String reportsFile = commandLine.getOptionValue("rf");
+                if (reportsFile == null) {
+                    tmpReportsFile = getDefaultReportsConfig(c);
+                    resultsReporter = ResultsReporter.build(tmpReportsFile);
+                } else {
+                    resultsReporter = ResultsReporter.build(Paths.get(reportsFile));
+                }
+
+                Path reportsRootDirectory = Paths.get(commandLine.getOptionValue("rd", "reports"));
+                if (Files.isDirectory(reportsRootDirectory)) {
+                    LOG.warn("'Reports' directory exists.  Will overwrite existing reports.");
+                }
+
+                resultsReporter.execute(c, reportsRootDirectory);
+            } finally {
+                if (tmpReportsFile != null) {
+                    Files.delete(tmpReportsFile);
+                }
+            }
+        }
+    }
+
+    private static Path getDefaultReportsConfig(Connection c) throws IOException, SQLException {
+        DatabaseMetaData md = c.getMetaData();
+        String internalPath = null;
+        try (ResultSet rs = md.getTables(null, null, "%", null)) {
+            while (rs.next()) {
+                String tName = rs.getString(3);
+                if (ExtractComparer.CONTENTS_TABLE_B.getName().equalsIgnoreCase(tName)) {
+                    internalPath = "/comparison-reports.xml";
+                    break;
+                } else if (ExtractProfiler.PROFILE_TABLE.getName().equalsIgnoreCase(tName)) {
+                    internalPath = "/profile-reports.xml";
+                    break;
+                }
+            }
+        }
+
+        if (internalPath == null) {
+            throw new RuntimeException("Couldn't determine if this database was a 'profiler' or 'comparison' db");
+        }
+        Path tmp = Files.createTempFile("tmp-tika-reports", ".xml");
+        Files.copy(ResultsReporter.class.getResourceAsStream(internalPath), tmp, StandardCopyOption.REPLACE_EXISTING);
+        return tmp;
+    }
+
+    public void execute(Connection c, Path reportsDirectory) throws IOException, SQLException {
+        try (Statement st = c.createStatement()) {
+            for (String sql : before) {
+                LOG.info("processing before: {}", sql);
+                st.execute(sql);
+            }
+            for (Report r : reports) {
+                r.writeReport(c, reportsDirectory);
+            }
+            for (String sql : after) {
+                LOG.info("processing after: {}", sql);
+                st.execute(sql);
+            }
+        }
+    }
+}
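
For reference, a minimal sketch of driving the reporter programmatically rather
than through main(); the config, database, and output paths are placeholders and
the wrapper class name is hypothetical:

    import java.nio.file.Path;
    import java.nio.file.Paths;
    import java.sql.Connection;

    import org.apache.tika.eval.app.db.H2Util;
    import org.apache.tika.eval.app.reports.ResultsReporter;

    public class RunReportsSketch {
        public static void main(String[] args) throws Exception {
            Path configXml = Paths.get("myreports.xml");   //placeholder reports config
            ResultsReporter reporter = ResultsReporter.build(configXml);
            //H2Util points at a file-based H2 database, named without the .mv.db suffix
            try (Connection connection = new H2Util(Paths.get("mydb")).getConnection()) {
                reporter.execute(connection, Paths.get("reports"));
            }
        }
    }
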
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/reports/XLSXHREFFormatter.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/reports/XLSXHREFFormatter.java
new file mode 100644
index 0000000..dd4f52a
--- /dev/null
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/reports/XLSXHREFFormatter.java
@@ -0,0 +1,78 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval.app.reports;
+
+import java.nio.file.Paths;
+import java.sql.ResultSet;
+import java.sql.SQLException;
+
+import org.apache.poi.common.usermodel.HyperlinkType;
+import org.apache.poi.ss.usermodel.Cell;
+import org.apache.poi.ss.usermodel.CellStyle;
+import org.apache.poi.ss.usermodel.Font;
+import org.apache.poi.ss.usermodel.Hyperlink;
+import org.apache.poi.ss.usermodel.IndexedColors;
+import org.apache.poi.xssf.usermodel.XSSFWorkbook;
+
+
+public class XLSXHREFFormatter implements XSLXCellFormatter {
+    //xlsx files can only have this many hyperlinks
+    //if they have more Excel can't read the file
+    private static final int MAX_HYPERLINKS = 65000;
+
+
+    private final String urlBase;
+    private final HyperlinkType linkType;
+    private XSSFWorkbook workbook;
+    private CellStyle style;
+    private int links = 0;
+
+    public XLSXHREFFormatter(String urlBase,
+                             HyperlinkType hyperlinkType) {
+        this.urlBase = urlBase;
+        this.linkType = hyperlinkType;
+    }
+
+    @Override
+    public void reset(XSSFWorkbook workbook) {
+        this.workbook = workbook;
+        style = workbook.createCellStyle();
+        Font hlinkFont = workbook.createFont();
+        hlinkFont.setUnderline(Font.U_SINGLE);
+        hlinkFont.setColor(IndexedColors.BLUE.getIndex());
+        style.setFont(hlinkFont);
+        links = 0;
+
+    }
+
+    @Override
+    public void applyStyleAndValue(int dbColNum, ResultSet resultSet, Cell cell) throws SQLException {
+        if (links < MAX_HYPERLINKS) {
+            Hyperlink hyperlink = workbook.getCreationHelper().createHyperlink(linkType);
+            String path = resultSet.getString(dbColNum);
+            String address = urlBase+path;
+            hyperlink.setAddress(address);
+            cell.setHyperlink(hyperlink);
+            cell.setCellStyle(style);
+            String fName = Paths.get(path).getFileName().toString();
+            cell.setCellValue(fName);
+            links++;
+        } else {
+            //silently stop adding hyperlinks
+        }
+    }
+}
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/reports/XLSXNumFormatter.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/reports/XLSXNumFormatter.java
new file mode 100644
index 0000000..09c9712
--- /dev/null
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/reports/XLSXNumFormatter.java
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.eval.app.reports;
+
+import java.sql.ResultSet;
+import java.sql.SQLException;
+
+import org.apache.poi.ss.usermodel.Cell;
+import org.apache.poi.ss.usermodel.CellStyle;
+import org.apache.poi.xssf.usermodel.XSSFWorkbook;
+
+class XLSXNumFormatter implements XSLXCellFormatter {
+
+    private final String formatString;
+    private CellStyle style;
+
+    XLSXNumFormatter(String formatString) {
+        this.formatString = formatString;
+    }
+
+
+    @Override
+    public void reset(XSSFWorkbook workbook) {
+        style = workbook.createCellStyle();
+        style.setDataFormat(workbook.getCreationHelper()
+                .createDataFormat().getFormat(formatString));
+    }
+
+    @Override
+    public void applyStyleAndValue(int dbColNum, ResultSet resultSet, Cell cell) throws SQLException {
+        double d = resultSet.getDouble(dbColNum);
+        if (!resultSet.wasNull()) {
+            //apply the number format only when the value is non-null
+            cell.setCellStyle(style);
+        }
+        cell.setCellValue(d);
+    }
+}
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/reports/XSLXCellFormatter.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/reports/XSLXCellFormatter.java
new file mode 100644
index 0000000..e5d1d6c
--- /dev/null
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/reports/XSLXCellFormatter.java
@@ -0,0 +1,30 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval.app.reports;
+
+
+import java.sql.ResultSet;
+import java.sql.SQLException;
+
+import org.apache.poi.ss.usermodel.Cell;
+import org.apache.poi.xssf.usermodel.XSSFWorkbook;
+
+interface XSLXCellFormatter {
+
+    void reset(XSSFWorkbook workbook);
+    void applyStyleAndValue(int dbColNum, ResultSet resultSet, Cell cell) throws SQLException;
+}
\ No newline at end of file
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/tools/BatchTopCommonTokenCounter.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/tools/BatchTopCommonTokenCounter.java
new file mode 100644
index 0000000..db792cb
--- /dev/null
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/tools/BatchTopCommonTokenCounter.java
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval.app.tools;
+
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.tika.utils.ProcessUtils;
+
+/**
+ * Utility class that runs TopCommonTokenCounter against a directory
+ * of table files (named {lang}_table.gz or Leipzig-like afr_...-sentences.txt)
+ * and outputs common tokens files for each input table file in the output directory.
+ */
+public class BatchTopCommonTokenCounter {
+
+    public static void main(String[] args) throws Exception {
+
+        Path commonTokensDir = Paths.get(args[0]);
+        Path tableFileDir = Paths.get(args[1]);
+        Map<String, List<Path>> langFiles = LeipzigHelper.getFiles(tableFileDir);
+
+        for (Map.Entry<String, List<Path>> e : langFiles.entrySet()) {
+
+            String[] cmd = new String[e.getValue().size()+1];
+            Path commonTokensFile = commonTokensDir.resolve(e.getKey());
+            cmd[0] = ProcessUtils.escapeCommandLine(commonTokensFile.toAbsolutePath().toString());
+            for (int i = 0; i < e.getValue().size(); i++) {
+                cmd[i+1] =
+                        ProcessUtils.escapeCommandLine(
+                                e.getValue().get(i).toAbsolutePath().toString());
+            }
+            TopCommonTokenCounter.main(
+                cmd
+            );
+        }
+    }
+}
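
A minimal invocation sketch for the batch counter (directory paths are placeholders
and the wrapper class name is hypothetical; the first argument is the output
directory for the common-tokens files, the second is the directory of
{lang}_table.gz / Leipzig sentence files):

    import org.apache.tika.eval.app.tools.BatchTopCommonTokenCounter;

    public class BatchCounterSketch {
        public static void main(String[] args) throws Exception {
            BatchTopCommonTokenCounter.main(new String[]{
                    "/data/common_tokens",      //placeholder output directory
                    "/data/leipzig_tables"});   //placeholder input directory
        }
    }
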
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/tools/CommonTokenOverlapCounter.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/tools/CommonTokenOverlapCounter.java
new file mode 100644
index 0000000..0221297
--- /dev/null
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/tools/CommonTokenOverlapCounter.java
@@ -0,0 +1,69 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval.app.tools;
+
+import java.io.File;
+import java.io.IOException;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Locale;
+import java.util.Set;
+
+import org.apache.tika.eval.core.tokens.CommonTokenCountManager;
+
+public class CommonTokenOverlapCounter {
+
+    public static void main(String[] args) throws Exception {
+        Path commonTokensDir = Paths.get(args[0]);
+        CommonTokenOverlapCounter counter = new CommonTokenOverlapCounter();
+        counter.execute(commonTokensDir);
+    }
+
+    private void execute(Path commonTokensDir) throws IOException {
+        List<String> langs = new ArrayList<>();
+        for (File f : commonTokensDir.toFile().listFiles()) {
+            langs.add(f.getName());
+        }
+        CommonTokenCountManager mgr = new CommonTokenCountManager(commonTokensDir, "");
+        for (int i = 0; i < langs.size()-1; i++) {
+            for (int j = i+1; j < langs.size(); j++) {
+                compare(langs.get(i), langs.get(j), mgr);
+            }
+        }
+    }
+
+    private void compare(String langA, String langB, CommonTokenCountManager mgr) {
+        int overlap = 0;
+        int denom = 0;
+        Set<String> setA = mgr.getTokens(langA);
+        Set<String> setB = mgr.getTokens(langB);
+        for (String a : setA) {
+            if (setB.contains(a)) {
+                overlap += 2;
+            }
+        }
+        denom = setA.size()+setB.size();
+        double percent = (double)overlap/(double)denom;
+        if (percent > 0.01) {
+            System.out.println(String.format(Locale.US, "%s %s %.2f", langA, langB, percent));
+        }
+    }
+
+
+}
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/tools/LeipzigHelper.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/tools/LeipzigHelper.java
new file mode 100644
index 0000000..bec0a05
--- /dev/null
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/tools/LeipzigHelper.java
@@ -0,0 +1,57 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval.app.tools;
+
+import java.io.File;
+import java.io.IOException;
+import java.nio.file.Path;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+import java.util.TreeMap;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+public class LeipzigHelper {
+
+    static Map<String, List<Path>> getFiles(Path leipzigDir) throws IOException {
+        Matcher tableMatcher = Pattern.compile("([a-z]+)_table(\\.txt)?(\\.gz)?$").matcher("");
+        Matcher leipzigMatcher = Pattern.compile("([a-z]{3,3})[-_].*$").matcher("");
+
+        Map<String, List<Path>> m = new TreeMap<>();
+        for (File f : leipzigDir.toFile().listFiles()) {
+            System.err.println(f);
+            String lang = null;
+            if (tableMatcher.reset(f.getName()).find()) {
+                lang = tableMatcher.group(1);
+            } else if (leipzigMatcher.reset(f.getName()).find()) {
+                lang = leipzigMatcher.group(1);
+            }
+            if (lang == null) {
+                System.err.println("couldn't find a lang: " + f);
+                continue;
+            }
+            List<Path> files = m.get(lang);
+            if (files == null) {
+                files = new ArrayList<>();
+            }
+            files.add(f.toPath());
+            m.put(lang, files);
+        }
+        return m;
+    }
+}
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/tools/LeipzigSampler.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/tools/LeipzigSampler.java
new file mode 100644
index 0000000..b49afe7
--- /dev/null
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/tools/LeipzigSampler.java
@@ -0,0 +1,76 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval.app.tools;
+
+import java.io.BufferedReader;
+import java.io.BufferedWriter;
+import java.io.IOException;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+
+public class LeipzigSampler {
+
+    public static void main(String[] args) throws Exception {
+        int sentsPerLanguage = 10;
+        Path leipzigDir = Paths.get(args[0]);
+        Path sampleFile = Paths.get(args[1]);
+        LeipzigSampler leipzigSampler = new LeipzigSampler();
+        try (BufferedWriter writer = Files.newBufferedWriter(sampleFile, StandardCharsets.UTF_8)) {
+            leipzigSampler.execute(leipzigDir, sentsPerLanguage, writer);
+        }
+    }
+
+    private void execute(Path leipzigDir, int sentsPerLang, BufferedWriter writer) throws IOException {
+        Map<String, List<Path>> fileMap = LeipzigHelper.getFiles(leipzigDir);
+        for (Map.Entry<String, List<Path>> e : fileMap.entrySet()) {
+            List<String> sentences = new ArrayList<>();
+            for (Path p : e.getValue()) {
+                addSentences(p, sentences);
+            }
+            Collections.shuffle(sentences);
+            String lang = e.getKey();
+            for (int i = 0; i < sentsPerLang; i++) {
+                writer.write(row(lang, sentences.get(i)));
+            }
+        }
+    }
+
+    private void addSentences(Path p, List<String> sentences) throws IOException {
+        try (BufferedReader reader = Files.newBufferedReader(p, StandardCharsets.UTF_8)) {
+            String line = reader.readLine();
+            while (line != null) {
+                int tab = line.indexOf("\t");
+                if (tab > -1) {
+                    line = line.substring(tab + 1);
+                }
+                sentences.add(line);
+                line = reader.readLine();
+            }
+        }
+    }
+
+    private String row(String lang, String s) {
+        s = s.replaceAll("\\s+", " ");
+        return lang+"\t"+s+"\n";
+    }
+}
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/tools/SlowCompositeReaderWrapper.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/tools/SlowCompositeReaderWrapper.java
new file mode 100644
index 0000000..181f329
--- /dev/null
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/tools/SlowCompositeReaderWrapper.java
@@ -0,0 +1,327 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval.app.tools;
+
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.concurrent.ConcurrentHashMap;
+
+
+import org.apache.lucene.index.BinaryDocValues;
+import org.apache.lucene.index.CompositeReader;
+import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.index.DocValues;
+import org.apache.lucene.index.DocValuesType;
+import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.FieldInfos;
+import org.apache.lucene.index.Fields;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.LeafMetaData;
+import org.apache.lucene.index.LeafReader;
+import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.index.MultiBits;
+import org.apache.lucene.index.MultiDocValues;
+import org.apache.lucene.index.MultiDocValues.MultiSortedDocValues;
+import org.apache.lucene.index.MultiReader;
+import org.apache.lucene.index.MultiTerms;
+import org.apache.lucene.index.NumericDocValues;
+import org.apache.lucene.index.OrdinalMap;
+import org.apache.lucene.index.PointValues;
+import org.apache.lucene.index.SortedDocValues;
+import org.apache.lucene.index.SortedNumericDocValues;
+import org.apache.lucene.index.SortedSetDocValues;
+import org.apache.lucene.index.StoredFieldVisitor;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.util.Bits;
+import org.apache.lucene.util.Version;
+
+/**
+ * <b>COPIED VERBATIM FROM LUCENE</b>
+ * This class forces a composite reader (eg a {@link
+ * MultiReader} or {@link DirectoryReader}) to emulate a
+ * {@link LeafReader}.  This requires implementing the postings
+ * APIs on-the-fly, using the static methods in {@link
+ * MultiTerms}, {@link MultiDocValues}, by stepping through
+ * the sub-readers to merge fields/terms, appending docs, etc.
+ *
+ * <p><b>NOTE</b>: this class almost always results in a
+ * performance hit.  If this is important to your use case,
+ * you'll get better performance by gathering the sub readers using
+ * {@link IndexReader#getContext()} to get the
+ * leaves and then operate per-LeafReader,
+ * instead of using this class.
+ */
+
+public final class SlowCompositeReaderWrapper extends LeafReader {
+
+    private final CompositeReader in;
+    private final LeafMetaData metaData;
+
+    // Cached copy of FieldInfos to prevent it from being re-created on each
+    // getFieldInfos call.  Most (if not all) other LeafReader implementations
+    // also have a cached FieldInfos instance so this is consistent. SOLR-12878
+    private final FieldInfos fieldInfos;
+
+    final Map<String, Terms> cachedTerms = new ConcurrentHashMap<>();
+
+    // TODO: consider ConcurrentHashMap ?
+    // TODO: this could really be a weak map somewhere else on the coreCacheKey,
+    // but do we really need to optimize slow-wrapper any more?
+    final Map<String, OrdinalMap> cachedOrdMaps = new HashMap<>();
+
+    /** This method is sugar for getting an {@link LeafReader} from
+     * an {@link IndexReader} of any kind. If the reader is already atomic,
+     * it is returned unchanged, otherwise wrapped by this class.
+     */
+    public static LeafReader wrap(IndexReader reader) throws IOException {
+        if (reader instanceof CompositeReader) {
+            return new SlowCompositeReaderWrapper((CompositeReader) reader);
+        } else {
+            assert reader instanceof LeafReader;
+            return (LeafReader) reader;
+        }
+    }
+
+    SlowCompositeReaderWrapper(CompositeReader reader) throws IOException {
+        in = reader;
+        in.registerParentReader(this);
+        if (reader.leaves().isEmpty()) {
+            metaData = new LeafMetaData(Version.LATEST.major, Version.LATEST, null);
+        } else {
+            Version minVersion = Version.LATEST;
+            for (LeafReaderContext leafReaderContext : reader.leaves()) {
+                Version leafVersion = leafReaderContext.reader().getMetaData().getMinVersion();
+                if (leafVersion == null) {
+                    minVersion = null;
+                    break;
+                } else if (minVersion.onOrAfter(leafVersion)) {
+                    minVersion = leafVersion;
+                }
+            }
+            metaData = new LeafMetaData(reader.leaves().get(0).reader().getMetaData().getCreatedVersionMajor(), minVersion, null);
+        }
+        fieldInfos = FieldInfos.getMergedFieldInfos(in);
+    }
+
+    @Override
+    public String toString() {
+        return "SlowCompositeReaderWrapper(" + in + ")";
+    }
+
+    @Override
+    public CacheHelper getReaderCacheHelper() {
+        return in.getReaderCacheHelper();
+    }
+
+    @Override
+    public CacheHelper getCoreCacheHelper() {
+        // TODO: this is trappy as the expectation is that core keys live for a long
+        // time, but here we need to bound it to the lifetime of the wrapped
+        // composite reader? Unfortunately some features seem to rely on this...
+        return in.getReaderCacheHelper();
+    }
+
+    @Override
+    public Terms terms(String field) throws IOException {
+        ensureOpen();
+        try {
+            return cachedTerms.computeIfAbsent(field, f -> {
+                try {
+                    return MultiTerms.getTerms(in, f);
+                } catch (IOException e) { // yuck!  ...sigh... checked exceptions with built-in lambdas are a pain
+                    throw new RuntimeException("unwrapMe", e);
+                }
+            });
+        } catch (RuntimeException e) {
+            if (e.getMessage().equals("unwrapMe") && e.getCause() instanceof IOException) {
+                throw (IOException) e.getCause();
+            }
+            throw e;
+        }
+    }
+
+    @Override
+    public NumericDocValues getNumericDocValues(String field) throws IOException {
+        ensureOpen();
+        return MultiDocValues.getNumericValues(in, field); // TODO cache?
+    }
+
+    @Override
+    public BinaryDocValues getBinaryDocValues(String field) throws IOException {
+        ensureOpen();
+        return MultiDocValues.getBinaryValues(in, field); // TODO cache?
+    }
+
+    @Override
+    public SortedNumericDocValues getSortedNumericDocValues(String field) throws IOException {
+        ensureOpen();
+        return MultiDocValues.getSortedNumericValues(in, field); // TODO cache?
+    }
+
+    @Override
+    public SortedDocValues getSortedDocValues(String field) throws IOException {
+        ensureOpen();
+        OrdinalMap map = null;
+        synchronized (cachedOrdMaps) {
+            map = cachedOrdMaps.get(field);
+            if (map == null) {
+                // uncached, or not a multi dv
+                SortedDocValues dv = MultiDocValues.getSortedValues(in, field);
+                if (dv instanceof MultiSortedDocValues) {
+                    map = ((MultiSortedDocValues)dv).mapping;
+                    CacheHelper cacheHelper = getReaderCacheHelper();
+                    if (cacheHelper != null && map.owner == cacheHelper.getKey()) {
+                        cachedOrdMaps.put(field, map);
+                    }
+                }
+                return dv;
+            }
+        }
+        int size = in.leaves().size();
+        final SortedDocValues[] values = new SortedDocValues[size];
+        final int[] starts = new int[size+1];
+        long totalCost = 0;
+        for (int i = 0; i < size; i++) {
+            LeafReaderContext context = in.leaves().get(i);
+            final LeafReader reader = context.reader();
+            final FieldInfo fieldInfo = reader.getFieldInfos().fieldInfo(field);
+            if (fieldInfo != null && fieldInfo.getDocValuesType() != DocValuesType.SORTED) {
+                return null;
+            }
+            SortedDocValues v = reader.getSortedDocValues(field);
+            if (v == null) {
+                v = DocValues.emptySorted();
+            }
+            totalCost += v.cost();
+            values[i] = v;
+            starts[i] = context.docBase;
+        }
+        starts[size] = maxDoc();
+        return new MultiSortedDocValues(values, starts, map, totalCost);
+    }
+
+    @Override
+    public SortedSetDocValues getSortedSetDocValues(String field) throws IOException {
+        ensureOpen();
+        OrdinalMap map = null;
+        synchronized (cachedOrdMaps) {
+            map = cachedOrdMaps.get(field);
+            if (map == null) {
+                // uncached, or not a multi dv
+                SortedSetDocValues dv = MultiDocValues.getSortedSetValues(in, field);
+                if (dv instanceof MultiDocValues.MultiSortedSetDocValues) {
+                    map = ((MultiDocValues.MultiSortedSetDocValues)dv).mapping;
+                    CacheHelper cacheHelper = getReaderCacheHelper();
+                    if (cacheHelper != null && map.owner == cacheHelper.getKey()) {
+                        cachedOrdMaps.put(field, map);
+                    }
+                }
+                return dv;
+            }
+        }
+
+        assert map != null;
+        int size = in.leaves().size();
+        final SortedSetDocValues[] values = new SortedSetDocValues[size];
+        final int[] starts = new int[size+1];
+        long cost = 0;
+        for (int i = 0; i < size; i++) {
+            LeafReaderContext context = in.leaves().get(i);
+            final LeafReader reader = context.reader();
+            final FieldInfo fieldInfo = reader.getFieldInfos().fieldInfo(field);
+            if(fieldInfo != null && fieldInfo.getDocValuesType() != DocValuesType.SORTED_SET){
+                return null;
+            }
+            SortedSetDocValues v = reader.getSortedSetDocValues(field);
+            if (v == null) {
+                v = DocValues.emptySortedSet();
+            }
+            values[i] = v;
+            starts[i] = context.docBase;
+            cost += v.cost();
+        }
+        starts[size] = maxDoc();
+        return new MultiDocValues.MultiSortedSetDocValues(values, starts, map, cost);
+    }
+
+    @Override
+    public NumericDocValues getNormValues(String field) throws IOException {
+        ensureOpen();
+        return MultiDocValues.getNormValues(in, field); // TODO cache?
+    }
+
+    @Override
+    public Fields getTermVectors(int docID) throws IOException {
+        ensureOpen();
+        return in.getTermVectors(docID);
+    }
+
+    @Override
+    public int numDocs() {
+        // Don't call ensureOpen() here (it could affect performance)
+        return in.numDocs();
+    }
+
+    @Override
+    public int maxDoc() {
+        // Don't call ensureOpen() here (it could affect performance)
+        return in.maxDoc();
+    }
+
+    @Override
+    public void document(int docID, StoredFieldVisitor visitor) throws IOException {
+        ensureOpen();
+        in.document(docID, visitor);
+    }
+
+    @Override
+    public Bits getLiveDocs() {
+        ensureOpen();
+        return MultiBits.getLiveDocs(in); // TODO cache?
+    }
+
+    @Override
+    public PointValues getPointValues(String field) {
+        ensureOpen();
+        return null; // because not supported.  Throw UOE?
+    }
+
+    @Override
+    public FieldInfos getFieldInfos() {
+        return fieldInfos;
+    }
+
+    @Override
+    protected void doClose() throws IOException {
+        // TODO: as this is a wrapper, should we really close the delegate?
+        in.close();
+    }
+
+    @Override
+    public void checkIntegrity() throws IOException {
+        ensureOpen();
+        for (LeafReaderContext ctx : in.leaves()) {
+            ctx.reader().checkIntegrity();
+        }
+    }
+
+    @Override
+    public LeafMetaData getMetaData() {
+        return metaData;
+    }
+}
\ No newline at end of file
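
As a sketch of the wrap() idiom described in the javadoc above (the index path and
field name are placeholders and the wrapper class name is hypothetical):

    import java.nio.file.Paths;

    import org.apache.lucene.index.DirectoryReader;
    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.index.LeafReader;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.store.FSDirectory;

    import org.apache.tika.eval.app.tools.SlowCompositeReaderWrapper;

    public class WrapSketch {
        public static void main(String[] args) throws Exception {
            try (Directory dir = FSDirectory.open(Paths.get("lucene-index"));
                 IndexReader reader = DirectoryReader.open(dir)) {
                //emulate a single LeafReader over the whole composite reader
                LeafReader leaf = SlowCompositeReaderWrapper.wrap(reader);
                System.out.println("docs with field 'f': " + leaf.getDocCount("f"));
            }
        }
    }
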
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/tools/TopCommonTokenCounter.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/tools/TopCommonTokenCounter.java
new file mode 100644
index 0000000..e632346
--- /dev/null
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/tools/TopCommonTokenCounter.java
@@ -0,0 +1,401 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval.app.tools;
+
+import java.io.BufferedReader;
+import java.io.BufferedWriter;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Objects;
+import java.util.Set;
+
+import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
+import org.apache.commons.io.FileUtils;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.TextField;
+import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.index.LeafReader;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.FSDirectory;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.PriorityQueue;
+import org.apache.tika.eval.core.tokens.AnalyzerManager;
+import org.apache.tika.eval.core.tokens.URLEmailNormalizingFilterFactory;
+import org.apache.tika.utils.ProcessUtils;
+
+/**
+ * Utility class that reads in a UTF-8 input file with one document per row
+ * and outputs the tokens with the highest document frequencies (up to TOP_N, currently 30,000).
+ *
+ * The CommonTokensAnalyzer intentionally drops tokens shorter than 4 characters,
+ * but includes bigrams for cjk.
+ *
+ * It also has an include list for __email__ and __url__ and a skip list
+ * for common html markup terms.
+ */
+public class TopCommonTokenCounter {
+
+    private static String LICENSE =
+            "# Licensed to the Apache Software Foundation (ASF) under one or more\n" +
+            "# contributor license agreements.  See the NOTICE file distributed with\n" +
+            "# this work for additional information regarding copyright ownership.\n" +
+            "# The ASF licenses this file to You under the Apache License, Version 2.0\n" +
+            "# (the \"License\"); you may not use this file except in compliance with\n" +
+            "# the License.  You may obtain a copy of the License at\n" +
+            "#\n" +
+            "#     http://www.apache.org/licenses/LICENSE-2.0\n" +
+            "#\n" +
+            "# Unless required by applicable law or agreed to in writing, software\n" +
+            "# distributed under the License is distributed on an \"AS IS\" BASIS,\n" +
+            "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n" +
+            "# See the License for the specific language governing permissions and\n" +
+            "# limitations under the License.\n"+
+            "#\n";
+
+    private static final String FIELD = "f";
+    private static int TOP_N = 30000;
+    private static int MIN_DOC_FREQ = 10;
+    //these should exist in every list
+    static Set<String> INCLUDE_LIST = new HashSet<>(Arrays.asList(
+            new String[] {
+                    URLEmailNormalizingFilterFactory.URL,
+                    URLEmailNormalizingFilterFactory.EMAIL
+            }
+    ));
+
+    //words to ignore
+    //these are common 4 letter html markup words that we do
+    //not want to count in case of failed markup processing.
+    //see: https://issues.apache.org/jira/browse/TIKA-2267?focusedCommentId=15872055&page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel#comment-15872055
+    static Set<String> SKIP_LIST = new HashSet<>(Arrays.asList(
+            "span",
+            "table",
+            "href",
+            "head",
+            "title",
+            "body",
+            "html",
+            "tagname",
+            "lang",
+            "style",
+            "script",
+            "strong",
+            "blockquote",
+            "form",
+            "iframe",
+            "section",
+            "colspan",
+            "rowspan"
+    ));
+
+    public static void main(String[] args) throws Exception {
+        Path commonTokensFile = Paths.get(args[0]);
+        List<Path> inputFiles = new ArrayList<>();
+        for (int i = 1; i < args.length; i++) {
+            inputFiles.add(Paths.get(
+                    ProcessUtils.unescapeCommandLine(args[i])));
+        }
+        TopCommonTokenCounter counter = new TopCommonTokenCounter();
+        if (Files.exists(commonTokensFile)) {
+            System.err.println(commonTokensFile.getFileName().toString()+
+                    " exists. I'm skipping this.");
+            return;
+        }
+        counter.execute(commonTokensFile, inputFiles);
+    }
+
+    private void execute(Path commonTokensFile, List<Path> inputFiles) throws Exception {
+        Path luceneDir = Files.createTempDirectory("tika-eval-lucene-");
+        AbstractTokenTFDFPriorityQueue queue = new TokenDFPriorityQueue(TOP_N);
+        long totalDocs = -1;
+        long sumDocFreqs = -1;
+        long sumTotalTermFreqs = -1;
+        long uniqueTerms = 0;
+        try (Directory directory = FSDirectory.open(luceneDir)) {
+
+            AnalyzerManager analyzerManager = AnalyzerManager.newInstance(-1);
+
+            Analyzer analyzer = analyzerManager.getCommonTokensAnalyzer();
+            IndexWriterConfig indexWriterConfig = new IndexWriterConfig(analyzer);
+            int maxLen = 1000000;
+            int len = 0;
+            try (IndexWriter writer = new IndexWriter(directory, indexWriterConfig)) {
+                List<Document> docs = new ArrayList<>();
+                for (Path inputFile : inputFiles) {
+                    //total hack
+                    boolean isLeipzig = false;
+                    if (inputFile.getFileName().toString().contains("-sentences.txt")) {
+                        isLeipzig = true;
+                    }
+                    int lines = 0;
+                    try (BufferedReader reader = getReader(inputFile)) {
+                        String line = reader.readLine();
+                        while (line != null) {
+                            if (isLeipzig) {
+                                int tab = line.indexOf("\t");
+                                if (tab > -1) {
+                                    line = line.substring(tab+1);
+                                }
+                            }
+                            len += line.length();
+                            Document document = new Document();
+                            document.add(new TextField(FIELD, line, Field.Store.NO));
+                            docs.add(document);
+                            if (len > maxLen) {
+                                writer.addDocuments(docs);
+                                docs.clear();
+                                len = 0;
+                            }
+                            line = reader.readLine();
+                            if (++lines % 100000 == 0) {
+                                System.out.println("processed "+lines +
+                                        " for "+inputFile.getFileName()
+                                + " :: "+ commonTokensFile.toAbsolutePath());
+                            }
+                        }
+                    }
+                }
+                if (docs.size() > 0) {
+                    writer.addDocuments(docs);
+                }
+                writer.commit();
+                writer.flush();
+            }
+
+            try (IndexReader reader = DirectoryReader.open(directory)) {
+                LeafReader wrappedReader = SlowCompositeReaderWrapper.wrap(reader);
+                totalDocs = wrappedReader.getDocCount(FIELD);
+                sumDocFreqs = wrappedReader.getSumDocFreq(FIELD);
+                sumTotalTermFreqs = wrappedReader.getSumTotalTermFreq(FIELD);
+
+                Terms terms = wrappedReader.terms(FIELD);
+                TermsEnum termsEnum = terms.iterator();
+                BytesRef bytesRef = termsEnum.next();
+                int docsWThisField = wrappedReader.getDocCount(FIELD);
+                while (bytesRef != null) {
+                    uniqueTerms++;
+                    int df = termsEnum.docFreq();
+                    long tf = termsEnum.totalTermFreq();
+                    if (MIN_DOC_FREQ > -1 && df < MIN_DOC_FREQ) {
+                        bytesRef = termsEnum.next();
+                        continue;
+                    }
+
+                    if (queue.top() == null || queue.size() < TOP_N ||
+                            df >= queue.top().df) {
+                        String t = bytesRef.utf8ToString();
+                        if (! SKIP_LIST.contains(t)) {
+                            queue.insertWithOverflow(new TokenDFTF(t, df, tf));
+                        }
+
+                    }
+                    bytesRef = termsEnum.next();
+                }
+            }
+        } finally {
+            FileUtils.deleteDirectory(luceneDir.toFile());
+        }
+
+        writeTopN(commonTokensFile, totalDocs,
+                sumDocFreqs, sumTotalTermFreqs, uniqueTerms, queue);
+
+
+    }
+
+    private BufferedReader getReader(Path inputFile) throws IOException {
+        InputStream is = Files.newInputStream(inputFile);
+        if (inputFile.toString().endsWith(".gz")) {
+            is = new GzipCompressorInputStream(is);
+        }
+        return new BufferedReader(
+                new InputStreamReader(is, StandardCharsets.UTF_8)
+        );
+    }
+
+    private static void writeTopN(Path path,
+                                  long totalDocs, long sumDocFreqs,
+                                  long sumTotalTermFreqs,
+                                  long uniqueTerms, AbstractTokenTFDFPriorityQueue queue) throws IOException {
+        if (Files.isRegularFile(path)) {
+            System.err.println("File "+path.getFileName() + " already exists. Skipping.");
+            return;
+        }
+        Files.createDirectories(path.getParent());
+        StringBuilder sb = new StringBuilder();
+        try (BufferedWriter writer =
+                Files.newBufferedWriter(path, StandardCharsets.UTF_8)) {
+            writer.write(LICENSE);
+            writer.write("#DOC_COUNT\t"+totalDocs+"\n");
+            writer.write("#SUM_DOC_FREQS\t"+sumDocFreqs+"\n");
+            writer.write("#SUM_TERM_FREQS\t"+sumTotalTermFreqs+"\n");
+            writer.write("#UNIQUE_TERMS\t"+uniqueTerms+"\n");
+            writer.write("#TOKEN\tDOCFREQ\tTERMFREQ\n");
+            //add these tokens no matter what
+            for (String t : INCLUDE_LIST) {
+                writer.write(t);
+                writer.newLine();
+            }
+            for (TokenDFTF tp : queue.getArray()) {
+                writer.write(getRow(sb, tp)+"\n");
+            }
+            writer.flush();
+        }
+    }
+
+    private static String getRow(StringBuilder sb, TokenDFTF tp) {
+        sb.setLength(0);
+        sb.append(clean(tp.token));
+        sb.append("\t").append(tp.df);
+        sb.append("\t").append(tp.tf);
+        return sb.toString();
+    }
+
+    private static String clean(String s) {
+        if (s == null) {
+            return "";
+        }
+        return s.replaceAll("\\s+", " ").trim();
+    }
+
+    private abstract class AbstractTokenTFDFPriorityQueue extends PriorityQueue<TokenDFTF> {
+
+        AbstractTokenTFDFPriorityQueue(int maxSize) {
+            super(maxSize);
+        }
+
+        public TokenDFTF[] getArray() {
+            TokenDFTF[] topN = new TokenDFTF[size()];
+            //now we reverse the queue
+            TokenDFTF term = pop();
+            int i = topN.length-1;
+            while (term != null && i > -1) {
+                topN[i--] = term;
+                term = pop();
+            }
+            return topN;
+        }
+    }
+
+    private class TokenDFTF {
+
+        final String token;
+        final int df;
+        final long tf;
+
+        public TokenDFTF(String token, int df, long tf) {
+            this.token = token;
+            this.df = df;
+            this.tf = tf;
+        }
+
+
+        public long getTF() {
+            return tf;
+        }
+
+        public int getDF() {
+            return df;
+        }
+
+        public String getToken() {
+            return token;
+        }
+
+        @Override
+        public boolean equals(Object o) {
+            if (this == o) {
+                return true;
+            }
+            if (o == null || getClass() != o.getClass()) {
+                return false;
+            }
+
+            TokenDFTF tokenDFTF = (TokenDFTF) o;
+
+            if (df != tokenDFTF.df) {
+                return false;
+            }
+            if (tf != tokenDFTF.tf) {
+                return false;
+            }
+            return Objects.equals(token, tokenDFTF.token);
+        }
+
+        @Override
+        public int hashCode() {
+            int result = token != null ? token.hashCode() : 0;
+            result = 31 * result + df;
+            result = 31 * result + (int) (tf ^ (tf >>> 32));
+            return result;
+        }
+
+        @Override
+        public String toString() {
+            return "TokenDFTF{" +
+                    "token='" + token + '\'' +
+                    ", df=" + df +
+                    ", tf=" + tf +
+                    '}';
+        }
+    }
+
+    private class TokenDFPriorityQueue extends AbstractTokenTFDFPriorityQueue {
+
+        TokenDFPriorityQueue(int maxSize) {
+            super(maxSize);
+        }
+
+        @Override
+        protected boolean lessThan(TokenDFTF arg0, TokenDFTF arg1) {
+            if (arg0.df < arg1.df) {
+                return true;
+            } else if (arg0.df > arg1.df) {
+                return false;
+            }
+            return arg1.token.compareTo(arg0.token) < 0;
+        }
+
+        // getArray() is inherited unchanged from AbstractTokenTFDFPriorityQueue
+    }
+}
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/tools/TrainTestSplit.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/tools/TrainTestSplit.java
new file mode 100644
index 0000000..9df2144
--- /dev/null
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/tools/TrainTestSplit.java
@@ -0,0 +1,114 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval.app.tools;
+
+import java.io.BufferedOutputStream;
+import java.io.BufferedReader;
+import java.io.BufferedWriter;
+import java.io.File;
+import java.io.IOException;
+import java.io.OutputStream;
+import java.io.OutputStreamWriter;
+import java.io.Writer;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Random;
+
+import org.apache.commons.compress.compressors.gzip.GzipCompressorOutputStream;
+
+public class TrainTestSplit {
+
+    private static String TRAINING = "train";
+    private static String TESTING = "test";
+    private static String DEVTEST = "devtest";
+
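+    // split proportions: draws <= trainingP go to train, < trainingP + devTestP to devtest, the rest to test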
+    private float trainingP = 0.7f;
+    private float devTestP = 0.1f;
+    private float testP = 0.2f;
+    private Random random = new Random();
+
+    public static void main(String[] args) throws Exception {
+        Path leipzigDir = Paths.get(args[0]);
+        Path outputDir = Paths.get(args[1]);
+        TrainTestSplit splitter = new TrainTestSplit();
+        splitter.execute(leipzigDir, outputDir);
+    }
+
+    private void execute(Path leipzigDir, Path outputDir) throws Exception {
+        initOutDirs(outputDir);
+        for (File f : leipzigDir.toFile().listFiles()) {
+            if (f.isDirectory()) {
+                continue;
+            }
+            processFile(f, outputDir);
+        }
+    }
+
+    private void initOutDirs(Path outputDir) throws Exception {
+        for (String which : new String[] { TRAINING, DEVTEST, TESTING}) {
+            Path target = outputDir.resolve(which);
+            if (! Files.isDirectory(target)) {
+                Files.createDirectories(target);
+            }
+        }
+
+    }
+
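+    // split one input file: route each line to the train/devtest/test writer by a random draw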
+    private void processFile(File f, Path outputDir) throws Exception {
+        Map<String, BufferedWriter> writers = getWriters(outputDir, f);
+        System.err.println("working on " + f);
+        try (BufferedReader reader = Files.newBufferedReader(f.toPath(), StandardCharsets.UTF_8)) {
+            String line = reader.readLine();
+            while (line != null) {
+                float r = random.nextFloat();
+                if (r <= trainingP) {
+                    writers.get(TRAINING).write(line + "\n");
+                } else if (r < trainingP + devTestP) {
+                    writers.get(DEVTEST).write(line + "\n");
+                } else {
+                    writers.get(TESTING).write(line + "\n");
+                }
+                line = reader.readLine();
+            }
+        } finally {
+            //close the split writers even if reading fails
+            for (Writer w : writers.values()) {
+                w.flush();
+                w.close();
+            }
+        }
+    }
+
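+    // open one gzipped writer per split (train/devtest/test) for this input file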
+    private Map<String, BufferedWriter> getWriters(Path outputDir, File f) throws IOException {
+        Map<String, BufferedWriter> writers = new HashMap<>();
+        for (String which : new String[] { TRAINING, DEVTEST, TESTING}) {
+            writers.put(which, getWriter(outputDir, which, f));
+        }
+        return writers;
+    }
+
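+    // gzip-compressed UTF-8 writer for outputDir/<split>/<input file name>.gz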
+    private BufferedWriter getWriter(Path outputDir, String which, File f) throws IOException {
+        OutputStream os = new GzipCompressorOutputStream(
+                new BufferedOutputStream(
+                Files.newOutputStream(outputDir.resolve(which).resolve(f.getName()+".gz"))));
+        return new BufferedWriter(new OutputStreamWriter(os, StandardCharsets.UTF_8));
+    }
+}
diff --git a/tika-eval/tika-eval-app/src/main/resources/comparison-reports-pg.xml b/tika-eval/tika-eval-app/src/main/resources/comparison-reports-pg.xml
new file mode 100644
index 0000000..5bcf88e
--- /dev/null
+++ b/tika-eval/tika-eval-app/src/main/resources/comparison-reports-pg.xml
@@ -0,0 +1,1753 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no" ?>
+
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+-->
+
+<reports>
+
+
+    <before>
+
+        <sql>drop table if exists md5_multiples_tmp_a</sql>
+        <sql>create table md5_multiples_tmp_a (MD5, cnt)
+            as
+            select md5, count(1) as cnt
+            from profiles_a
+            where md5 is not null
+            group by md5
+            having count(1) &gt; 1
+            order by cnt desc
+        </sql>
+
+        <sql>drop table if exists md5_multiples_tmp_b</sql>
+        <sql>create table md5_multiples_tmp_b (MD5, cnt)
+            as
+            select md5, count(1) cnt
+            from profiles_b
+            where md5 is not null
+            group by md5
+            having count(1) &gt; 1
+            order by cnt desc
+        </sql>
+        <!-- build mime indexes -->
+
+        <sql>create index if not exists pa_m_idx
+            on profiles_a (mime_id);
+        </sql>
+
+        <sql>
+            create index if not exists pb_m_idx
+            on profiles_b (mime_id);
+        </sql>
+
+        <!-- build exceptions comparison table -->
+        <sql>drop table if exists exceptions_compared</sql>
+        <sql>
+            create table exceptions_compared (
+            mime_id_a integer,
+            mime_id_b integer,
+            total integer,
+            exc_cnt_a integer,
+            exc_cnt_b integer,
+            exc_prcnt_a float,
+            exc_prcnt_b float,
+            notes varchar(12)
+            );
+        </sql>
+        <sql>
+            insert into exceptions_compared (
+            select ma.mime_id, mb.mime_id, count(1) as total, 0, 0, 0.0, 0.0, ''
+            from profiles_a pa
+            join profiles_b pb on pa.id=pb.id
+            join mimes ma on pa.mime_id = ma.mime_id
+            join mimes mb on pb.mime_id = mb.mime_id
+            group by ma.mime_id, mb.mime_id
+            order by total desc );
+        </sql>
+
+        <sql>
+            update exceptions_compared ec set
+            exc_cnt_a = (
+            select count(1) as cnt
+            from exceptions_a ea
+            join profiles_a pa on ea.id=pa.id
+            join profiles_b pb on pb.id=pa.id
+            join mimes ma on pa.mime_id=ma.mime_id
+            join mimes mb on pb.mime_id=mb.mime_id
+            where ma.mime_id= ec.mime_id_a and mb.mime_id=ec.mime_id_b
+            group by ma.mime_id, mb.mime_id);
+        </sql>
+        <sql>
+            update exceptions_compared ec set
+            exc_cnt_b = (
+            select count(1) as cnt
+            from exceptions_b eb
+            join profiles_b pb on eb.id=pb.id
+            join profiles_a pa on pa.id=pb.id
+            join mimes ma on pa.mime_id=ma.mime_id
+            join mimes mb on pb.mime_id=mb.mime_id
+            where ma.mime_id= ec.mime_id_a and mb.mime_id=ec.mime_id_b
+            group by mb.mime_id, ma.mime_id);
+        </sql>
+        <sql>
+            update exceptions_compared
+            set exc_prcnt_a = cast(exc_cnt_a as decimal)/cast(total as decimal)
+            where total > 0;
+        </sql>
+        <sql>
+            update exceptions_compared
+            set exc_prcnt_b = cast(exc_cnt_b as decimal)/cast(total as decimal)
+            where total > 0;
+        </sql>
+
+        <sql>
+            update exceptions_compared
+            set notes = 'YAY!'
+            where total > 100 and (exc_prcnt_a-exc_prcnt_b) > 0.10;
+        </sql>
+        <sql>
+            update exceptions_compared
+            set notes = 'YIKES!'
+            where total > 100 and (exc_prcnt_b-exc_prcnt_a) > 0.10;
+        </sql>
+
+        <!-- build tmp common words table -->
+        <sql>drop table if exists token_counts_compared</sql>
+        <sql>
+            create table token_counts_compared
+            (mime_id_a integer,
+            mime_id_b integer,
+            num_tokens_a bigint default 0,
+            num_tokens_b bigint default 0,
+            num_alphabetic_tokens_a bigint default 0,
+            num_alphabetic_tokens_b bigint default 0,
+            num_common_tokens_a bigint default 0,
+            num_common_tokens_b bigint default 0
+            );
+        </sql>
+        <sql>
+            insert into token_counts_compared (mime_id_a, mime_id_b)
+            select ma.mime_id, mb.mime_id
+            from profiles_a a
+            join profiles_b b on a.id=b.id
+            join mimes ma on ma.mime_id=a.mime_id
+            join mimes mb on mb.mime_id=b.mime_id
+            group by ma.mime_id, mb.mime_id
+
+        </sql>
+
+        <sql>
+            update token_counts_compared tcc set num_tokens_a=(
+            select sum(num_tokens) as cnt from profiles_a pa
+            join profiles_b pb on pa.id=pb.id
+            join contents_a c on c.id = pa.id
+            where pb.mime_id= tcc.mime_id_b
+            and pa.mime_id=tcc.mime_id_a
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+
+        <sql>
+            update token_counts_compared tcc set num_tokens_b=(
+            select sum(num_tokens) as cnt from profiles_b pb
+            join profiles_a pa on pa.id=pb.id
+            join contents_b c on c.id = pb.id
+            where pb.mime_id= tcc.mime_id_b
+            and pa.mime_id=tcc.mime_id_a
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+
+        <sql>
+            update token_counts_compared tcc set num_alphabetic_tokens_a=(
+            select sum(num_alphabetic_tokens) as cnt from profiles_a pa
+            join profiles_b pb on pa.id=pb.id
+            join contents_a c on c.id = pa.id
+            where pb.mime_id= tcc.mime_id_b
+            and pa.mime_id=tcc.mime_id_a
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+
+        <sql>
+            update token_counts_compared tcc set num_alphabetic_tokens_b=(
+            select sum(num_alphabetic_tokens) as cnt from profiles_b pb
+            join profiles_a pa on pb.id=pa.id
+            join contents_b c on c.id = pb.id
+            where pb.mime_id= tcc.mime_id_b
+            and pa.mime_id=tcc.mime_id_a
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+
+        <sql>
+            update token_counts_compared tcc set num_common_tokens_a=(
+            select sum(num_common_tokens) as cnt from profiles_a pa
+            join profiles_b pb on pa.id=pb.id
+            join contents_a c on c.id = pa.id
+            where pb.mime_id= tcc.mime_id_b
+            and pa.mime_id=tcc.mime_id_a
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+
+        <sql>
+            update token_counts_compared tcc set num_common_tokens_b=(
+            select sum(num_common_tokens) as cnt from profiles_b pb
+            join profiles_a pa on pa.id=pb.id
+            join contents_b c on c.id = pb.id
+            where pb.mime_id= tcc.mime_id_b
+            and pa.mime_id=tcc.mime_id_a
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+
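+        <!-- build per-mime tag-count comparison table (a, b, div, i, img, li, ol, p, table, td, title, tr, u, ul) for runs A and B -->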
+        <sql>drop table if exists tags_by_mime</sql>
+        <sql>create table tags_by_mime (
+                mime_id_a integer,
+                mime_id_b integer,
+                tags_a_a integer,
+                tags_b_a integer,
+                tags_div_a integer,
+                tags_i_a integer,
+                tags_img_a integer,
+                tags_li_a integer,
+                tags_ol_a integer,
+                tags_p_a integer,
+                tags_table_a integer,
+                tags_td_a integer,
+                tags_title_a integer,
+                tags_tr_a integer,
+                tags_u_a integer,
+                tags_ul_a integer,
+                tags_a_b integer,
+                tags_b_b integer,
+                tags_div_b integer,
+                tags_i_b integer,
+                tags_img_b integer,
+                tags_li_b integer,
+                tags_ol_b integer,
+                tags_p_b integer,
+                tags_table_b integer,
+                tags_td_b integer,
+                tags_title_b integer,
+                tags_tr_b integer,
+                tags_u_b integer,
+                tags_ul_b integer
+            );
+        </sql>
+        <sql>
+            insert into tags_by_mime (mime_id_a, mime_id_b)
+            select ma.mime_id, mb.mime_id
+            from profiles_a a
+            join profiles_b b on a.id=b.id
+            join mimes ma on ma.mime_id=a.mime_id
+            join mimes mb on mb.mime_id=b.mime_id
+            group by ma.mime_id, mb.mime_id
+        </sql>
+        <sql>
+            update tags_by_mime tbm set tags_a_a=(
+            select sum(ta.tags_a) as cnt from tags_a ta
+            join tags_b tb on ta.id=tb.id
+            join profiles_a pa on pa.id=ta.id
+            join profiles_b pb on pa.id=pb.id
+            where pb.mime_id= tbm.mime_id_b
+            and pa.mime_id=tbm.mime_id_a
+            and ta.tags_parse_exception=false
+            and tb.tags_parse_exception=false
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+        <sql>
+            update tags_by_mime tbm set tags_b_a=(
+            select sum(ta.tags_b) as cnt from tags_a ta
+            join tags_b tb on ta.id=tb.id
+            join profiles_a pa on pa.id=ta.id
+            join profiles_b pb on pa.id=pb.id
+            where pb.mime_id= tbm.mime_id_b
+            and pa.mime_id=tbm.mime_id_a
+            and ta.tags_parse_exception=false
+            and tb.tags_parse_exception=false
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+        <sql>
+            update tags_by_mime tbm set tags_div_a=(
+            select sum(ta.tags_div) as cnt from tags_a ta
+            join tags_b tb on ta.id=tb.id
+            join profiles_a pa on pa.id=ta.id
+            join profiles_b pb on pa.id=pb.id
+            where pb.mime_id= tbm.mime_id_b
+            and pa.mime_id=tbm.mime_id_a
+            and ta.tags_parse_exception=false
+            and tb.tags_parse_exception=false
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+        <sql>
+            update tags_by_mime tbm set tags_i_a=(
+            select sum(ta.tags_i) as cnt from tags_a ta
+            join tags_b tb on ta.id=tb.id
+            join profiles_a pa on pa.id=ta.id
+            join profiles_b pb on pa.id=pb.id
+            where pb.mime_id= tbm.mime_id_b
+            and pa.mime_id=tbm.mime_id_a
+            and ta.tags_parse_exception=false
+            and tb.tags_parse_exception=false
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+        <sql>
+            update tags_by_mime tbm set tags_img_a=(
+            select sum(ta.tags_img) as cnt from tags_a ta
+            join tags_b tb on ta.id=tb.id
+            join profiles_a pa on pa.id=ta.id
+            join profiles_b pb on pa.id=pb.id
+            where pb.mime_id= tbm.mime_id_b
+            and pa.mime_id=tbm.mime_id_a
+            and ta.tags_parse_exception=false
+            and tb.tags_parse_exception=false
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+        <sql>
+            update tags_by_mime tbm set tags_li_a=(
+            select sum(ta.tags_li) as cnt from tags_a ta
+            join tags_b tb on ta.id=tb.id
+            join profiles_a pa on pa.id=ta.id
+            join profiles_b pb on pa.id=pb.id
+            where pb.mime_id= tbm.mime_id_b
+            and pa.mime_id=tbm.mime_id_a
+            and ta.tags_parse_exception=false
+            and tb.tags_parse_exception=false
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+        <sql>
+            update tags_by_mime tbm set tags_ol_a=(
+            select sum(ta.tags_ol) as cnt from tags_a ta
+            join tags_b tb on ta.id=tb.id
+            join profiles_a pa on pa.id=ta.id
+            join profiles_b pb on pa.id=pb.id
+            where pb.mime_id= tbm.mime_id_b
+            and pa.mime_id=tbm.mime_id_a
+            and ta.tags_parse_exception=false
+            and tb.tags_parse_exception=false
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+        <sql>
+            update tags_by_mime tbm set tags_p_a=(
+            select sum(ta.tags_p) as cnt from tags_a ta
+            join tags_b tb on ta.id=tb.id
+            join profiles_a pa on pa.id=ta.id
+            join profiles_b pb on pa.id=pb.id
+            where pb.mime_id= tbm.mime_id_b
+            and pa.mime_id=tbm.mime_id_a
+            and ta.tags_parse_exception=false
+            and tb.tags_parse_exception=false
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+        <sql>
+            update tags_by_mime tbm set tags_table_a=(
+            select sum(ta.tags_table) as cnt from tags_a ta
+            join tags_b tb on ta.id=tb.id
+            join profiles_a pa on pa.id=ta.id
+            join profiles_b pb on pa.id=pb.id
+            where pb.mime_id= tbm.mime_id_b
+            and pa.mime_id=tbm.mime_id_a
+            and ta.tags_parse_exception=false
+            and tb.tags_parse_exception=false
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+        <sql>
+            update tags_by_mime tbm set tags_td_a=(
+            select sum(ta.tags_td) as cnt from tags_a ta
+            join tags_b tb on ta.id=tb.id
+            join profiles_a pa on pa.id=ta.id
+            join profiles_b pb on pa.id=pb.id
+            where pb.mime_id= tbm.mime_id_b
+            and pa.mime_id=tbm.mime_id_a
+            and ta.tags_parse_exception=false
+            and tb.tags_parse_exception=false
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+        <sql>
+            update tags_by_mime tbm set tags_title_a=(
+            select sum(ta.tags_title) as cnt from tags_a ta
+            join tags_b tb on ta.id=tb.id
+            join profiles_a pa on pa.id=ta.id
+            join profiles_b pb on pa.id=pb.id
+            where pb.mime_id= tbm.mime_id_b
+            and pa.mime_id=tbm.mime_id_a
+            and ta.tags_parse_exception=false
+            and tb.tags_parse_exception=false
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+        <sql>
+            update tags_by_mime tbm set tags_tr_a=(
+            select sum(ta.tags_tr) as cnt from tags_a ta
+            join tags_b tb on ta.id=tb.id
+            join profiles_a pa on pa.id=ta.id
+            join profiles_b pb on pa.id=pb.id
+            where pb.mime_id= tbm.mime_id_b
+            and pa.mime_id=tbm.mime_id_a
+            and ta.tags_parse_exception=false
+            and tb.tags_parse_exception=false
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+        <sql>
+            update tags_by_mime tbm set tags_u_a=(
+            select sum(ta.tags_u) as cnt from tags_a ta
+            join tags_b tb on ta.id=tb.id
+            join profiles_a pa on pa.id=ta.id
+            join profiles_b pb on pa.id=pb.id
+            where pb.mime_id= tbm.mime_id_b
+            and pa.mime_id=tbm.mime_id_a
+            and ta.tags_parse_exception=false
+            and tb.tags_parse_exception=false
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+        <sql>
+            update tags_by_mime tbm set tags_ul_a=(
+            select sum(ta.tags_ul) as cnt from tags_a ta
+            join tags_b tb on ta.id=tb.id
+            join profiles_a pa on pa.id=ta.id
+            join profiles_b pb on pa.id=pb.id
+            where pb.mime_id= tbm.mime_id_b
+            and pa.mime_id=tbm.mime_id_a
+            and ta.tags_parse_exception=false
+            and tb.tags_parse_exception=false
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+        <!-- now update tags_b counts -->
+        <sql>
+            update tags_by_mime tbm set tags_a_b=(
+            select sum(tb.tags_a) as cnt from tags_b tb
+            join tags_a ta on ta.id=tb.id
+            join profiles_a pa on pa.id=ta.id
+            join profiles_b pb on pa.id=pb.id
+            where pb.mime_id= tbm.mime_id_b
+            and pa.mime_id=tbm.mime_id_a
+            and ta.tags_parse_exception=false
+            and tb.tags_parse_exception=false
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+        <sql>
+            update tags_by_mime tbm set tags_b_b=(
+            select sum(tb.tags_b) as cnt from tags_b tb
+            join tags_a ta on ta.id=tb.id
+            join profiles_a pa on pa.id=ta.id
+            join profiles_b pb on pa.id=pb.id
+            where pb.mime_id= tbm.mime_id_b
+            and pa.mime_id=tbm.mime_id_a
+            and ta.tags_parse_exception=false
+            and tb.tags_parse_exception=false
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+        <sql>
+            update tags_by_mime tbm set tags_div_b=(
+            select sum(tb.tags_div) as cnt from tags_b tb
+            join tags_a ta on tb.id=ta.id
+            join profiles_a pa on pa.id=ta.id
+            join profiles_b pb on pa.id=pb.id
+            where pb.mime_id= tbm.mime_id_b
+            and pa.mime_id=tbm.mime_id_a
+            and ta.tags_parse_exception=false
+            and tb.tags_parse_exception=false
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+        <sql>
+            update tags_by_mime tbm set tags_i_b=(
+            select sum(tb.tags_i) as cnt from tags_b tb
+            join tags_a ta on tb.id=ta.id
+            join profiles_a pa on pa.id=ta.id
+            join profiles_b pb on pa.id=pb.id
+            where pb.mime_id= tbm.mime_id_b
+            and pa.mime_id=tbm.mime_id_a
+            and ta.tags_parse_exception=false
+            and tb.tags_parse_exception=false
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+        <sql>
+            update tags_by_mime tbm set tags_img_b=(
+            select sum(tb.tags_img) as cnt from tags_b tb
+            join tags_a ta on tb.id=ta.id
+            join profiles_a pa on pa.id=ta.id
+            join profiles_b pb on pa.id=pb.id
+            where pb.mime_id= tbm.mime_id_b
+            and pa.mime_id=tbm.mime_id_a
+            and ta.tags_parse_exception=false
+            and tb.tags_parse_exception=false
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+        <sql>
+            update tags_by_mime tbm set tags_li_b=(
+            select sum(tb.tags_li) as cnt from tags_b tb
+            join tags_a ta on ta.id=tb.id
+            join profiles_a pa on pa.id=ta.id
+            join profiles_b pb on pa.id=pb.id
+            where pb.mime_id= tbm.mime_id_b
+            and pa.mime_id=tbm.mime_id_a
+            and ta.tags_parse_exception=false
+            and tb.tags_parse_exception=false
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+        <sql>
+            update tags_by_mime tbm set tags_ol_b=(
+            select sum(tb.tags_ol) as cnt from tags_b tb
+            join tags_a ta on tb.id=ta.id
+            join profiles_a pa on pa.id=ta.id
+            join profiles_b pb on pa.id=pb.id
+            where pb.mime_id= tbm.mime_id_b
+            and pa.mime_id=tbm.mime_id_a
+            and ta.tags_parse_exception=false
+            and tb.tags_parse_exception=false
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+        <sql>
+            update tags_by_mime tbm set tags_p_b=(
+            select sum(tb.tags_p) as cnt from tags_b tb
+            join tags_a ta on tb.id=ta.id
+            join profiles_a pa on pa.id=ta.id
+            join profiles_b pb on pa.id=pb.id
+            where pb.mime_id= tbm.mime_id_b
+            and pa.mime_id=tbm.mime_id_a
+            and ta.tags_parse_exception=false
+            and tb.tags_parse_exception=false
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+        <sql>
+            update tags_by_mime tbm set tags_table_b=(
+            select sum(tb.tags_table) as cnt from tags_b tb
+            join tags_a ta on tb.id=ta.id
+            join profiles_a pa on pa.id=ta.id
+            join profiles_b pb on pa.id=pb.id
+            where pb.mime_id= tbm.mime_id_b
+            and pa.mime_id=tbm.mime_id_a
+            and ta.tags_parse_exception=false
+            and tb.tags_parse_exception=false
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+        <sql>
+            update tags_by_mime tbm set tags_td_b=(
+            select sum(tb.tags_td) as cnt from tags_b tb
+            join tags_a ta on tb.id=ta.id
+            join profiles_a pa on pa.id=ta.id
+            join profiles_b pb on pa.id=pb.id
+            where pb.mime_id= tbm.mime_id_b
+            and pa.mime_id=tbm.mime_id_a
+            and ta.tags_parse_exception=false
+            and tb.tags_parse_exception=false
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+        <sql>
+            update tags_by_mime tbm set tags_title_b=(
+            select sum(tb.tags_title) as cnt from tags_b tb
+            join tags_a ta on tb.id=ta.id
+            join profiles_a pa on pa.id=ta.id
+            join profiles_b pb on pa.id=pb.id
+            where pb.mime_id= tbm.mime_id_b
+            and pa.mime_id=tbm.mime_id_a
+            and ta.tags_parse_exception=false
+            and tb.tags_parse_exception=false
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+        <sql>
+            update tags_by_mime tbm set tags_tr_b=(
+            select sum(tb.tags_tr) as cnt from tags_b tb
+            join tags_a ta on tb.id=ta.id
+            join profiles_a pa on pa.id=ta.id
+            join profiles_b pb on pa.id=pb.id
+            where pb.mime_id= tbm.mime_id_b
+            and pa.mime_id=tbm.mime_id_a
+            and ta.tags_parse_exception=false
+            and tb.tags_parse_exception=false
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+        <sql>
+            update tags_by_mime tbm set tags_u_b=(
+            select sum(tb.tags_u) as cnt from tags_b tb
+            join tags_a ta on tb.id=ta.id
+            join profiles_a pa on pa.id=ta.id
+            join profiles_b pb on pa.id=pb.id
+            where pb.mime_id= tbm.mime_id_b
+            and pa.mime_id=tbm.mime_id_a
+            and ta.tags_parse_exception=false
+            and tb.tags_parse_exception=false
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+        <sql>
+            update tags_by_mime tbm set tags_ul_b=(
+            select sum(tb.tags_ul) as cnt from tags_b tb
+            join tags_a ta on ta.id=tb.id
+            join profiles_a pa on pa.id=ta.id
+            join profiles_b pb on pa.id=pb.id
+            where pb.mime_id= tbm.mime_id_b
+            and pa.mime_id=tbm.mime_id_a
+            and ta.tags_parse_exception=false
+            and tb.tags_parse_exception=false
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
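+        <!-- build per-mime counts of tag parse exceptions in runs A and B -->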
+        <sql>drop table if exists tag_exceptions_by_mime</sql>
+        <sql>create table tag_exceptions_by_mime (
+            mime_id_a integer,
+            mime_id_b integer,
+            tag_exceptions_a integer,
+            tag_exceptions_b integer)
+        </sql>
+        <sql>
+            insert into tag_exceptions_by_mime (mime_id_a, mime_id_b,
+                tag_exceptions_a, tag_exceptions_b)
+            select ma.mime_id, mb.mime_id,0,0
+            from profiles_a a
+            join profiles_b b on a.id=b.id
+            join mimes ma on ma.mime_id=a.mime_id
+            join mimes mb on mb.mime_id=b.mime_id
+            group by ma.mime_id, mb.mime_id
+        </sql>
+        <sql>
+            update tag_exceptions_by_mime tebm set tag_exceptions_a=(
+            select count(1) as cnt from tags_a ta
+            join profiles_a pa on pa.id=ta.id
+            join profiles_b pb on pa.id=pb.id
+            where pb.mime_id= tebm.mime_id_b
+            and pa.mime_id=tebm.mime_id_a
+            and ta.tags_parse_exception=true
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+        <sql>
+            update tag_exceptions_by_mime tebm set tag_exceptions_b=(
+            select count(1) as cnt from tags_b tb
+            join profiles_a pa on pa.id=tb.id
+            join profiles_b pb on pa.id=pb.id
+            where pb.mime_id= tebm.mime_id_b
+            and pa.mime_id=tebm.mime_id_a
+            and tb.tags_parse_exception=true
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
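+        <!-- compare total parse times per mime pair for runs A and B -->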
+        <sql>
+            drop table if exists parse_time_compared;
+        </sql>
+        <sql>
+            create table parse_time_compared (
+            mime_id_a integer,
+            mime_id_b integer,
+            total_a bigint,
+            total_b bigint,
+            prcnt_increase double precision
+            );
+        </sql>
+            <sql>
+                insert into parse_time_compared (mime_id_a, mime_id_b,
+                total_a, total_b, prcnt_increase)
+                select ma.mime_id, mb.mime_id,0,0,0.0
+                from profiles_a a
+                join profiles_b b on a.id=b.id
+                join mimes ma on ma.mime_id=a.mime_id
+                join mimes mb on mb.mime_id=b.mime_id
+                group by ma.mime_id, mb.mime_id
+            </sql>
+        <sql>
+            update parse_time_compared ptc set total_a=(
+            select sum(pa.elapsed_time_millis) as total_a from profiles_a pa
+            join profiles_b pb on pa.id=pb.id
+            where pa.mime_id= ptc.mime_id_a
+            and pb.mime_id=ptc.mime_id_b
+            group by mime_id_a, mime_id_b)
+        </sql>
+        <sql>
+            update parse_time_compared ptc set total_b=(
+            select sum(pb.elapsed_time_millis) as total_b from profiles_b pb
+            join profiles_a pa on pa.id=pb.id
+            where pa.mime_id= ptc.mime_id_a
+            and pb.mime_id=ptc.mime_id_b
+            group by mime_id_a, mime_id_b)
+        </sql>
+        <sql>
+            update parse_time_compared ptc set prcnt_increase=(100.0 *
+            cast(total_b as decimal)/cast(total_a as decimal))
+            where total_a > 0;
+        </sql>
+    </before>
+
+    <!-- MIMES -->
+    <report reportName="All Mimes In A"
+            reportFilename="mimes/all_mimes_A.xlsx"
+            format="xlsx"
+            includeSql="true">
+
+        <sql>
+            select mime_string, count(1) cnt from
+            profiles_a p
+            join mimes m on m.mime_id = p.mime_id
+            group by mime_string
+            order by cnt desc
+        </sql>
+    </report>
+
+    <report reportName="All Mimes In B"
+            reportFilename="mimes/all_mimes_B.xlsx"
+            format="xlsx"
+            includeSql="true">
+
+        <sql>
+            select mime_string, count(1) cnt from
+            profiles_b p
+            join mimes m on m.mime_id = p.mime_id
+            group by mime_string
+            order by cnt desc
+        </sql>
+    </report>
+    <report reportName="Container Mimes In A"
+            reportFilename="mimes/container_mimes_A.xlsx"
+            format="xlsx"
+            includeSql="true">
+
+        <sql>
+            select mime_string, count(1) cnt from
+            profiles_a p
+            join mimes m on m.mime_id = p.mime_id
+            where is_embedded=false
+            group by mime_string
+            order by cnt desc
+        </sql>
+    </report>
+
+    <report reportName="Container Mimes In B"
+            reportFilename="mimes/container_mimes_B.xlsx"
+            format="xlsx"
+            includeSql="true">
+
+        <sql>
+            select mime_string, count(1) cnt from
+            profiles_b p
+            join mimes m on m.mime_id = p.mime_id
+            where is_embedded=false
+            group by mime_string
+            order by cnt desc
+        </sql>
+    </report>
+    <report reportName="Embedded Mimes In A"
+            reportFilename="mimes/embedded_mimes_A.xlsx"
+            format="xlsx"
+            includeSql="true">
+
+        <sql>
+            select mime_string, count(1) cnt from
+            profiles_a p
+            join mimes m on m.mime_id = p.mime_id
+            where is_embedded=true
+            group by mime_string
+            order by cnt desc
+        </sql>
+    </report>
+
+    <report reportName="Embedded Mimes In B"
+            reportFilename="mimes/embedded_mimes_B.xlsx"
+            format="xlsx"
+            includeSql="true">
+
+        <sql>
+            select mime_string, count(1) cnt from
+            profiles_b p
+            join mimes m on m.mime_id = p.mime_id
+            where is_embedded=true
+            group by mime_string
+            order by cnt desc
+        </sql>
+    </report>
+    <report reportName="Mime Differences A -> B"
+            reportFilename="mimes/mime_diffs_A_to_B.xlsx"
+            format="xlsx"
+            includeSql="true">
+
+        <sql>
+            select concat(ma.mime_string, ' -&gt; ', mb.mime_string) as
+            MIME_A_TO_MIME_B, count(1) as COUNT
+            from profiles_a a
+            join profiles_b b on a.id=b.id
+            join mimes ma on ma.mime_id=a.mime_id
+            join mimes mb on mb.mime_id=b.mime_id
+            where a.mime_id &lt;&gt; b.mime_id
+            group by MIME_A_TO_MIME_B
+            order by COUNT DESC
+        </sql>
+    </report>
+
+    <report reportName="Mime Differences A -> B Details"
+            reportFilename="mimes/mime_diffs_A_to_B_details.xlsx"
+            format="xlsx"
+            includeSql="true">
+
+        <sql>
+            select concat(ma.mime_string, ' -&gt; ', mb.mime_string) as
+            MIME_A_TO_MIME_B,
+            file_path,
+            c.length as CONTAINER_LENGTH,
+            a.file_name
+            from profiles_a a
+            join profiles_b b on a.id=b.id
+            join mimes ma on ma.mime_id=a.mime_id
+            join mimes mb on mb.mime_id=b.mime_id
+            join containers c on a.container_id=c.container_id
+            where a.mime_id &lt;&gt; b.mime_id
+            order by MIME_A_TO_MIME_B
+        </sql>
+    </report>
+
+
+    <!-- Exceptions -->
+    <report reportName="AllExceptionsByMimeA"
+            reportFilename="exceptions/exceptions_by_mime_A.xlsx"
+            format="xlsx"
+            includeSql="true">
+
+        <sql>
+            select mime_string, count(1) cnt from
+            exceptions_a e
+            join profiles_a p on p.id=e.id
+            join mimes m on m.mime_id = p.mime_id
+            group by mime_string
+            order by cnt desc
+        </sql>
+    </report>
+    <report reportName="AllExceptionsByMimeB"
+            reportFilename="exceptions/exceptions_by_mime_B.xlsx"
+            format="xlsx"
+            includeSql="true">
+
+        <sql>
+            select mime_string, count(1) cnt from
+            exceptions_b e
+            join profiles_b p on p.id=e.id
+            join mimes m on m.mime_id = p.mime_id
+            group by mime_string
+            order by cnt desc
+        </sql>
+    </report>
+
+    <report reportName="ContainerExceptionsByMimeA"
+            reportFilename="exceptions/container_exceptions_by_mime_A.xlsx"
+            format="xlsx"
+            includeSql="true">
+
+        <sql>
+            select mime_string, count(1) cnt from
+            exceptions_a e
+            join profiles_a p on p.id=e.id
+            join mimes m on m.mime_id = p.mime_id
+            where is_embedded=false
+            and parse_exception_id=0
+            group by mime_string
+            order by cnt desc
+        </sql>
+    </report>
+
+    <report reportName="ContainerExceptionsByMimeB"
+            reportFilename="exceptions/container_exceptions_by_mime_B.xlsx"
+            format="xlsx"
+            includeSql="true">
+
+        <sql>
+            select mime_string, count(1) cnt from
+            exceptions_b e
+            join profiles_b p on p.id=e.id
+            join mimes m on m.mime_id = p.mime_id
+            where is_embedded=false
+            and parse_exception_id=0
+            group by mime_string
+            order by cnt desc
+        </sql>
+    </report>
+    <report reportName="AllExceptionsByMimeByTypeA"
+            reportFilename="exceptions/exceptions_by_mime_by_type_A.xlsx"
+            format="xlsx"
+            includeSql="true">
+
+        <sql>
+            select mime_string as MIME_TYPE,
+            parse_exception_description as EXCEPTION_TYPE, count(1) as COUNT
+            from exceptions_a e
+            join profiles_a p on p.id=e.id
+            join containers c on p.container_id=c.container_id
+            join mimes m on m.mime_id=p.mime_id
+            join ref_parse_exception_types r on
+            r.parse_exception_id=e.parse_exception_id
+            group by m.mime_string, parse_exception_description
+            order by MIME_TYPE, EXCEPTION_TYPE
+        </sql>
+    </report>
+
+    <report reportName="AllExceptionsByMimeByTypeB"
+            reportFilename="exceptions/exceptions_by_mime_by_type_B.xlsx"
+            format="xlsx"
+            includeSql="true">
+
+        <sql>
+            select mime_string as MIME_TYPE,
+            parse_exception_description as EXCEPTION_TYPE, count(1) as COUNT
+            from exceptions_b e
+            join profiles_b p on p.id=e.id
+            join containers c on p.container_id=c.container_id
+            join mimes m on m.mime_id=p.mime_id
+            join ref_parse_exception_types r on
+            r.parse_exception_id=e.parse_exception_id
+            group by m.mime_string, parse_exception_description
+            order by MIME_TYPE, EXCEPTION_TYPE
+        </sql>
+    </report>
+
+    <report reportName="TextLostFromACausedByNewExceptionsInB"
+            reportFilename="exceptions/text_lost_from_A_caused_by_new_exceptions_in_B.xlsx"
+            format="xlsx"
+            includeSql="true">
+
+        <sql>
+            select file_path as FILE_PATH,
+            c.length as CONTAINER_LENGTH,
+            ca.NUM_TOKENS as NUM_TOKENS_A,
+            cb.NUM_TOKENS as NUM_TOKENS_B,
+            ca.NUM_UNIQUE_TOKENS as NUM_UNIQUE_TOKENS_A, cb.NUM_UNIQUE_TOKENS
+            as NUM_UNIQUE_TOKENS_B,
+            ca.common_tokens_lang as COMMON_TOKENS_LANG_A,
+            ca.num_common_tokens as NUM_COMMON_TOKENS_A,
+            cb.common_tokens_lang as COMMON_TOKENS_LANG_B,
+            cb.num_common_tokens as NUM_COMMON_TOKENS_B,
+            ca.top_n_tokens as TOP_N_TOKENS_A, cb.top_n_tokens as TOP_N_TOKENS_B,
+            eb.ORIG_STACK_TRACE as ORIG_STACK_TRACE_B
+            from contents_a ca
+            join profiles_a pa on ca.id = pa.id
+            join containers c on pa.container_id=c.container_id
+            left join contents_b cb on ca.id=cb.id
+            left join exceptions_b eb on ca.id = eb.id
+            left join exceptions_a ea on ca.id = ea.id
+            where eb.orig_stack_trace is not null
+            and ea.orig_stack_trace is null
+            order by ca.num_common_tokens - coalesce(cb.num_common_tokens,0) desc
+        </sql>
+    </report>
+
+    <report reportName="FixedExceptionsInBByMimeType"
+            reportFilename="exceptions/fixed_exceptions_in_B_by_mime.xlsx"
+            format="xlsx"
+            includeSql="true">
+
+        <sql>
+            select
+            ma.mime_string as MIME_TYPE_A,
+            mb.mime_string as MIME_TYPE_B,
+            count(1) as COUNT
+            from exceptions_a ea
+            left join exceptions_b eb on ea.id = eb.id
+            join profiles_a pa on pa.id=ea.id
+            join profiles_b pb on pa.id=pb.id
+            join containers c on pa.container_id=c.container_id
+            join mimes ma on ma.mime_id=pa.mime_id
+            join mimes mb on mb.mime_id=pb.mime_id
+            where eb.id is null
+            and ea.parse_exception_id=0
+            group by mime_type_a, mime_type_b
+        </sql>
+    </report>
+
+    <report reportName="FixedExceptionsInByDetails"
+            reportFilename="exceptions/fixed_exceptions_in_B_details.xlsx"
+            format="xlsx"
+            includeSql="true">
+        <sql>
+            select
+            file_path,
+            c.length as CONTAINER_LENGTH,
+            ma.mime_string as MIME_TYPE_A,
+            mb.mime_string as MIME_TYPE_B,
+            pa.file_name, pa.is_embedded
+            from exceptions_a ea
+            left join exceptions_b eb on ea.id = eb.id
+            join profiles_a pa on pa.id=ea.id
+            join profiles_b pb on pb.id=pa.id --this ensures that files were actually processed in both runs
+            join containers c on pa.container_id=c.container_id
+            join mimes ma on ma.mime_id=pa.mime_id
+            join mimes mb on mb.mime_id=pb.mime_id
+            where eb.id is null
+            and ea.parse_exception_id=0
+            order by mime_type_a, mime_type_b
+        </sql>
+    </report>
+    <report reportName="ContentsOfFixedExceptionsInB"
+            reportFilename="exceptions/contents_of_fixed_exceptions_in_B.xlsx"
+            format="xlsx"
+            includeSql="true">
+
+        <sql>
+            select file_path,
+            c.length as CONTAINER_LENGTH,
+            ma.mime_string as MIME_TYPE_A,
+            mb.mime_string as MIME_TYPE_B,
+            CONTENT_LENGTH,
+            NUM_TOKENS, NUM_UNIQUE_TOKENS,
+            TOP_N_TOKENS, LANG_ID_1,TOKEN_LENGTH_MEAN, TOKEN_LENGTH_STD_DEV
+            from exceptions_a ea
+            left join exceptions_b eb on ea.id = eb.id
+            join profiles_a pa on pa.id=ea.id
+            join profiles_b pb on pa.id=pb.id
+            join contents_b cb on cb.id=ea.id
+            join containers c on pa.container_id=c.container_id
+            join mimes ma on ma.mime_id=pa.mime_id
+            join mimes mb on mb.mime_id=pb.mime_id
+            where eb.id is null
+            and ea.parse_exception_id=0
+        </sql>
+    </report>
+
+    <report reportName="NewExceptionsByMimeType"
+            reportFilename="exceptions/new_exceptions_in_B_by_mime.xlsx"
+            format="xlsx"
+            includeSql="true">
+
+        <sql>
+            select ma.mime_string as MIME_TYPE_A, mb.mime_string as MIME_TYPE_B, count(1) as COUNT
+            from exceptions_b eb
+            left join exceptions_a ea on ea.id = eb.id
+            join profiles_a pa on pa.id=eb.id
+            join profiles_b pb on pb.id=pa.id
+            join containers c on pa.container_id=c.container_id
+            join mimes ma on ma.mime_id=pa.mime_id
+            join mimes mb on mb.mime_id=pb.mime_id
+            where ea.id is null
+            and eb.parse_exception_id=0
+            group by ma.mime_string, mb.mime_string
+            order by COUNT desc
+        </sql>
+    </report>
+
+    <report reportName="NewExceptionsInBByMimeTypeByStackTrace"
+            reportFilename="exceptions/new_exceptions_in_B_by_mime_by_stack_trace.xlsx"
+            format="xlsx"
+            includeSql="true">
+
+        <sql>
+            select
+            ma.MIME_STRING as MIME_TYPE_A,
+            mb.MIME_STRING as MIME_TYPE_B,
+            eb.sort_stack_trace, count(1) as
+            COUNT
+            from exceptions_b eb
+            left join exceptions_a ea on ea.id = eb.id
+            join profiles_a pa on pa.id=eb.id
+            join profiles_b pb on pb.id=eb.id
+            join mimes ma on ma.mime_id=pa.mime_id
+            join mimes mb on mb.mime_id=pb.mime_id
+            where ea.id is null
+            and eb.parse_exception_id=0
+            group by MIME_TYPE_A, MIME_TYPE_B, eb.sort_stack_trace
+            order by MIME_TYPE_A asc, MIME_TYPE_B asc, COUNT desc
+        </sql>
+    </report>
+
+    <report reportName="NewExceptionsInBDetails"
+            reportFilename="exceptions/new_exceptions_in_B_details.xlsx"
+            format="xlsx"
+            includeSql="true">
+
+        <sql>
+            select file_path,
+            c.length as CONTAINER_LENGTH,
+            ma.mime_string as MIME_TYPE_A,
+            mb.mime_string as MIME_TYPE_B,
+            eb.orig_stack_trace, eb.sort_stack_trace
+            from exceptions_b eb
+            left join exceptions_a ea on ea.id = eb.id
+            join profiles_a pa on pa.id=eb.id
+            join profiles_b pb on pb.id=eb.id
+            join containers c on pa.container_id=c.container_id
+            join mimes ma on ma.mime_id=pa.mime_id
+            join mimes mb on mb.mime_id=pb.mime_id
+            where ea.id is null
+            and eb.parse_exception_id=0
+            order by MIME_TYPE_A asc, MIME_TYPE_B asc, eb.ORIG_STACK_TRACE
+        </sql>
+    </report>
+
+    <report reportName="StackTracesByMimeInA"
+            reportFilename="exceptions/stack_traces_by_mime_A.xlsx"
+            format="xlsx"
+            includeSql="true">
+
+        <sql>
+            select MIME_STRING as MIME_TYPE, e.sort_stack_trace, count(1) as
+            COUNT
+            from exceptions_a e
+            join profiles_a p on p.id=e.id
+            join mimes m on m.mime_id=p.mime_id
+            and e.parse_exception_id=0
+            group by MIME_TYPE, e.sort_stack_trace
+            order by MIME_TYPE asc, COUNT desc
+        </sql>
+    </report>
+
+    <report reportName="AllStackTracesInA"
+            reportFilename="exceptions/stack_traces_A.xlsx"
+            format="xlsx"
+            includeSql="true">
+
+        <sql>
+            select file_path,
+            c.length as CONTAINER_LENGTH,
+            mime_string as MIME_TYPE,
+            orig_stack_trace, sort_stack_trace
+            from exceptions_a e
+            join profiles_a p on p.id=e.id
+            join containers c on p.container_id=c.container_id
+            join mimes m on m.mime_id=p.mime_id
+            and e.parse_exception_id=0
+            order by MIME_TYPE asc, sort_stack_trace, orig_stack_trace,
+            CONTAINER_LENGTH asc
+        </sql>
+    </report>
+    <report reportName="AllStackTracesInB"
+            reportFilename="exceptions/stack_traces_B.xlsx"
+            format="xlsx"
+            includeSql="true">
+
+        <sql>
+            select file_path,
+            c.length as CONTAINER_LENGTH,
+            mime_string as MIME_TYPE,
+            orig_stack_trace, sort_stack_trace
+            from exceptions_b e
+            join profiles_b p on p.id=e.id
+            join containers c on p.container_id=c.container_id
+            join mimes m on m.mime_id=p.mime_id
+            and e.parse_exception_id=0
+            order by MIME_TYPE asc, sort_stack_trace, orig_stack_trace,
+            CONTAINER_LENGTH asc
+        </sql>
+    </report>
+
+    <report reportName="StackTracesByMimeInB"
+            reportFilename="exceptions/stack_traces_by_mime_B.xlsx"
+            format="xlsx"
+            includeSql="true">
+
+        <sql>
+            select MIME_STRING as MIME_TYPE, e.sort_stack_trace, count(1) as
+            COUNT
+            from exceptions_b e
+            join profiles_b p on p.id=e.id
+            join mimes m on m.mime_id=p.mime_id
+            and e.parse_exception_id=0
+            group by MIME_TYPE, e.sort_stack_trace
+            order by MIME_TYPE asc, COUNT desc
+        </sql>
+    </report>
+    <report reportName="extractExceptionsA"
+            reportFilename="exceptions/extract_exceptions_a.xlsx"
+            format="xlsx"
+            includeSql="true">
+        <sql>
+            select file_path, extract_exception_description
+            from extract_exceptions_a e
+            join ref_extract_exception_types t
+            on e.extract_exception_id=t.extract_exception_id
+        </sql>
+    </report>
+    <report reportName="extractExceptionsB"
+            reportFilename="exceptions/extract_exceptions_b.xlsx"
+            format="xlsx"
+            includeSql="true">
+        <sql>
+            select file_path, extract_exception_description
+            from extract_exceptions_b e
+            join ref_extract_exception_types t
+            on e.extract_exception_id=t.extract_exception_id
+        </sql>
+    </report>
+    <report reportName="parseExceptionTypesA"
+            reportFilename="exceptions/overall_exception_types_a.xlsx"
+            format="xlsx"
+            includeSql="true">
+        <sql>
+            select parse_exception_description, count(1)
+            from exceptions_a e
+            join ref_parse_exception_types t on
+            t.parse_exception_id=e.parse_exception_id
+            group by t.parse_exception_description
+        </sql>
+    </report>
+    <report reportName="parseExceptionTypesB"
+            reportFilename="exceptions/overall_exception_types_b.xlsx"
+            format="xlsx"
+            includeSql="true">
+        <sql>
+            select parse_exception_description, count(1)
+            from exceptions_b e
+            join ref_parse_exception_types t on
+            t.parse_exception_id=e.parse_exception_id
+            group by t.parse_exception_description
+        </sql>
+    </report>
+
+    <report reportName="contentDiffsWExceptions"
+            reportFilename="content/content_diffs_with_exceptions.xlsx"
+            format="xlsx"
+            includeSql="true">
+
+        <sql>
+            select file_path,
+            c.length as CONTAINER_LENGTH,
+            ma.mime_string as MIME_STRING_A,
+            mb.mime_string as MIME_STRING_B,
+            ca.num_unique_tokens as NUM_UNIQUE_TOKENS_A,
+            cb.num_unique_tokens as NUM_UNIQUE_TOKENS_B,
+            ca.num_tokens as NUM_TOKENS_A,
+            cb.num_tokens as NUM_TOKENS_B,
+            ca.common_tokens_lang as COMMON_TOKENS_LANG_A,
+            ca.num_common_tokens as NUM_COMMON_TOKENS_A,
+            cb.common_tokens_lang as COMMON_TOKENS_LANG_B,
+            cb.num_common_tokens as NUM_COMMON_TOKENS_B,
+            coalesce(cb.num_common_tokens,0)-
+            coalesce(ca.num_common_tokens, 0) as NUM_COMMON_TOKENS_DIFF_IN_B,
+            ca.top_n_tokens as TOP_N_TOKENS_A,
+            cb.top_n_tokens as TOP_N_TOKENS_B,
+            ca.unicode_char_blocks as UNICODE_CHAR_BLOCKS_A,
+            cb.unicode_char_blocks as UNICODE_CHAR_BLOCKS_B,
+            top_10_unique_token_diffs_a,
+            top_10_unique_token_diffs_b,
+            top_10_more_in_a, top_10_more_in_b, dice_coefficient, overlap,
+            ref_ea.parse_exception_description as EXCEPTION_A,
+            ref_eb.parse_exception_description as EXCEPTION_B
+            from content_comparisons cc
+            join contents_a ca on ca.id=cc.id
+            left join contents_b cb on cb.id=cc.id
+            join profiles_a pa on pa.id = cc.id
+            join profiles_b pb on pb.id=cc.id
+            join containers c on c.container_id=pa.container_id
+            join mimes ma on ma.mime_id=pa.mime_id
+            join mimes mb on mb.mime_id=pb.mime_id
+            left join exceptions_a ea on ea.id=cc.id
+            left join exceptions_b eb on eb.id=cc.id
+            left join ref_parse_exception_types ref_ea on ref_ea.parse_exception_id=ea.parse_exception_id
+            left join ref_parse_exception_types ref_eb on ref_eb.parse_exception_id=eb.parse_exception_id
+            where (overlap &lt; 0.95 or abs(ca.NUM_TOKENS-cb.NUM_TOKENS) &gt;30)
+            and (ea.parse_exception_id is null or
+            ea.parse_exception_id &lt;&gt; 2)
+            and (eb.parse_exception_id is null or
+            eb.parse_exception_id &lt;&gt; 2)
+            order by ma.mime_string, overlap asc
+            limit 100000
+        </sql>
+    </report>
+    <report reportName="contentDiffsNoExceptions"
+            reportFilename="content/content_diffs_no_exceptions.xlsx"
+            format="xlsx"
+            includeSql="true">
+
+        <sql>
+            select file_path,
+            c.length as CONTAINER_LENGTH,
+            ma.mime_string as MIME_STRING_A,
+            mb.mime_string as MIME_STRING_B,
+            ca.NUM_UNIQUE_TOKENS as NUM_UNIQUE_TOKENS_A,
+            cb.NUM_UNIQUE_TOKENS as NUM_UNIQUE_TOKENS_B,
+            ca.NUM_TOKENS as NUM_TOKENS_A,
+            cb.NUM_TOKENS as NUM_TOKENS_B,
+            ca.common_tokens_lang as COMMON_TOKENS_LANG_A,
+            ca.num_common_tokens as NUM_COMMON_TOKENS_A,
+            cb.common_tokens_lang as COMMON_TOKENS_LANG_B,
+            cb.num_common_tokens as NUM_COMMON_TOKENS_B,
+            coalesce(cb.num_common_tokens,0)-
+            coalesce(ca.num_common_tokens, 0) as NUM_COMMON_TOKENS_DIFF_IN_B,
+            ca.top_n_tokens as TOP_N_TOKENS_A,
+            cb.top_n_tokens as TOP_N_TOKENS_B,
+            ca.unicode_char_blocks as UNICODE_CHAR_BLOCKS_A,
+            cb.unicode_char_blocks as UNICODE_CHAR_BLOCKS_B,
+            top_10_unique_token_diffs_a,
+            top_10_unique_token_diffs_b,
+            top_10_more_in_a, top_10_more_in_b, dice_coefficient, overlap
+            from content_comparisons cc
+            join contents_a ca on ca.id=cc.id
+            join contents_b cb on cb.id=cc.id
+            join profiles_a pa on pa.id = cc.id
+            join profiles_b pb on pb.id=cc.id
+            join containers c on c.container_id=pa.container_id
+            join mimes ma on ma.mime_id=pa.mime_id
+            join mimes mb on mb.mime_id=pb.mime_id
+            left join exceptions_a ea on ea.id=cc.id
+            left join exceptions_b eb on eb.id=cc.id
+            where (overlap &lt; 0.95 or abs(ca.NUM_TOKENS-cb.NUM_TOKENS) &gt;30)
+            and (ea.parse_exception_id is null)
+            and (eb.parse_exception_id is null)
+            order by ma.mime_string, overlap asc
+            limit 100000
+        </sql>
+    </report>
+
+    <report reportName="CommonTokenComparisonsByMimeType"
+            reportFilename="content/common_token_comparisons_by_mime.xlsx"
+            format="xlsx"
+            includeSql="true">
+
+        <sql>
+            select ma.mime_string as MIME_STRING_A, mb.mime_string as MIME_STRING_B,
+            num_tokens_a, num_tokens_b,
+            num_alphabetic_tokens_a, num_alphabetic_tokens_b,
+            num_common_tokens_a, num_common_tokens_b,
+            coalesce(num_common_tokens_b, 0)-coalesce(num_common_tokens_a, 0) as change_in_common_tokens_b
+            from token_counts_compared tcc
+            join mimes ma on tcc.mime_id_a = ma.mime_id
+            join mimes mb on tcc.mime_id_b = mb.mime_id
+            order by change_in_common_tokens_b desc
+        </sql>
+    </report>
+    <report reportName="PageCountDiffs"
+            reportFilename="content/page_count_diffs.xlsx"
+            format="xlsx"
+            includeSql="true">
+
+        <sql>
+            select file_path,
+            c.length as CONTAINER_LENGTH,
+            ma.mime_string as MIME_STRING_A,
+            mb.mime_string as MIME_STRING_B,
+            pa.num_pages as NUM_PAGES_A,
+            pb.num_pages as NUM_PAGES_B,
+            (pb.num_pages-pa.num_pages) as DIFF_NUM_PAGES_IN_B
+            from profiles_a pa
+            join profiles_b pb on pa.id = pb.id
+            join containers c on pa.container_id=c.container_id
+            join mimes ma on ma.mime_id=pa.mime_id
+            join mimes mb on mb.mime_id=pb.mime_id
+            where pa.num_pages is not null
+            and pb.num_pages is not null
+            and pa.num_pages &lt;&gt; pb.num_pages
+            order by DIFF_NUM_PAGES_IN_B asc
+            limit 10000;
+        </sql>
+    </report>
+
+
+    <report reportName="ExceptionComparisonsByMimeType"
+            reportFilename="exceptions/exceptions_compared_by_mime_type.xlsx"
+            format="xlsx"
+            includeSql="true">
+
+        <sql>
+            select ma.mime_string as mime_string_a, mb.mime_string as mime_string_b,
+            total, exc_cnt_a,
+            exc_cnt_b,
+            exc_prcnt_a,
+            exc_prcnt_b, notes
+
+            from exceptions_compared e
+            join mimes ma on ma.mime_id=e.mime_id_a
+            join mimes mb on mb.mime_id=e.mime_id_b
+            order by (exc_prcnt_b-exc_prcnt_a) desc, total desc;
+        </sql>
+    </report>
+    <!--    <report reportName="MD5 Duplicate Counts A"
+                reportFilename="md5/md5_duplicate_counts_A.xlsx"
+                format="xlsx"
+                            includeSql="true">
+            <sql>
+                select md5, count(1) cnt
+                from profiles_a
+                group by md5
+                having cnt > 2
+                order by cnt desc
+            </sql>
+        </report>
+
+        <report reportName="MD5 Duplicate Counts B"
+                reportFilename="md5/md5_duplicate_counts_B.xlsx"
+                format="xlsx"
+                            includeSql="true">
+
+            <sql>
+                select md5, count(1) cnt
+                from profiles_b
+                group by md5
+                having cnt > 2
+                order by cnt desc
+            </sql>
+        </report>
+
+        <report reportName="MD5 Duplicates A"
+                reportFilename="md5/md5_duplicates_A.xlsx"
+                format="xlsx"
+                            includeSql="true">
+
+            <sql>
+                select file_path, file_name, is_embedded, content_length, NUM_TOKENS, p.md5
+                from md5_multiples_tmp_a t
+                join profiles_a p on p.md5 = t.md5
+                join containers c on p.container_id = c.container_id
+                join contents_a cb on p.id=cb.id
+                order by t.cnt desc
+            </sql>
+        </report>
+
+        <report reportName="MD5 Duplicates B"
+                reportFilename="md5/md5_duplicates_B.xlsx"
+                format="xlsx"
+                            includeSql="true">
+
+            <sql>
+                select file_path, file_name, is_embedded, content_length, NUM_TOKENS, p.md5
+                from md5_multiples_tmp_b t
+                join profiles_b p on p.md5 = t.md5
+                join containers c on p.container_id = c.container_id
+                join contents_b cb on p.id=cb.id
+                order by t.cnt desc
+            </sql>
+        </report>
+    -->
+
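+    <!-- attachments -->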
+    <report reportName="Attachment Diffs no Exceptions"
+            reportFilename="attachments/attachment_diffs_no_exceptions.xlsx"
+            format="xlsx"
+            includeSql="true">
+
+        <sql>
+            select file_path,
+            c.length as CONTAINER_LENGTH,
+            ma.mime_string as MIME_STRING_A,
+            mb.mime_string as MIME_STRING_B,
+            pa.num_attachments as NUM_ATTACHMENTS_A,
+            pb.num_attachments as NUM_ATTACHMENTS_B,
+            pb.num_attachments-pa.num_attachments as NUM_ATTACHMENTS_DIFF_IN_B
+            from profiles_a pa
+            join profiles_b pb on pa.id= pb.id
+            join containers c on pa.container_id=c.container_id
+            join mimes ma on pa.mime_id=ma.mime_id
+            join mimes mb on pb.mime_id=mb.mime_id
+            left join exceptions_a ea on ea.id=pa.id
+            left join exceptions_b eb on eb.id=pb.id
+            where pa.is_embedded=false and
+            ea.parse_exception_id is null and
+            eb.parse_exception_id is null
+            and pa.num_attachments &lt;&gt; pb.num_attachments
+            order by ma.mime_string, pb.num_attachments-pa.num_attachments
+            limit 100000;
+        </sql>
+    </report>
+
+    <report reportName="Attachment Diffs with exceptions"
+            reportFilename="attachments/attachment_diffs_with_exceptions.xlsx"
+            format="xlsx"
+            includeSql="true">
+
+        <sql>
+            select file_path,
+            c.length as CONTAINER_LENGTH,
+            ma.mime_string as MIME_STRING_A,
+            mb.mime_string as MIME_STRING_B,
+            pa.num_attachments as NUM_ATTACHMENTS_A,
+            pb.num_attachments as NUM_ATTACHMENTS_B,
+            pb.num_attachments-pa.num_attachments as NUM_ATTACHMENTS_DIFF_IN_B,
+            refea.parse_exception_description as PARSE_EXCEPTION_A,
+            refeb.parse_exception_description as PARSE_EXCEPTION_B
+            from profiles_a pa
+            join profiles_b pb on pa.id= pb.id
+            join containers c on pa.container_id=c.container_id
+            join mimes ma on pa.mime_id=ma.mime_id
+            join mimes mb on pb.mime_id=mb.mime_id
+            left join exceptions_a ea on ea.id=pa.id
+            left join exceptions_b eb on eb.id=pb.id
+            left join ref_parse_exception_types refea on ea.parse_exception_id=refea.parse_exception_id
+            left join ref_parse_exception_types refeb on eb.parse_exception_id=refeb.parse_exception_id
+            where pa.is_embedded=false
+            and pa.num_attachments &lt;&gt; pb.num_attachments
+            order by ma.mime_string, pb.num_attachments-pa.num_attachments
+            limit 100000;
+        </sql>
+    </report>
+
+    <report reportName="Files missing in B by Mime"
+            reportFilename="attachments/all_files_missing_in_B_by_mime.xlsx"
+            format="xlsx"
+            includeSql="true">
+
+        <sql>
+            select mime_string, count(1) as cnt
+            from profiles_a pa
+            left join profiles_b pb on pa.id=pb.id
+            join mimes m on pa.mime_id=m.mime_id
+            where pb.id is null
+            group by mime_string
+            order by cnt desc
+        </sql>
+    </report>
+    <report reportName="Container files missing in B by Mime"
+            reportFilename="attachments/container_files_missing_in_B_by_mime.xlsx"
+            format="xlsx"
+            includeSql="true">
+
+        <sql>
+            select mime_string, count(1) as cnt
+            from profiles_a pa
+            left join profiles_b pb on pa.id=pb.id
+            join mimes m on pa.mime_id=m.mime_id
+            where pb.id is null and pa.is_embedded=false
+            group by mime_string
+            order by cnt desc
+        </sql>
+    </report>
+    <report reportName="Embedded files missing in B by Mime"
+            reportFilename="attachments/embedded_files_missing_in_B_by_mime.xlsx"
+            format="xlsx"
+            includeSql="true">
+
+        <sql>
+            select mime_string, count(1) as cnt
+            from profiles_a pa
+            left join profiles_b pb on pa.id=pb.id
+            join mimes m on pa.mime_id=m.mime_id
+            where pb.id is null and pa.is_embedded=true
+            group by mime_string
+            order by cnt desc
+        </sql>
+    </report>
+    <report reportName="All files missing in A by Mime"
+            reportFilename="attachments/all_files_missing_in_A_by_mime.xlsx"
+            format="xlsx"
+            includeSql="true">
+
+        <sql>
+            select mime_string, count(1) as cnt
+            from profiles_b pb
+            left join profiles_a pa on pb.id=pa.id
+            join mimes m on pb.mime_id=m.mime_id
+            where pa.id is null
+            group by mime_string
+            order by cnt desc
+        </sql>
+    </report>
+    <report reportName="Container files missing in A by Mime"
+            reportFilename="attachments/container_files_missing_in_A_by_mime.xlsx"
+            format="xlsx"
+            includeSql="true">
+
+        <sql>
+            select mime_string, count(1) as cnt
+            from profiles_b pb
+            left join profiles_a pa on pb.id=pa.id
+            join mimes m on pb.mime_id=m.mime_id
+            where pa.id is null and pb.is_embedded=false
+            group by mime_string
+            order by cnt desc
+        </sql>
+    </report>
+    <report reportName="Embedded files missing in A by Mime"
+            reportFilename="attachments/embedded_files_missing_in_A_by_mime.xlsx"
+            format="xlsx"
+            includeSql="true">
+
+        <sql>
+            select mime_string, count(1) as cnt
+            from profiles_b pb
+            left join profiles_a pa on pb.id=pa.id
+            join mimes m on pb.mime_id=m.mime_id
+            where pa.id is null and pb.is_embedded=true
+            group by mime_string
+            order by cnt desc
+        </sql>
+    </report>
+    <!-- metadata values -->
+    <report reportName="Metadata Value Diffs"
+            reportFilename="metadata/metadata_value_count_diffs.xlsx"
+            format="xlsx"
+            includeSql="true">
+
+        <sql>
+            select file_path,
+            ma.mime_string as mime_string_a,
+            mb.mime_string as mime_string_b,
+            pa.num_metadata_values as num_metadata_values_a,
+            pb.num_metadata_values as num_metadata_values_b,
+            ea.parse_exception_id as parse_ex_id_a,
+            eb.parse_exception_id as parse_ex_id_b
+            from profiles_a pa
+            join profiles_b pb on pa.id= pb.id
+            join containers c on pa.container_id=c.container_id
+            join mimes ma on pa.mime_id=ma.mime_id
+            join mimes mb on pb.mime_id=mb.mime_id
+            left join exceptions_a ea on ea.id=pa.id
+            left join exceptions_b eb on eb.id=pb.id
+            where
+            ea.parse_exception_id is null and
+            eb.parse_exception_id is null
+            and pa.num_metadata_values &lt;&gt; pb.num_metadata_values
+            order by ma.mime_string,
+            pb.num_metadata_values-pa.num_metadata_values
+            limit 100000
+        </sql>
+    </report>
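+    <!-- tags -->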
+    <report reportName="Tag Count Diffs By Mime"
+            reportFilename="tags/tag_count_diffs_by_mime.xlsx"
+            format="xlsx"
+            includeSql="true">
+        <sql>
+            select ma.mime_string as mime_string_a,
+            mb.mime_string as mime_string_b,
+            tags_a_a,
+            tags_a_b,
+            tags_b_a,
+            tags_b_b,
+            tags_div_a,
+            tags_div_b,
+            tags_i_a,
+            tags_i_b,
+            tags_li_a,
+            tags_li_b,
+            tags_ol_a,
+            tags_ol_b,
+            tags_p_a,
+            tags_p_b,
+            tags_table_a,
+            tags_table_b,
+            tags_td_a,
+            tags_td_b,
+            tags_title_a,
+            tags_title_b,
+            tags_tr_a,
+            tags_tr_b,
+            tags_u_a,
+            tags_u_b,
+            tags_ul_a,
+            tags_ul_b
+            from
+            tags_by_mime tbm
+            join mimes ma on tbm.mime_id_a=ma.mime_id
+            join mimes mb on tbm.mime_id_b=mb.mime_id
+            limit 100000
+        </sql>
+
+    </report>
+    <report reportName="Tag Exceptions By Mime"
+            reportFilename="tags/tag_exceptions_by_mime.xlsx"
+            format="xlsx"
+            includeSql="true">
+        <sql>
+            select ma.mime_string as mime_string_a,
+            mb.mime_string as mime_string_b,
+            tag_exceptions_a,
+            tag_exceptions_b,
+            (tag_exceptions_b-tag_exceptions_a) as diff_tag_exceptions_in_b
+            from tag_exceptions_by_mime tebm
+            join mimes ma on tebm.mime_id_a=ma.mime_id
+            join mimes mb on tebm.mime_id_b=mb.mime_id
+            order by diff_tag_exceptions_in_b desc
+        </sql>
+    </report>
+    <report reportName="Tag Exceptions Details A"
+            reportFilename="tags/tag_exceptions_details_a.xlsx"
+            format="xlsx"
+            includeSql="true">
+        <sql>
+            select c.file_path,pa.file_name,mime_string,is_embedded from
+            tags_a ta
+            join profiles_a pa on ta.id=pa.id
+            join containers c on pa.container_id=c.container_id
+            join mimes m on pa.mime_id=m.mime_id
+            where ta.tags_parse_exception=true
+            order by m.mime_string
+            limit 20000
+        </sql>
+    </report>
+    <report reportName="Tag Exceptions Details B"
+            reportFilename="tags/tag_exceptions_details_b.xlsx"
+            format="xlsx"
+            includeSql="true">
+        <sql>
+            select c.file_path,pb.file_name,mime_string,is_embedded from
+            tags_b tb
+            join profiles_b pb on tb.id=pb.id
+            join containers c on pb.container_id=c.container_id
+            join mimes m on pb.mime_id=m.mime_id
+            where tb.tags_parse_exception=true
+            order by m.mime_string
+            limit 20000
+        </sql>
+    </report>
+
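+    <!-- parse times -->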
+    <report reportName="Parse Time (Millis) Compared"
+            reportFilename="parse_times/parse_time_millis_by_mime_compared.xlsx"
+            format="xlsx"
+            includeSql="true">
+        <sql>
+            select ma.mime_string as MIME_STRING_A,
+            mb.mime_string as MIME_STRING_B,
+            total_a as TOTAL_MILLIS_A, total_b as TOTAL_MILLIS_B,
+            prcnt_increase as PERCENT_INCREASE
+            from parse_time_compared ptc
+            join mimes ma on ptc.mime_id_a=ma.mime_id
+            join mimes mb on ptc.mime_id_b=mb.mime_id
+            where TOTAL_A &gt; 1000 AND TOTAL_B &gt; 1000 -- only show comparisons if &gt; a second
+            order by prcnt_increase desc
+        </sql>
+    </report>
+    <report reportName="Parse Time (Millis) Details"
+            reportFilename="parse_times/parse_time_millis_details.xlsx"
+            format="xlsx"
+            includeSql="true">
+        <sql>
+            select file_path, c.length as CONTAINER_LENGTH,
+            ma.mime_string as MIME_STRING_A,
+            mb.mime_string as MIME_STRING_B,
+            pa.elapsed_time_millis as TOTAL_MILLIS_A,
+            pb.elapsed_time_millis as TOTAL_MILLIS_B,
+            (pb.elapsed_time_millis-pa.elapsed_time_millis) as DIFF_MILLIS
+            from profiles_a pa
+            join profiles_b pb on pa.id=pb.id
+            join mimes ma on ma.mime_id=pa.mime_id
+            join mimes mb on mb.mime_id=pb.mime_id
+            join containers c on pa.container_id=c.container_id
+            order by DIFF_MILLIS desc
+            limit 20000;
+        </sql>
+    </report>
+    <after>
+        <sql>drop table if exists md5_multiples_tmp_a</sql>
+        <sql>drop table if exists md5_multiples_tmp_b</sql>
+    </after>
+</reports>
\ No newline at end of file
diff --git a/tika-eval/tika-eval-app/src/main/resources/comparison-reports.xml b/tika-eval/tika-eval-app/src/main/resources/comparison-reports.xml
new file mode 100644
index 0000000..e23ec5e
--- /dev/null
+++ b/tika-eval/tika-eval-app/src/main/resources/comparison-reports.xml
@@ -0,0 +1,1753 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no" ?>
+
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+-->
+
+<reports>
+
+
+    <before>
+
+        <sql>drop table if exists md5_multiples_tmp_a</sql>
+        <sql>create table md5_multiples_tmp_a (MD5 char(32), cnt int)
+            as
+            select md5, count(1) cnt
+            from profiles_a
+            where md5 is not null
+            group by md5
+            having cnt &gt; 1
+            order by cnt desc
+        </sql>
+
+        <sql>drop table if exists md5_multiples_tmp_b</sql>
+        <sql>create table md5_multiples_tmp_b (MD5 char(32), cnt int)
+            as
+            select md5, count(1) cnt
+            from profiles_b
+            where md5 is not null
+            group by md5
+            having cnt &gt; 1
+            order by cnt desc
+        </sql>
+        <!-- build mime indexes -->
+
+        <sql>create index if not exists pa_m_idx
+            on profiles_a (mime_id);
+        </sql>
+
+        <sql>
+            create index if not exists pb_m_idx
+            on profiles_b (mime_id);
+        </sql>
+
+        <!-- build exceptions comparison table -->
+        <sql>drop table if exists exceptions_compared</sql>
+        <sql>
+            create table exceptions_compared (
+            mime_id_a integer,
+            mime_id_b integer,
+            total integer,
+            exc_cnt_a integer,
+            exc_cnt_b integer,
+            exc_prcnt_a float,
+            exc_prcnt_b float,
+            notes varchar(12)
+            );
+        </sql>
+        <sql>
+            insert into exceptions_compared (
+            select ma.mime_id, mb.mime_id, count(1) as total, 0, 0, 0.0, 0.0, ''
+            from profiles_a pa
+            join profiles_b pb on pa.id=pb.id
+            join mimes ma on pa.mime_id = ma.mime_id
+            join mimes mb on pb.mime_id = mb.mime_id
+            group by ma.mime_id, mb.mime_id
+            order by total desc );
+        </sql>
+
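+        <!-- fill per-mime exception counts for runs A and B via correlated
+             subqueries, then compute exception percentages and flag
+             large improvements ('YAY!') or regressions ('YIKES!') -->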
+        <sql>
+            update exceptions_compared ec set
+            exc_cnt_a = (
+            select count(1) as cnt
+            from exceptions_a ea
+            join profiles_a pa on ea.id=pa.id
+            join profiles_b pb on pb.id=pa.id
+            join mimes ma on pa.mime_id=ma.mime_id
+            join mimes mb on pb.mime_id=mb.mime_id
+            where ma.mime_id= ec.mime_id_a and mb.mime_id=ec.mime_id_b
+            group by ma.mime_id, mb.mime_id);
+        </sql>
+        <sql>
+            update exceptions_compared ec set
+            exc_cnt_b = (
+            select count(1) as cnt
+            from exceptions_b eb
+            join profiles_b pb on eb.id=pb.id
+            join profiles_a pa on pa.id=pb.id
+            join mimes ma on pa.mime_id=ma.mime_id
+            join mimes mb on pb.mime_id=mb.mime_id
+            where ma.mime_id= ec.mime_id_a and mb.mime_id=ec.mime_id_b
+            group by mb.mime_id, ma.mime_id);
+        </sql>
+        <sql>
+            update exceptions_compared
+            set exc_prcnt_a = cast(exc_cnt_a as decimal)/cast(total as decimal)
+            where total > 0;
+        </sql>
+        <sql>
+            update exceptions_compared
+            set exc_prcnt_b = cast(exc_cnt_b as decimal)/cast(total as decimal)
+            where total > 0;
+        </sql>
+
+        <sql>
+            update exceptions_compared
+            set notes = 'YAY!'
+            where total > 100 and (exc_prcnt_a-exc_prcnt_b) > 0.10;
+        </sql>
+        <sql>
+            update exceptions_compared
+            set notes = 'YIKES!'
+            where total > 100 and (exc_prcnt_b-exc_prcnt_a) > 0.10;
+        </sql>
+
+        <!-- build tmp common words table -->
+        <sql>drop table if exists token_counts_compared</sql>
+        <sql>
+            create table token_counts_compared
+            (mime_id_a integer,
+            mime_id_b integer,
+            num_tokens_a long default 0,
+            num_tokens_b long default 0,
+            num_alphabetic_tokens_a long default 0,
+            num_alphabetic_tokens_b long default 0,
+            num_common_tokens_a long default 0,
+            num_common_tokens_b long default 0
+            );
+        </sql>
+        <sql>
+            insert into token_counts_compared (mime_id_a, mime_id_b)
+            select ma.mime_id, mb.mime_id
+            from profiles_a a
+            join profiles_b b on a.id=b.id
+            join mimes ma on ma.mime_id=a.mime_id
+            join mimes mb on mb.mime_id=b.mime_id
+            group by ma.mime_id, mb.mime_id
+
+        </sql>
+
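+        <!-- fill the token count columns (total, alphabetic, common) for
+             runs A and B, one correlated update per column -->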
+        <sql>
+            update token_counts_compared tcc set num_tokens_a=(
+            select sum(num_tokens) as cnt from profiles_a pa
+            join profiles_b pb on pa.id=pb.id
+            join contents_a c on c.id = pa.id
+            where pb.mime_id= tcc.mime_id_b
+            and pa.mime_id=tcc.mime_id_a
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+
+        <sql>
+            update token_counts_compared tcc set num_tokens_b=(
+            select sum(num_tokens) as cnt from profiles_b pb
+            join profiles_a pa on pa.id=pb.id
+            join contents_b c on c.id = pb.id
+            where pb.mime_id= tcc.mime_id_b
+            and pa.mime_id=tcc.mime_id_a
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+
+        <sql>
+            update token_counts_compared tcc set num_alphabetic_tokens_a=(
+            select sum(num_alphabetic_tokens) as cnt from profiles_a pa
+            join profiles_b pb on pa.id=pb.id
+            join contents_a c on c.id = pa.id
+            where pb.mime_id= tcc.mime_id_b
+            and pa.mime_id=tcc.mime_id_a
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+
+        <sql>
+            update token_counts_compared tcc set num_alphabetic_tokens_b=(
+            select sum(num_alphabetic_tokens) as cnt from profiles_b pb
+            join profiles_a pa on pb.id=pa.id
+            join contents_b c on c.id = pb.id
+            where pb.mime_id= tcc.mime_id_b
+            and pa.mime_id=tcc.mime_id_a
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+
+        <sql>
+            update token_counts_compared tcc set num_common_tokens_a=(
+            select sum(num_common_tokens) as cnt from profiles_a pa
+            join profiles_b pb on pa.id=pb.id
+            join contents_a c on c.id = pa.id
+            where pb.mime_id= tcc.mime_id_b
+            and pa.mime_id=tcc.mime_id_a
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+
+        <sql>
+            update token_counts_compared tcc set num_common_tokens_b=(
+            select sum(num_common_tokens) as cnt from profiles_b pb
+            join profiles_a pa on pa.id=pb.id
+            join contents_b c on c.id = pb.id
+            where pb.mime_id= tcc.mime_id_b
+            and pa.mime_id=tcc.mime_id_a
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+
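+        <!-- build per-mime tag count comparison table -->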
+        <sql>drop table if exists tags_by_mime</sql>
+        <sql>create table tags_by_mime (
+                mime_id_a integer,
+                mime_id_b integer,
+                tags_a_a integer,
+                tags_b_a integer,
+                tags_div_a integer,
+                tags_i_a integer,
+                tags_img_a integer,
+                tags_li_a integer,
+                tags_ol_a integer,
+                tags_p_a integer,
+                tags_table_a integer,
+                tags_td_a integer,
+                tags_title_a integer,
+                tags_tr_a integer,
+                tags_u_a integer,
+                tags_ul_a integer,
+                tags_a_b integer,
+                tags_b_b integer,
+                tags_div_b integer,
+                tags_i_b integer,
+                tags_img_b integer,
+                tags_li_b integer,
+                tags_ol_b integer,
+                tags_p_b integer,
+                tags_table_b integer,
+                tags_td_b integer,
+                tags_title_b integer,
+                tags_tr_b integer,
+                tags_u_b integer,
+                tags_ul_b integer
+            );
+        </sql>
+        <sql>
+            insert into tags_by_mime (mime_id_a, mime_id_b)
+            select ma.mime_id, mb.mime_id
+            from profiles_a a
+            join profiles_b b on a.id=b.id
+            join mimes ma on ma.mime_id=a.mime_id
+            join mimes mb on mb.mime_id=b.mime_id
+            group by ma.mime_id, mb.mime_id
+        </sql>
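+        <!-- fill tag counts for run A, one correlated update per tag column;
+             rows with a tag parse exception in either run are excluded -->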
+        <sql>
+            update tags_by_mime tbm set tags_a_a=(
+            select sum(ta.tags_a) as cnt from tags_a ta
+            join tags_b tb on ta.id=tb.id
+            join profiles_a pa on pa.id=ta.id
+            join profiles_b pb on pa.id=pb.id
+            where pb.mime_id= tbm.mime_id_b
+            and pa.mime_id=tbm.mime_id_a
+            and ta.tags_parse_exception=false
+            and tb.tags_parse_exception=false
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+        <sql>
+            update tags_by_mime tbm set tags_b_a=(
+            select sum(ta.tags_b) as cnt from tags_a ta
+            join tags_b tb on ta.id=tb.id
+            join profiles_a pa on pa.id=ta.id
+            join profiles_b pb on pa.id=pb.id
+            where pb.mime_id= tbm.mime_id_b
+            and pa.mime_id=tbm.mime_id_a
+            and ta.tags_parse_exception=false
+            and tb.tags_parse_exception=false
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+        <sql>
+            update tags_by_mime tbm set tags_div_a=(
+            select sum(ta.tags_div) as cnt from tags_a ta
+            join tags_b tb on ta.id=tb.id
+            join profiles_a pa on pa.id=ta.id
+            join profiles_b pb on pa.id=pb.id
+            where pb.mime_id= tbm.mime_id_b
+            and pa.mime_id=tbm.mime_id_a
+            and ta.tags_parse_exception=false
+            and tb.tags_parse_exception=false
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+        <sql>
+            update tags_by_mime tbm set tags_i_a=(
+            select sum(ta.tags_i) as cnt from tags_a ta
+            join tags_b tb on ta.id=tb.id
+            join profiles_a pa on pa.id=ta.id
+            join profiles_b pb on pa.id=pb.id
+            where pb.mime_id= tbm.mime_id_b
+            and pa.mime_id=tbm.mime_id_a
+            and ta.tags_parse_exception=false
+            and tb.tags_parse_exception=false
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+        <sql>
+            update tags_by_mime tbm set tags_img_a=(
+            select sum(ta.tags_img) as cnt from tags_a ta
+            join tags_b tb on ta.id=tb.id
+            join profiles_a pa on pa.id=ta.id
+            join profiles_b pb on pa.id=pb.id
+            where pb.mime_id= tbm.mime_id_b
+            and pa.mime_id=tbm.mime_id_a
+            and ta.tags_parse_exception=false
+            and tb.tags_parse_exception=false
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+        <sql>
+            update tags_by_mime tbm set tags_li_a=(
+            select sum(ta.tags_li) as cnt from tags_a ta
+            join tags_b tb on ta.id=tb.id
+            join profiles_a pa on pa.id=ta.id
+            join profiles_b pb on pa.id=pb.id
+            where pb.mime_id= tbm.mime_id_b
+            and pa.mime_id=tbm.mime_id_a
+            and ta.tags_parse_exception=false
+            and tb.tags_parse_exception=false
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+        <sql>
+            update tags_by_mime tbm set tags_ol_a=(
+            select sum(ta.tags_ol) as cnt from tags_a ta
+            join tags_b tb on ta.id=tb.id
+            join profiles_a pa on pa.id=ta.id
+            join profiles_b pb on pa.id=pb.id
+            where pb.mime_id= tbm.mime_id_b
+            and pa.mime_id=tbm.mime_id_a
+            and ta.tags_parse_exception=false
+            and tb.tags_parse_exception=false
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+        <sql>
+            update tags_by_mime tbm set tags_p_a=(
+            select sum(ta.tags_p) as cnt from tags_a ta
+            join tags_b tb on ta.id=tb.id
+            join profiles_a pa on pa.id=ta.id
+            join profiles_b pb on pa.id=pb.id
+            where pb.mime_id= tbm.mime_id_b
+            and pa.mime_id=tbm.mime_id_a
+            and ta.tags_parse_exception=false
+            and tb.tags_parse_exception=false
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+        <sql>
+            update tags_by_mime tbm set tags_table_a=(
+            select sum(ta.tags_table) as cnt from tags_a ta
+            join tags_b tb on ta.id=tb.id
+            join profiles_a pa on pa.id=ta.id
+            join profiles_b pb on pa.id=pb.id
+            where pb.mime_id= tbm.mime_id_b
+            and pa.mime_id=tbm.mime_id_a
+            and ta.tags_parse_exception=false
+            and tb.tags_parse_exception=false
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+        <sql>
+            update tags_by_mime tbm set tags_td_a=(
+            select sum(ta.tags_td) as cnt from tags_a ta
+            join tags_b tb on ta.id=tb.id
+            join profiles_a pa on pa.id=ta.id
+            join profiles_b pb on pa.id=pb.id
+            where pb.mime_id= tbm.mime_id_b
+            and pa.mime_id=tbm.mime_id_a
+            and ta.tags_parse_exception=false
+            and tb.tags_parse_exception=false
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+        <sql>
+            update tags_by_mime tbm set tags_title_a=(
+            select sum(ta.tags_title) as cnt from tags_a ta
+            join tags_b tb on ta.id=tb.id
+            join profiles_a pa on pa.id=ta.id
+            join profiles_b pb on pa.id=pb.id
+            where pb.mime_id= tbm.mime_id_b
+            and pa.mime_id=tbm.mime_id_a
+            and ta.tags_parse_exception=false
+            and tb.tags_parse_exception=false
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+        <sql>
+            update tags_by_mime tbm set tags_tr_a=(
+            select sum(ta.tags_tr) as cnt from tags_a ta
+            join tags_b tb on ta.id=tb.id
+            join profiles_a pa on pa.id=ta.id
+            join profiles_b pb on pa.id=pb.id
+            where pb.mime_id= tbm.mime_id_b
+            and pa.mime_id=tbm.mime_id_a
+            and ta.tags_parse_exception=false
+            and tb.tags_parse_exception=false
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+        <sql>
+            update tags_by_mime tbm set tags_u_a=(
+            select sum(ta.tags_u) as cnt from tags_a ta
+            join tags_b tb on ta.id=tb.id
+            join profiles_a pa on pa.id=ta.id
+            join profiles_b pb on pa.id=pb.id
+            where pb.mime_id= tbm.mime_id_b
+            and pa.mime_id=tbm.mime_id_a
+            and ta.tags_parse_exception=false
+            and tb.tags_parse_exception=false
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+        <sql>
+            update tags_by_mime tbm set tags_ul_a=(
+            select sum(ta.tags_ul) as cnt from tags_a ta
+            join tags_b tb on ta.id=tb.id
+            join profiles_a pa on pa.id=ta.id
+            join profiles_b pb on pa.id=pb.id
+            where pb.mime_id= tbm.mime_id_b
+            and pa.mime_id=tbm.mime_id_a
+            and ta.tags_parse_exception=false
+            and tb.tags_parse_exception=false
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+        <!-- now update tags_b counts -->
+        <sql>
+            update tags_by_mime tbm set tags_a_b=(
+            select sum(tb.tags_a) as cnt from tags_b tb
+            join tags_a ta on ta.id=tb.id
+            join profiles_a pa on pa.id=ta.id
+            join profiles_b pb on pa.id=pb.id
+            where pb.mime_id= tbm.mime_id_b
+            and pa.mime_id=tbm.mime_id_a
+            and ta.tags_parse_exception=false
+            and tb.tags_parse_exception=false
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+        <sql>
+            update tags_by_mime tbm set tags_b_b=(
+            select sum(tb.tags_b) as cnt from tags_b tb
+            join tags_a ta on ta.id=tb.id
+            join profiles_a pa on pa.id=ta.id
+            join profiles_b pb on pa.id=pb.id
+            where pb.mime_id= tbm.mime_id_b
+            and pa.mime_id=tbm.mime_id_a
+            and ta.tags_parse_exception=false
+            and tb.tags_parse_exception=false
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+        <sql>
+            update tags_by_mime tbm set tags_div_b=(
+            select sum(tb.tags_div) as cnt from tags_b tb
+            join tags_a ta on tb.id=ta.id
+            join profiles_a pa on pa.id=ta.id
+            join profiles_b pb on pa.id=pb.id
+            where pb.mime_id= tbm.mime_id_b
+            and pa.mime_id=tbm.mime_id_a
+            and ta.tags_parse_exception=false
+            and tb.tags_parse_exception=false
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+        <sql>
+            update tags_by_mime tbm set tags_i_b=(
+            select sum(tb.tags_i) as cnt from tags_b tb
+            join tags_a ta on tb.id=ta.id
+            join profiles_a pa on pa.id=ta.id
+            join profiles_b pb on pa.id=pb.id
+            where pb.mime_id= tbm.mime_id_b
+            and pa.mime_id=tbm.mime_id_a
+            and ta.tags_parse_exception=false
+            and tb.tags_parse_exception=false
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+        <sql>
+            update tags_by_mime tbm set tags_img_b=(
+            select sum(tb.tags_img) as cnt from tags_b tb
+            join tags_a ta on tb.id=ta.id
+            join profiles_a pa on pa.id=ta.id
+            join profiles_b pb on pa.id=pb.id
+            where pb.mime_id= tbm.mime_id_b
+            and pa.mime_id=tbm.mime_id_a
+            and ta.tags_parse_exception=false
+            and tb.tags_parse_exception=false
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+        <sql>
+            update tags_by_mime tbm set tags_li_b=(
+            select sum(tb.tags_li) as cnt from tags_b tb
+            join tags_a ta on ta.id=tb.id
+            join profiles_a pa on pa.id=ta.id
+            join profiles_b pb on pa.id=pb.id
+            where pb.mime_id= tbm.mime_id_b
+            and pa.mime_id=tbm.mime_id_a
+            and ta.tags_parse_exception=false
+            and tb.tags_parse_exception=false
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+        <sql>
+            update tags_by_mime tbm set tags_ol_b=(
+            select sum(tb.tags_ol) as cnt from tags_b tb
+            join tags_a ta on tb.id=ta.id
+            join profiles_a pa on pa.id=ta.id
+            join profiles_b pb on pa.id=pb.id
+            where pb.mime_id= tbm.mime_id_b
+            and pa.mime_id=tbm.mime_id_a
+            and ta.tags_parse_exception=false
+            and tb.tags_parse_exception=false
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+        <sql>
+            update tags_by_mime tbm set tags_p_b=(
+            select sum(tb.tags_p) as cnt from tags_b tb
+            join tags_a ta on tb.id=ta.id
+            join profiles_a pa on pa.id=ta.id
+            join profiles_b pb on pa.id=pb.id
+            where pb.mime_id= tbm.mime_id_b
+            and pa.mime_id=tbm.mime_id_a
+            and ta.tags_parse_exception=false
+            and tb.tags_parse_exception=false
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+        <sql>
+            update tags_by_mime tbm set tags_table_b=(
+            select sum(tb.tags_table) as cnt from tags_b tb
+            join tags_a ta on tb.id=ta.id
+            join profiles_a pa on pa.id=ta.id
+            join profiles_b pb on pa.id=pb.id
+            where pb.mime_id= tbm.mime_id_b
+            and pa.mime_id=tbm.mime_id_a
+            and ta.tags_parse_exception=false
+            and tb.tags_parse_exception=false
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+        <sql>
+            update tags_by_mime tbm set tags_td_b=(
+            select sum(tb.tags_td) as cnt from tags_b tb
+            join tags_a ta on tb.id=ta.id
+            join profiles_a pa on pa.id=ta.id
+            join profiles_b pb on pa.id=pb.id
+            where pb.mime_id= tbm.mime_id_b
+            and pa.mime_id=tbm.mime_id_a
+            and ta.tags_parse_exception=false
+            and tb.tags_parse_exception=false
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+        <sql>
+            update tags_by_mime tbm set tags_title_b=(
+            select sum(tb.tags_title) as cnt from tags_b tb
+            join tags_a ta on tb.id=ta.id
+            join profiles_a pa on pa.id=ta.id
+            join profiles_b pb on pa.id=pb.id
+            where pb.mime_id= tbm.mime_id_b
+            and pa.mime_id=tbm.mime_id_a
+            and ta.tags_parse_exception=false
+            and tb.tags_parse_exception=false
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+        <sql>
+            update tags_by_mime tbm set tags_tr_b=(
+            select sum(tb.tags_tr) as cnt from tags_b tb
+            join tags_a ta on tb.id=ta.id
+            join profiles_a pa on pa.id=ta.id
+            join profiles_b pb on pa.id=pb.id
+            where pb.mime_id= tbm.mime_id_b
+            and pa.mime_id=tbm.mime_id_a
+            and ta.tags_parse_exception=false
+            and tb.tags_parse_exception=false
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+        <sql>
+            update tags_by_mime tbm set tags_u_b=(
+            select sum(tb.tags_u) as cnt from tags_b tb
+            join tags_a ta on tb.id=ta.id
+            join profiles_a pa on pa.id=ta.id
+            join profiles_b pb on pa.id=pb.id
+            where pb.mime_id= tbm.mime_id_b
+            and pa.mime_id=tbm.mime_id_a
+            and ta.tags_parse_exception=false
+            and tb.tags_parse_exception=false
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+        <sql>
+            update tags_by_mime tbm set tags_ul_b=(
+            select sum(tb.tags_ul) as cnt from tags_b tb
+            join tags_a ta on ta.id=tb.id
+            join profiles_a pa on pa.id=ta.id
+            join profiles_b pb on pa.id=pb.id
+            where pb.mime_id= tbm.mime_id_b
+            and pa.mime_id=tbm.mime_id_a
+            and ta.tags_parse_exception=false
+            and tb.tags_parse_exception=false
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
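+        <!-- build tag exceptions comparison table -->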
+        <sql>drop table if exists tag_exceptions_by_mime</sql>
+        <sql>create table tag_exceptions_by_mime (
+            mime_id_a integer,
+            mime_id_b integer,
+            tag_exceptions_a integer,
+            tag_exceptions_b integer)
+        </sql>
+        <sql>
+            insert into tag_exceptions_by_mime (mime_id_a, mime_id_b,
+                tag_exceptions_a, tag_exceptions_b)
+            select ma.mime_id, mb.mime_id,0,0
+            from profiles_a a
+            join profiles_b b on a.id=b.id
+            join mimes ma on ma.mime_id=a.mime_id
+            join mimes mb on mb.mime_id=b.mime_id
+            group by ma.mime_id, mb.mime_id
+        </sql>
+        <sql>
+            update tag_exceptions_by_mime tebm set tag_exceptions_a=(
+            select count(1) as cnt from tags_a ta
+            join profiles_a pa on pa.id=ta.id
+            join profiles_b pb on pa.id=pb.id
+            where pb.mime_id= tebm.mime_id_b
+            and pa.mime_id=tebm.mime_id_a
+            and ta.tags_parse_exception=true
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+        <sql>
+            update tag_exceptions_by_mime tebm set tag_exceptions_b=(
+            select count(1) as cnt from tags_b tb
+            join profiles_a pa on pa.id=tb.id
+            join profiles_b pb on pa.id=pb.id
+            where pb.mime_id= tebm.mime_id_b
+            and pa.mime_id=tebm.mime_id_a
+            and tb.tags_parse_exception=true
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
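+        <!-- build parse time comparison table -->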
+        <sql>
+            drop table if exists parse_time_compared;
+        </sql>
+        <sql>
+            create table parse_time_compared (
+            mime_id_a integer,
+            mime_id_b integer,
+            total_a bigint,
+            total_b bigint,
+            prcnt_increase double
+            );
+        </sql>
+        <sql>
+            insert into parse_time_compared (mime_id_a, mime_id_b,
+            total_a, total_b, prcnt_increase)
+            select ma.mime_id, mb.mime_id,0,0,0.0
+            from profiles_a a
+            join profiles_b b on a.id=b.id
+            join mimes ma on ma.mime_id=a.mime_id
+            join mimes mb on mb.mime_id=b.mime_id
+            group by ma.mime_id, mb.mime_id
+        </sql>
+        <sql>
+            update parse_time_compared ptc set total_a=(
+            select sum(pa.elapsed_time_millis) as total_a from profiles_a pa
+            join profiles_b pb on pa.id=pb.id
+            where pa.mime_id= ptc.mime_id_a
+            and pb.mime_id=ptc.mime_id_b
+            group by mime_id_a, mime_id_b)
+        </sql>
+        <sql>
+            update parse_time_compared ptc set total_b=(
+            select sum(pb.elapsed_time_millis) as total_b from profiles_b pb
+            join profiles_a pa on pa.id=pb.id
+            where pa.mime_id= ptc.mime_id_a
+            and pb.mime_id=ptc.mime_id_b
+            group by mime_id_a, mime_id_b)
+        </sql>
+        <sql>
+            update parse_time_compared ptc set prcnt_increase=(100.0 *
+            cast(total_b as decimal)/cast(total_a as decimal))
+            where total_a > 0;
+        </sql>
+    </before>
+
+    <!-- MIMES -->
+    <report reportName="All Mimes In A"
+            reportFilename="mimes/all_mimes_A.xlsx"
+            format="xlsx"
+            includeSql="true">
+
+        <sql>
+            select mime_string, count(1) cnt from
+            profiles_a p
+            join mimes m on m.mime_id = p.mime_id
+            group by mime_string
+            order by cnt desc
+        </sql>
+    </report>
+
+    <report reportName="All Mimes In B"
+            reportFilename="mimes/all_mimes_B.xlsx"
+            format="xlsx"
+            includeSql="true">
+
+        <sql>
+            select mime_string, count(1) cnt from
+            profiles_b p
+            join mimes m on m.mime_id = p.mime_id
+            group by mime_string
+            order by cnt desc
+        </sql>
+    </report>
+    <report reportName="Container Mimes In A"
+            reportFilename="mimes/container_mimes_A.xlsx"
+            format="xlsx"
+            includeSql="true">
+
+        <sql>
+            select mime_string, count(1) cnt from
+            profiles_a p
+            join mimes m on m.mime_id = p.mime_id
+            where is_embedded=false
+            group by mime_string
+            order by cnt desc
+        </sql>
+    </report>
+
+    <report reportName="Container Mimes In B"
+            reportFilename="mimes/container_mimes_B.xlsx"
+            format="xlsx"
+            includeSql="true">
+
+        <sql>
+            select mime_string, count(1) cnt from
+            profiles_b p
+            join mimes m on m.mime_id = p.mime_id
+            where is_embedded=false
+            group by mime_string
+            order by cnt desc
+        </sql>
+    </report>
+    <report reportName="Embedded Mimes In A"
+            reportFilename="mimes/embedded_mimes_A.xlsx"
+            format="xlsx"
+            includeSql="true">
+
+        <sql>
+            select mime_string, count(1) cnt from
+            profiles_a p
+            join mimes m on m.mime_id = p.mime_id
+            where is_embedded=true
+            group by mime_string
+            order by cnt desc
+        </sql>
+    </report>
+
+    <report reportName="Embedded Mimes In B"
+            reportFilename="mimes/embedded_mimes_B.xlsx"
+            format="xlsx"
+            includeSql="true">
+
+        <sql>
+            select mime_string, count(1) cnt from
+            profiles_b p
+            join mimes m on m.mime_id = p.mime_id
+            where is_embedded=true
+            group by mime_string
+            order by cnt desc
+        </sql>
+    </report>
+    <report reportName="Mime Differences A -> B"
+            reportFilename="mimes/mime_diffs_A_to_B.xlsx"
+            format="xlsx"
+            includeSql="true">
+
+        <sql>
+            select concat(ma.mime_string, ' -&gt; ', mb.mime_string) as
+            MIME_A_TO_MIME_B, count(1) as COUNT
+            from profiles_a a
+            join profiles_b b on a.id=b.id
+            join mimes ma on ma.mime_id=a.mime_id
+            join mimes mb on mb.mime_id=b.mime_id
+            where a.mime_id &lt;&gt; b.mime_id
+            group by MIME_A_TO_MIME_B
+            order by COUNT DESC
+        </sql>
+    </report>
+
+    <report reportName="Mime Differences A -> B Details"
+            reportFilename="mimes/mime_diffs_A_to_B_details.xlsx"
+            format="xlsx"
+            includeSql="true">
+
+        <sql>
+            select concat(ma.mime_string, ' -&gt; ', mb.mime_string) as
+            MIME_A_TO_MIME_B,
+            file_path,
+            c.length as CONTAINER_LENGTH,
+            a.file_name
+            from profiles_a a
+            join profiles_b b on a.id=b.id
+            join mimes ma on ma.mime_id=a.mime_id
+            join mimes mb on mb.mime_id=b.mime_id
+            join containers c on a.container_id=c.container_id
+            where a.mime_id &lt;&gt; b.mime_id
+            order by MIME_A_TO_MIME_B
+        </sql>
+    </report>
+
+
+    <!-- Exceptions -->
+    <report reportName="AllExceptionsByMimeA"
+            reportFilename="exceptions/exceptions_by_mime_A.xlsx"
+            format="xlsx"
+            includeSql="true">
+
+        <sql>
+            select mime_string, count(1) cnt from
+            exceptions_a e
+            join profiles_a p on p.id=e.id
+            join mimes m on m.mime_id = p.mime_id
+            group by mime_string
+            order by cnt desc
+        </sql>
+    </report>
+    <report reportName="AllExceptionsByMimeB"
+            reportFilename="exceptions/exceptions_by_mime_B.xlsx"
+            format="xlsx"
+            includeSql="true">
+
+        <sql>
+            select mime_string, count(1) cnt from
+            exceptions_b e
+            join profiles_b p on p.id=e.id
+            join mimes m on m.mime_id = p.mime_id
+            group by mime_string
+            order by cnt desc
+        </sql>
+    </report>
+
+    <report reportName="ContainerExceptionsByMimeA"
+            reportFilename="exceptions/container_exceptions_by_mime_A.xlsx"
+            format="xlsx"
+            includeSql="true">
+
+        <sql>
+            select mime_string, count(1) cnt from
+            exceptions_a e
+            join profiles_a p on p.id=e.id
+            join mimes m on m.mime_id = p.mime_id
+            where is_embedded=false
+            and parse_exception_id=0
+            group by mime_string
+            order by cnt desc
+        </sql>
+    </report>
+
+    <report reportName="ContainerExceptionsByMimeB"
+            reportFilename="exceptions/container_exceptions_by_mime_B.xlsx"
+            format="xlsx"
+            includeSql="true">
+
+        <sql>
+            select mime_string, count(1) cnt from
+            exceptions_b e
+            join profiles_b p on p.id=e.id
+            join mimes m on m.mime_id = p.mime_id
+            where is_embedded=false
+            and parse_exception_id=0
+            group by mime_string
+            order by cnt desc
+        </sql>
+    </report>
+    <report reportName="AllExceptionsByMimeByTypeA"
+            reportFilename="exceptions/exceptions_by_mime_by_type_A.xlsx"
+            format="xlsx"
+            includeSql="true">
+
+        <sql>
+            select mime_string as MIME_TYPE,
+            parse_exception_description as EXCEPTION_TYPE, count(1) as COUNT
+            from exceptions_a e
+            join profiles_a p on p.id=e.id
+            join containers c on p.container_id=c.container_id
+            join mimes m on m.mime_id=p.mime_id
+            join ref_parse_exception_types r on
+            r.parse_exception_id=e.parse_exception_id
+            group by p.mime_id, parse_exception_description
+            order by MIME_TYPE, EXCEPTION_TYPE
+        </sql>
+    </report>
+
+    <report reportName="AllExceptionsByMimeByTypeB"
+            reportFilename="exceptions/exceptions_by_mime_by_type_B.xlsx"
+            format="xlsx"
+            includeSql="true">
+
+        <sql>
+            select mime_string as MIME_TYPE,
+            parse_exception_description as EXCEPTION_TYPE, count(1) as COUNT
+            from exceptions_b e
+            join profiles_b p on p.id=e.id
+            join containers c on p.container_id=c.container_id
+            join mimes m on m.mime_id=p.mime_id
+            join ref_parse_exception_types r on
+            r.parse_exception_id=e.parse_exception_id
+            group by p.mime_id, parse_exception_description
+            order by MIME_TYPE, EXCEPTION_TYPE
+        </sql>
+    </report>
+
+    <report reportName="TextLostFromACausedByNewExceptionsInB"
+            reportFilename="exceptions/text_lost_from_A_caused_by_new_exceptions_in_B.xlsx"
+            format="xlsx"
+            includeSql="true">
+
+        <sql>
+            select file_path as FILE_PATH,
+            c.length as CONTAINER_LENGTH,
+            ca.NUM_TOKENS as NUM_TOKENS_A,
+            cb.NUM_TOKENS as NUM_TOKENS_B,
+            ca.NUM_UNIQUE_TOKENS as NUM_UNIQUE_TOKENS_A, cb.NUM_UNIQUE_TOKENS
+            as NUM_UNIQUE_TOKENS_B,
+            ca.common_tokens_lang as COMMON_TOKENS_LANG_A,
+            ca.num_common_tokens as NUM_COMMON_TOKENS_A,
+            cb.common_tokens_lang as COMMON_TOKENS_LANG_B,
+            cb.num_common_tokens as NUM_COMMON_TOKENS_B,
+            ca.top_n_tokens as TOP_N_TOKENS_A, cb.top_n_tokens as TOP_N_TOKENS_B,
+            eb.ORIG_STACK_TRACE as ORIG_STACK_TRACE_B
+            from contents_a ca
+            join profiles_a pa on ca.id = pa.id
+            join containers c on pa.container_id=c.container_id
+            left join contents_b cb on ca.id=cb.id
+            left join exceptions_b eb on ca.id = eb.id
+            left join exceptions_a ea on ca.id = ea.id
+            where eb.orig_stack_trace is not null
+            and ea.orig_stack_trace is null
+            order by ca.num_common_tokens - ifnull(cb.num_common_tokens,0) desc
+        </sql>
+    </report>
+
+    <report reportName="FixedExceptionsInBByMimeType"
+            reportFilename="exceptions/fixed_exceptions_in_B_by_mime.xlsx"
+            format="xlsx"
+            includeSql="true">
+
+        <sql>
+            select
+            ma.mime_string as MIME_TYPE_A,
+            mb.mime_string as MIME_TYPE_B,
+            count(1) as COUNT
+            from exceptions_a ea
+            left join exceptions_b eb on ea.id = eb.id
+            join profiles_a pa on pa.id=ea.id
+            join profiles_b pb on pa.id=pb.id
+            join containers c on pa.container_id=c.container_id
+            join mimes ma on ma.mime_id=pa.mime_id
+            join mimes mb on mb.mime_id=pb.mime_id
+            where eb.id is null
+            and ea.parse_exception_id=0
+            group by mime_type_a, mime_type_b
+        </sql>
+    </report>
+
+    <report reportName="FixedExceptionsInByDetails"
+            reportFilename="exceptions/fixed_exceptions_in_B_details.xlsx"
+            format="xlsx"
+            includeSql="true">
+        <sql>
+            select
+            file_path,
+            c.length as CONTAINER_LENGTH,
+            ma.mime_string as MIME_TYPE_A,
+            mb.mime_string as MIME_TYPE_B,
+            pa.file_name, pa.is_embedded
+            from exceptions_a ea
+            left join exceptions_b eb on ea.id = eb.id
+            join profiles_a pa on pa.id=ea.id
+            join profiles_b pb on pb.id=pa.id -- this ensures that files were actually processed in both runs
+            join containers c on pa.container_id=c.container_id
+            join mimes ma on ma.mime_id=pa.mime_id
+            join mimes mb on mb.mime_id=pb.mime_id
+            where eb.id is null
+            and ea.parse_exception_id=0
+            order by mime_type_a, mime_type_b
+        </sql>
+    </report>
+    <report reportName="ContentsOfFixedExceptionsInB"
+            reportFilename="exceptions/contents_of_fixed_exceptions_in_B.xlsx"
+            format="xlsx"
+            includeSql="true">
+
+        <sql>
+            select file_path,
+            c.length as CONTAINER_LENGTH,
+            ma.mime_string as MIME_TYPE_A,
+            mb.mime_string as MIME_TYPE_B,
+            CONTENT_LENGTH,
+            NUM_TOKENS, NUM_UNIQUE_TOKENS,
+            TOP_N_TOKENS, LANG_ID_1,TOKEN_LENGTH_MEAN, TOKEN_LENGTH_STD_DEV
+            from exceptions_a ea
+            left join exceptions_b eb on ea.id = eb.id
+            join profiles_a pa on pa.id=ea.id
+            join profiles_b pb on pa.id=pb.id
+            join contents_b cb on cb.id=ea.id
+            join containers c on pa.container_id=c.container_id
+            join mimes ma on ma.mime_id=pa.mime_id
+            join mimes mb on mb.mime_id=pb.mime_id
+            where eb.id is null
+            and ea.parse_exception_id=0
+        </sql>
+    </report>
+
+    <report reportName="NewExceptionsByMimeType"
+            reportFilename="exceptions/new_exceptions_in_B_by_mime.xlsx"
+            format="xlsx"
+            includeSql="true">
+
+        <sql>
+            select ma.mime_string as MIME_TYPE_A, mb.mime_string as MIME_TYPE_B, count(1) as COUNT
+            from exceptions_b eb
+            left join exceptions_a ea on ea.id = eb.id
+            join profiles_a pa on pa.id=eb.id
+            join profiles_b pb on pb.id=pa.id
+            join containers c on pa.container_id=c.container_id
+            join mimes ma on ma.mime_id=pa.mime_id
+            join mimes mb on mb.mime_id=pb.mime_id
+            where ea.id is null
+            and eb.parse_exception_id=0
+            group by ma.mime_string, mb.mime_string
+            order by COUNT desc
+        </sql>
+    </report>
+
+    <report reportName="NewExceptionsInBByMimeTypeByStackTrace"
+            reportFilename="exceptions/new_exceptions_in_B_by_mime_by_stack_trace.xlsx"
+            format="xlsx"
+            includeSql="true">
... 15475 lines suppressed ...