Posted to commits@tika.apache.org by ta...@apache.org on 2020/11/30 19:45:18 UTC

[tika] branch main updated: TIKA-3240 -- break tika-eval into tika-eval-core and tika-eval-app for Tika 2.0.0

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new 219338f  TIKA-3240 -- break tika-eval into tika-eval-core and tika-eval-app for Tika 2.0.0
219338f is described below

commit 219338f0925ef55a4748906e14a757a05b94ce35
Author: tallison <ta...@apache.org>
AuthorDate: Mon Nov 30 14:44:46 2020 -0500

    TIKA-3240 -- break tika-eval into tika-eval-core and tika-eval-app for Tika 2.0.0
---
 tika-eval/pom.xml                                  |  218 +--
 .../org/apache/tika/eval/AbstractProfiler.java     |  825 ---------
 .../java/org/apache/tika/eval/EvalFilePaths.java   |  118 --
 .../java/org/apache/tika/eval/ExtractComparer.java |  538 ------
 .../java/org/apache/tika/eval/ExtractProfiler.java |  281 ----
 .../java/org/apache/tika/eval/FileProfiler.java    |  195 ---
 .../java/org/apache/tika/eval/TikaEvalCLI.java     |  326 ----
 .../org/apache/tika/eval/XMLErrorLogUpdater.java   |  223 ---
 .../apache/tika/eval/batch/DBConsumersManager.java |  103 --
 .../tika/eval/batch/EvalConsumerBuilder.java       |  240 ---
 .../tika/eval/batch/EvalConsumersBuilder.java      |  131 --
 .../tika/eval/batch/ExtractComparerBuilder.java    |  165 --
 .../tika/eval/batch/ExtractProfilerBuilder.java    |  122 --
 .../tika/eval/batch/FileProfilerBuilder.java       |   97 --
 .../org/apache/tika/eval/db/AbstractDBBuffer.java  |   77 -
 .../main/java/org/apache/tika/eval/db/ColInfo.java |  127 --
 .../main/java/org/apache/tika/eval/db/Cols.java    |  117 --
 .../java/org/apache/tika/eval/db/DBBuffer.java     |   54 -
 .../main/java/org/apache/tika/eval/db/H2Util.java  |   89 -
 .../java/org/apache/tika/eval/db/JDBCUtil.java     |  316 ----
 .../java/org/apache/tika/eval/db/MimeBuffer.java   |  148 --
 .../java/org/apache/tika/eval/db/TableInfo.java    |   73 -
 .../java/org/apache/tika/eval/io/DBWriter.java     |  162 --
 .../org/apache/tika/eval/io/ExtractReader.java     |  243 ---
 .../tika/eval/io/ExtractReaderException.java       |   48 -
 .../java/org/apache/tika/eval/io/IDBWriter.java    |   31 -
 .../org/apache/tika/eval/io/XMLLogMsgHandler.java  |   26 -
 .../java/org/apache/tika/eval/io/XMLLogReader.java |  115 --
 .../java/org/apache/tika/eval/reports/Report.java  |  198 ---
 .../apache/tika/eval/reports/ResultsReporter.java  |  313 ----
 .../tika/eval/reports/XLSXHREFFormatter.java       |   80 -
 .../apache/tika/eval/reports/XLSXNumFormatter.java |   54 -
 .../tika/eval/reports/XSLXCellFormatter.java       |   30 -
 .../eval/tools/BatchTopCommonTokenCounter.java     |   54 -
 .../tika/eval/tools/CommonTokenOverlapCounter.java |   69 -
 .../org/apache/tika/eval/tools/LeipzigHelper.java  |   57 -
 .../org/apache/tika/eval/tools/LeipzigSampler.java |   76 -
 .../eval/tools/SlowCompositeReaderWrapper.java     |  327 ----
 .../tika/eval/tools/TopCommonTokenCounter.java     |  401 -----
 .../org/apache/tika/eval/tools/TrainTestSplit.java |  114 --
 .../src/main/resources/comparison-reports-pg.xml   | 1753 --------------------
 .../src/main/resources/comparison-reports.xml      | 1753 --------------------
 tika-eval/src/main/resources/db.properties         |   23 -
 tika-eval/src/main/resources/profile-reports.xml   |  329 ----
 .../main/resources/tika-eval-comparison-config.xml |  102 --
 .../resources/tika-eval-file-profiler-config.xml   |   74 -
 .../main/resources/tika-eval-profiler-config.xml   |   96 --
 .../test/java/org/apache/tika/MockDBWriter.java    |   82 -
 .../org/apache/tika/eval/AnalyzerManagerTest.java  |  100 --
 .../org/apache/tika/eval/ComparerBatchTest.java    |  399 -----
 .../org/apache/tika/eval/ProfilerBatchTest.java    |  235 ---
 .../org/apache/tika/eval/SimpleComparerTest.java   |  442 -----
 .../java/org/apache/tika/eval/TikaEvalCLITest.java |  294 ----
 .../apache/tika/eval/db/AbstractBufferTest.java    |  160 --
 .../org/apache/tika/eval/io/ExtractReaderTest.java |   86 -
 .../tika/eval/io/FatalExceptionReaderTest.java     |   33 -
 .../tika/eval/reports/ResultsReporterTest.java     |   60 -
 .../tika/tools/TopCommonTokenCounterTest.java      |   96 --
 tika-eval/src/test/resources/common_tokens/en      |   28 -
 tika-eval/src/test/resources/common_tokens/es      |   30 -
 tika-eval/src/test/resources/common_tokens/zh-cn   |   28 -
 tika-eval/src/test/resources/common_tokens/zh-tw   |   28 -
 .../single-file-profiler-crawl-extract-config.xml  |   76 -
 .../single-file-profiler-crawl-input-config.xml    |   77 -
 .../test-dirs/batch-logs/batch-process-fatal.xml   |   59 -
 .../resources/test-dirs/extractsA/file1.pdf.json   |    5 -
 .../test-dirs/extractsA/file10_permahang.txt.json  |    0
 .../test-dirs/extractsA/file11_oom.txt.json        |    0
 .../test-dirs/extractsA/file12_es.txt.json         |    4 -
 .../extractsA/file13_attachANotB.doc.json          |   11 -
 .../extractsA/file14_diffAttachOrder.json          |   19 -
 .../resources/test-dirs/extractsA/file15_tags.json |   41 -
 .../test-dirs/extractsA/file16_badTags.json        |   41 -
 .../test-dirs/extractsA/file17_tagsOutOfOrder.json |   41 -
 .../test-dirs/extractsA/file2_attachANotB.doc.json |   10 -
 .../test-dirs/extractsA/file3_attachBNotA.doc.json |    4 -
 .../test-dirs/extractsA/file4_emptyB.pdf.json      |    4 -
 .../test-dirs/extractsA/file5_emptyA.pdf.json      |    0
 .../test-dirs/extractsA/file6_accessEx.pdf.json    |    1 -
 .../test-dirs/extractsA/file7_badJson.pdf.json     |    4 -
 .../test-dirs/extractsA/file8_IOEx.pdf.json        |    1 -
 .../resources/test-dirs/extractsB/file1.pdf.json   |    2 -
 .../test-dirs/extractsB/file11_oom.txt.json        |    0
 .../test-dirs/extractsB/file12_es.txt.json         |    4 -
 .../test-dirs/extractsB/file13_attachANotB.doc.txt |    1 -
 .../extractsB/file14_diffAttachOrder.json          |   19 -
 .../resources/test-dirs/extractsB/file15_tags.html |   31 -
 .../test-dirs/extractsB/file16_badTags.html        |   31 -
 .../test-dirs/extractsB/file2_attachANotB.doc.json |    4 -
 .../test-dirs/extractsB/file3_attachBNotA.doc.json |   10 -
 .../test-dirs/extractsB/file4_emptyB.pdf.json      |    0
 .../test-dirs/extractsB/file5_emptyA.pdf.json      |    4 -
 .../test-dirs/extractsB/file6_accessEx.pdf.json    |    1 -
 .../test-dirs/extractsB/file7_badJson.pdf.json     |    0
 .../test-dirs/extractsB/file8_IOEx.pdf.json        |    1 -
 .../test/resources/test-dirs/raw_input/file1.pdf   |   13 -
 .../resources/test-dirs/raw_input/file11_oom.txt   |    2 -
 .../test-dirs/raw_input/file2_attachANotB.doc      |   13 -
 .../test-dirs/raw_input/file3_attachBNotA.doc      |   13 -
 .../resources/test-dirs/raw_input/file4_emptyB.pdf |   13 -
 .../resources/test-dirs/raw_input/file5_emptyA.pdf |   13 -
 .../test-dirs/raw_input/file6_accessEx.pdf         |   13 -
 .../test-dirs/raw_input/file7_badJson.pdf          |   13 -
 .../resources/test-dirs/raw_input/file8_IOEx.pdf   |   13 -
 .../test-dirs/raw_input/file9_noextract.txt        |    1 -
 tika-eval/{ => tika-eval-app}/pom.xml              |  121 +-
 tika-eval/{ => tika-eval-core}/pom.xml             |  109 +-
 .../tika/eval/core}/langid/LanguageIDWrapper.java  |    4 +-
 .../core}/metadata/TikaEvalMetadataFilter.java     |   16 +-
 .../textstats/BasicTokenCountStatsCalculator.java  |    4 +-
 .../eval/core}/textstats/BytesRefCalculator.java   |    2 +-
 .../tika/eval/core}/textstats/CommonTokens.java    |   12 +-
 .../core}/textstats/CommonTokensBhattacharyya.java |    8 +-
 .../eval/core}/textstats/CommonTokensCosine.java   |    8 +-
 .../core}/textstats/CommonTokensHellinger.java     |    8 +-
 .../core}/textstats/CommonTokensKLDNormed.java     |    8 +-
 .../core}/textstats/CommonTokensKLDivergence.java  |    8 +-
 .../textstats/CompositeTextStatsCalculator.java    |    8 +-
 .../core}/textstats/ContentLengthCalculator.java   |    2 +-
 .../textstats/LanguageAwareTokenCountStats.java    |    4 +-
 .../core}/textstats/StringStatsCalculator.java     |    2 +-
 .../eval/core}/textstats/TextProfileSignature.java |    5 +-
 .../eval/core}/textstats/TextSha256Signature.java  |    2 +-
 .../eval/core}/textstats/TextStatsCalculator.java  |    2 +-
 .../core}/textstats/TokenCountPriorityQueue.java   |    4 +-
 .../core}/textstats/TokenCountStatsCalculator.java |    4 +-
 .../tika/eval/core}/textstats/TokenEntropy.java    |    4 +-
 .../tika/eval/core}/textstats/TokenLengths.java    |    4 +-
 .../tika/eval/core}/textstats/TopNTokens.java      |    6 +-
 .../eval/core}/textstats/UnicodeBlockCounter.java  |    5 +-
 .../core}/tokens/AlphaIdeographFilterFactory.java  |    2 +-
 .../eval/core}/tokens/AnalyzerDeserializer.java    |    2 +-
 .../tika/eval/core}/tokens/AnalyzerManager.java    |    3 +-
 .../tokens/CJKBigramAwareLengthFilterFactory.java  |    2 +-
 .../eval/core}/tokens/CommonTokenCountManager.java |    4 +-
 .../tika/eval/core}/tokens/CommonTokenResult.java  |    2 +-
 .../tika/eval/core}/tokens/ContrastStatistics.java |    2 +-
 .../apache/tika/eval/core}/tokens/LangModel.java   |    2 +-
 .../tika/eval/core}/tokens/TokenContraster.java    |    3 +-
 .../eval/core}/tokens/TokenCountPriorityQueue.java |    2 +-
 .../tika/eval/core}/tokens/TokenCounter.java       |   17 +-
 .../apache/tika/eval/core}/tokens/TokenCounts.java |    2 +-
 .../tika/eval/core}/tokens/TokenIntPair.java       |    2 +-
 .../tika/eval/core}/tokens/TokenStatistics.java    |    2 +-
 .../tokens/URLEmailNormalizingFilterFactory.java   |    2 +-
 .../tika/eval/core}/util/ContentTagParser.java     |    2 +-
 .../apache/tika/eval/core}/util/ContentTags.java   |    2 +-
 .../tika/eval/core}/util/EvalExceptionUtils.java   |    2 +-
 ....apache.lucene.analysis.util.TokenFilterFactory |    6 +-
 .../org.apache.tika.metadata.filter.MetadataFilter |    2 +-
 .../src/main/resources/common_tokens/afr           |    0
 .../src/main/resources/common_tokens/amh           |    0
 .../src/main/resources/common_tokens/ara           |    0
 .../src/main/resources/common_tokens/asm           |    0
 .../src/main/resources/common_tokens/ast           |    0
 .../src/main/resources/common_tokens/aze           |    0
 .../src/main/resources/common_tokens/azj           |    0
 .../src/main/resources/common_tokens/bak           |    0
 .../src/main/resources/common_tokens/ban           |    0
 .../src/main/resources/common_tokens/bel           |    0
 .../src/main/resources/common_tokens/ben           |    0
 .../src/main/resources/common_tokens/bos           |    0
 .../src/main/resources/common_tokens/bre           |    0
 .../src/main/resources/common_tokens/bul           |    0
 .../src/main/resources/common_tokens/cat           |    0
 .../src/main/resources/common_tokens/ceb           |    0
 .../src/main/resources/common_tokens/ces           |    0
 .../src/main/resources/common_tokens/che           |    0
 .../src/main/resources/common_tokens/ckb           |    0
 .../src/main/resources/common_tokens/cmn           |    0
 .../src/main/resources/common_tokens/cym           |    0
 .../src/main/resources/common_tokens/dan           |    0
 .../src/main/resources/common_tokens/deu           |    0
 .../src/main/resources/common_tokens/div           |    0
 .../src/main/resources/common_tokens/ekk           |    0
 .../src/main/resources/common_tokens/ell           |    0
 .../src/main/resources/common_tokens/eng           |    0
 .../src/main/resources/common_tokens/epo           |    0
 .../src/main/resources/common_tokens/est           |    0
 .../src/main/resources/common_tokens/eus           |    0
 .../src/main/resources/common_tokens/fao           |    0
 .../src/main/resources/common_tokens/fas           |    0
 .../src/main/resources/common_tokens/fin           |    0
 .../src/main/resources/common_tokens/fra           |    0
 .../src/main/resources/common_tokens/fry           |    0
 .../src/main/resources/common_tokens/gle           |    0
 .../src/main/resources/common_tokens/glg           |    0
 .../src/main/resources/common_tokens/gsw           |    0
 .../src/main/resources/common_tokens/guj           |    0
 .../src/main/resources/common_tokens/hat           |    0
 .../src/main/resources/common_tokens/heb           |    0
 .../src/main/resources/common_tokens/hin           |    0
 .../src/main/resources/common_tokens/hrv           |    0
 .../src/main/resources/common_tokens/hun           |    0
 .../src/main/resources/common_tokens/hye           |    0
 .../src/main/resources/common_tokens/ind           |    0
 .../src/main/resources/common_tokens/isl           |    0
 .../src/main/resources/common_tokens/ita           |    0
 .../src/main/resources/common_tokens/jav           |    0
 .../src/main/resources/common_tokens/jpn           |    0
 .../src/main/resources/common_tokens/kan           |    0
 .../src/main/resources/common_tokens/kat           |    0
 .../src/main/resources/common_tokens/kaz           |    0
 .../src/main/resources/common_tokens/kin           |    0
 .../src/main/resources/common_tokens/kir           |    0
 .../src/main/resources/common_tokens/kor           |    0
 .../src/main/resources/common_tokens/kur           |    0
 .../src/main/resources/common_tokens/lat           |    0
 .../src/main/resources/common_tokens/lav           |    0
 .../src/main/resources/common_tokens/lim           |    0
 .../src/main/resources/common_tokens/lit           |    0
 .../src/main/resources/common_tokens/ltz           |    0
 .../src/main/resources/common_tokens/lug           |    0
 .../src/main/resources/common_tokens/lvs           |    0
 .../src/main/resources/common_tokens/mal           |    0
 .../src/main/resources/common_tokens/mar           |    0
 .../src/main/resources/common_tokens/mhr           |    0
 .../src/main/resources/common_tokens/min           |    0
 .../src/main/resources/common_tokens/mkd           |    0
 .../src/main/resources/common_tokens/mlg           |    0
 .../src/main/resources/common_tokens/mlt           |    0
 .../src/main/resources/common_tokens/mon           |    0
 .../src/main/resources/common_tokens/mri           |    0
 .../src/main/resources/common_tokens/msa           |    0
 .../src/main/resources/common_tokens/nan           |    0
 .../src/main/resources/common_tokens/nds           |    0
 .../src/main/resources/common_tokens/nep           |    0
 .../src/main/resources/common_tokens/nld           |    0
 .../src/main/resources/common_tokens/nno           |    0
 .../src/main/resources/common_tokens/nob           |    0
 .../src/main/resources/common_tokens/oci           |    0
 .../src/main/resources/common_tokens/ori           |    0
 .../src/main/resources/common_tokens/pan           |    0
 .../src/main/resources/common_tokens/pes           |    0
 .../src/main/resources/common_tokens/plt           |    0
 .../src/main/resources/common_tokens/pnb           |    0
 .../src/main/resources/common_tokens/pol           |    0
 .../src/main/resources/common_tokens/por           |    0
 .../src/main/resources/common_tokens/pus           |    0
 .../src/main/resources/common_tokens/ron           |    0
 .../src/main/resources/common_tokens/rus           |    0
 .../src/main/resources/common_tokens/san           |    0
 .../src/main/resources/common_tokens/sin           |    0
 .../src/main/resources/common_tokens/slk           |    0
 .../src/main/resources/common_tokens/slv           |    0
 .../src/main/resources/common_tokens/snd           |    0
 .../src/main/resources/common_tokens/som           |    0
 .../src/main/resources/common_tokens/spa           |    0
 .../src/main/resources/common_tokens/sqi           |    0
 .../src/main/resources/common_tokens/srp           |    0
 .../src/main/resources/common_tokens/sun           |    0
 .../src/main/resources/common_tokens/swa           |    0
 .../src/main/resources/common_tokens/swe           |    0
 .../src/main/resources/common_tokens/tam           |    0
 .../src/main/resources/common_tokens/tat           |    0
 .../src/main/resources/common_tokens/tel           |    0
 .../src/main/resources/common_tokens/tgk           |    0
 .../src/main/resources/common_tokens/tgl           |    0
 .../src/main/resources/common_tokens/tha           |    0
 .../src/main/resources/common_tokens/tuk           |    0
 .../src/main/resources/common_tokens/tur           |    0
 .../src/main/resources/common_tokens/uig           |    0
 .../src/main/resources/common_tokens/ukr           |    0
 .../src/main/resources/common_tokens/urd           |    0
 .../src/main/resources/common_tokens/uzb           |    0
 .../src/main/resources/common_tokens/vie           |    0
 .../src/main/resources/common_tokens/vol           |    0
 .../src/main/resources/common_tokens/war           |    0
 .../src/main/resources/common_tokens/xho           |    0
 .../src/main/resources/common_tokens/yid           |    0
 .../src/main/resources/common_tokens/zul           |    0
 .../src/main/resources/log4j.properties            |    0
 .../src/main/resources/lucene-analyzers.json       |    0
 .../src/main/resources/lucene-char-mapping.txt     |    0
 .../apache/tika/eval/core}/langid/LangIdTest.java  |    4 +-
 .../core}/metadata/TikaEvalMetadataFilterTest.java |    2 +-
 .../tika/eval/core}/textstats/TextStatsTest.java   |    6 +-
 .../tika/eval/core}/tokens/LuceneTokenCounter.java |    2 +-
 .../tika/eval/core}/tokens/TokenCounterTest.java   |    3 +-
 .../tika/eval/core}/util/LanguageIdTest.java       |    4 +-
 .../apache/tika/eval/core}/util/MimeUtilTest.java  |    2 +-
 tika-example/pom.xml                               |    8 +-
 .../apache/tika/example/TextStatsFromTikaEval.java |    9 +-
 283 files changed, 152 insertions(+), 14136 deletions(-)
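
The rename paths in the summary above show where the two new modules draw the line: the langid, tokens, and textstats packages move into tika-eval-core (apparently under an org.apache.tika.eval.core.* package hierarchy), while the CLI, batch, db, io, reports, and tools code removed below goes to tika-eval-app. As a rough illustration of what that leaves in the core module, the sketch below drives the same text-stats API that the deleted AbstractProfiler uses; the class name TextStatsSketch is hypothetical, the package names are inferred from the renames, and it assumes tika-eval-core plus its language-detection dependency are on the classpath.

    // Sketch only -- not part of this commit. Package names inferred from the
    // rename paths above; constructor/method calls copied from AbstractProfiler.
    import java.util.ArrayList;
    import java.util.List;
    import java.util.Map;

    import org.apache.tika.eval.core.langid.LanguageIDWrapper;
    import org.apache.tika.eval.core.textstats.BasicTokenCountStatsCalculator;
    import org.apache.tika.eval.core.textstats.CommonTokens;
    import org.apache.tika.eval.core.textstats.CompositeTextStatsCalculator;
    import org.apache.tika.eval.core.textstats.TextStatsCalculator;
    import org.apache.tika.eval.core.textstats.TokenEntropy;
    import org.apache.tika.eval.core.tokens.AnalyzerManager;
    import org.apache.tika.eval.core.tokens.CommonTokenCountManager;
    import org.apache.tika.eval.core.tokens.TokenCounts;

    public class TextStatsSketch {
        public static void main(String[] args) throws Exception {
            // null path => load the bundled common_tokens lists from the classpath;
            // "eng" is the fallback language code.
            CommonTokenCountManager commonTokens = new CommonTokenCountManager(null, "eng");

            List<TextStatsCalculator> calculators = new ArrayList<>();
            calculators.add(new BasicTokenCountStatsCalculator());
            calculators.add(new CommonTokens(commonTokens));
            calculators.add(new TokenEntropy());

            // 200000 is the default maxTokens used by the profilers.
            AnalyzerManager analyzerManager = AnalyzerManager.newInstance(200000);
            CompositeTextStatsCalculator calc = new CompositeTextStatsCalculator(
                    calculators, analyzerManager.getGeneralAnalyzer(), new LanguageIDWrapper());

            Map<Class, Object> stats = calc.calculate("the quick brown fox jumps over the lazy dog");
            TokenCounts counts = (TokenCounts) stats.get(BasicTokenCountStatsCalculator.class);
            System.out.println("tokens: " + counts.getTotalTokens()
                    + "; entropy: " + stats.get(TokenEntropy.class));
        }
    }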

diff --git a/tika-eval/pom.xml b/tika-eval/pom.xml
index a16c9dd..172424e 100644
--- a/tika-eval/pom.xml
+++ b/tika-eval/pom.xml
@@ -22,7 +22,6 @@
 <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
     <modelVersion>4.0.0</modelVersion>
-
     <parent>
         <groupId>org.apache.tika</groupId>
         <artifactId>tika-parent</artifactId>
@@ -34,219 +33,12 @@
     <name>Apache Tika eval</name>
     <url>http://tika.apache.org/</url>
 
-    <dependencies>
-        <dependency>
-            <groupId>${project.groupId}</groupId>
-            <artifactId>tika-core</artifactId>
-            <version>${project.version}</version>
-        </dependency>
-        <dependency>
-            <groupId>${project.groupId}</groupId>
-            <artifactId>tika-batch</artifactId>
-            <version>${project.version}</version>
-        </dependency>
-        <dependency>
-            <groupId>${project.groupId}</groupId>
-            <artifactId>tika-serialization</artifactId>
-            <version>${project.version}</version>
-        </dependency>
-        <dependency>
-            <groupId>${project.groupId}</groupId>
-            <artifactId>tika-langdetect-opennlp</artifactId>
-            <version>${project.version}</version>
-        </dependency>
-        <dependency>
-            <groupId>org.apache.commons</groupId>
-            <artifactId>commons-math3</artifactId>
-            <version>${commons.math3.version}</version>
-        </dependency>
-
-        <!-- check if exclusion from oss-index check
-            is necessary on next upgrade -->
-        <dependency>
-            <groupId>com.h2database</groupId>
-            <artifactId>h2</artifactId>
-            <version>${h2.version}</version>
-        </dependency>
-        <dependency>
-            <groupId>commons-cli</groupId>
-            <artifactId>commons-cli</artifactId>
-            <version>${commons.cli.version}</version>
-        </dependency>
-
-        <dependency>
-            <groupId>org.apache.lucene</groupId>
-            <artifactId>lucene-core</artifactId>
-            <version>${lucene.version}</version>
-        </dependency>
-        <dependency>
-            <groupId>org.apache.lucene</groupId>
-            <artifactId>lucene-analyzers-common</artifactId>
-            <version>${lucene.version}</version>
-        </dependency>
-        <dependency>
-            <groupId>org.apache.lucene</groupId>
-            <artifactId>lucene-analyzers-icu</artifactId>
-            <version>${lucene.version}</version>
-        </dependency>
-        <dependency>
-            <groupId>org.apache.lucene</groupId>
-            <artifactId>lucene-memory</artifactId>
-            <version>${lucene.version}</version>
-        </dependency>
-        <dependency>
-            <groupId>org.apache.commons</groupId>
-            <artifactId>commons-lang3</artifactId>
-            <version>${commons.lang3.version}</version>
-        </dependency>
-
-        <dependency>
-            <groupId>org.apache.poi</groupId>
-            <artifactId>poi</artifactId>
-            <version>${poi.version}</version>
-        </dependency>
-        <dependency>
-            <groupId>org.apache.poi</groupId>
-            <artifactId>poi-ooxml</artifactId>
-            <version>${poi.version}</version>
-            <exclusions>
-                <exclusion>
-                    <groupId>org.apache.commons</groupId>
-                    <artifactId>commons-compress</artifactId>
-                </exclusion>
-            </exclusions>
-        </dependency>
-        <dependency>
-            <groupId>org.apache.poi</groupId>
-            <artifactId>poi-ooxml-schemas</artifactId>
-            <version>${poi.version}</version>
-        </dependency>
-        <dependency>
-            <groupId>org.apache.poi</groupId>
-            <artifactId>poi-scratchpad</artifactId>
-            <version>${poi.version}</version>
-        </dependency>
-        <dependency>
-            <groupId>org.ccil.cowan.tagsoup</groupId>
-            <artifactId>tagsoup</artifactId>
-            <version>1.2.1</version>
-        </dependency>
-        <dependency>
-            <groupId>org.apache.tika</groupId>
-            <artifactId>tika-batch</artifactId>
-            <version>${project.version}</version>
-            <type>test-jar</type>
-            <scope>test</scope>
-        </dependency>
-        <dependency>
-            <groupId>org.apache.tika</groupId>
-            <artifactId>tika-core</artifactId>
-            <version>${project.version}</version>
-            <type>test-jar</type>
-            <scope>test</scope>
-        </dependency>
+    <packaging>pom</packaging>
+    <modules>
+        <module>tika-eval-core</module>
+        <module>tika-eval-app</module>
+    </modules>
 
-        <dependency>
-            <groupId>junit</groupId>
-            <artifactId>junit</artifactId>
-            <scope>test</scope>
-        </dependency>
-    </dependencies>
 
-    <build>
-        <plugins>
-            <plugin>
-                <artifactId>maven-shade-plugin</artifactId>
-                <version>${maven.shade.version}</version>
-                <executions>
-                    <execution>
-                        <phase>package</phase>
-                        <goals>
-                            <goal>shade</goal>
-                        </goals>
-                        <configuration>
-                            <createDependencyReducedPom>
-                                false
-                            </createDependencyReducedPom>
-                            <!-- <filters> -->
-                            <transformers>
-                                <transformer
-                                        implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
-                                    <mainClass>org.apache.tika.eval.TikaEvalCLI</mainClass>
-                                </transformer>
 
-                                <transformer
-                                        implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer"/>
-                            </transformers>
-                        </configuration>
-                    </execution>
-                </executions>
-            </plugin>
-            <plugin>
-                <groupId>org.apache.felix</groupId>
-                <artifactId>maven-bundle-plugin</artifactId>
-                <version>${maven.bundle.version}</version>
-                <extensions>true</extensions>
-                <configuration>
-                    <instructions>
-                        <Bundle-DocURL>${project.url}</Bundle-DocURL>
-                        <Bundle-Activator>
-                            org.apache.tika.config.TikaActivator
-                        </Bundle-Activator>
-                        <Bundle-ActivationPolicy>lazy</Bundle-ActivationPolicy>
-                    </instructions>
-                </configuration>
-            </plugin>
-            <plugin>
-                <groupId>org.apache.rat</groupId>
-                <artifactId>apache-rat-plugin</artifactId>
-                <version>${rat.version}</version>
-                <configuration>
-                    <excludes>
-                        <exclude>src/main/resources/common_tokens/*</exclude>
-                        <exclude>src/main/resources/*.json</exclude>
-                        <exclude>src/test/resources/common_tokens/*</exclude>
-                        <exclude>src/test/resources/test-dirs/**</exclude>
-                    </excludes>
-                </configuration>
-            </plugin>
-            <plugin>
-                <groupId>org.apache.maven.plugins</groupId>
-                <artifactId>maven-jar-plugin</artifactId>
-                <configuration>
-                    <archive>
-                        <manifestEntries>
-                            <Automatic-Module-Name>org.apache.tika.eval</Automatic-Module-Name>
-                        </manifestEntries>
-                    </archive>
-                </configuration>
-                <executions>
-                    <execution>
-                        <goals>
-                            <goal>test-jar</goal>
-                        </goals>
-                    </execution>
-                </executions>
-            </plugin>
-            <plugin>
-                <artifactId>maven-failsafe-plugin</artifactId>
-                <version>${maven.failsafe.version}</version>
-                <configuration>
-                    <additionalClasspathElements>
-                        <additionalClasspathElement>
-                            ${project.build.directory}/${project.build.finalName}.jar
-                        </additionalClasspathElement>
-                    </additionalClasspathElements>
-                </configuration>
-                <executions>
-                    <execution>
-                        <goals>
-                            <goal>integration-test</goal>
-                            <goal>verify</goal>
-                        </goals>
-                    </execution>
-                </executions>
-            </plugin>
-        </plugins>
-    </build>
 </project>
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java b/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java
deleted file mode 100644
index 432543d..0000000
--- a/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java
+++ /dev/null
@@ -1,825 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.tika.eval;
-
-
-import java.io.IOException;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.nio.file.Paths;
-import java.sql.Types;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.Comparator;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Locale;
-import java.util.Map;
-import java.util.concurrent.ArrayBlockingQueue;
-import java.util.concurrent.atomic.AtomicInteger;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
-import org.apache.commons.io.FilenameUtils;
-import org.apache.commons.lang3.mutable.MutableInt;
-import org.apache.commons.lang3.tuple.Pair;
-import org.apache.commons.math3.stat.descriptive.SummaryStatistics;
-import org.apache.tika.batch.FileResource;
-import org.apache.tika.batch.FileResourceConsumer;
-import org.apache.tika.batch.fs.FSProperties;
-import org.apache.tika.config.TikaConfig;
-import org.apache.tika.eval.db.ColInfo;
-import org.apache.tika.eval.db.Cols;
-import org.apache.tika.eval.db.TableInfo;
-import org.apache.tika.eval.io.ExtractReaderException;
-import org.apache.tika.eval.io.IDBWriter;
-import org.apache.tika.eval.langid.LanguageIDWrapper;
-import org.apache.tika.eval.textstats.BasicTokenCountStatsCalculator;
-import org.apache.tika.eval.textstats.CommonTokens;
-import org.apache.tika.eval.textstats.CompositeTextStatsCalculator;
-import org.apache.tika.eval.textstats.ContentLengthCalculator;
-import org.apache.tika.eval.textstats.TextStatsCalculator;
-import org.apache.tika.eval.textstats.TokenEntropy;
-import org.apache.tika.eval.textstats.TokenLengths;
-import org.apache.tika.eval.textstats.TopNTokens;
-import org.apache.tika.eval.textstats.UnicodeBlockCounter;
-import org.apache.tika.eval.tokens.AnalyzerManager;
-import org.apache.tika.eval.tokens.CommonTokenCountManager;
-import org.apache.tika.eval.tokens.CommonTokenResult;
-import org.apache.tika.eval.tokens.TokenCounts;
-import org.apache.tika.eval.tokens.TokenIntPair;
-import org.apache.tika.eval.util.ContentTagParser;
-import org.apache.tika.eval.util.ContentTags;
-import org.apache.tika.eval.util.EvalExceptionUtils;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.language.detect.LanguageResult;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.PagedText;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler;
-import org.apache.tika.sax.RecursiveParserWrapperHandler;
-import org.apache.tika.sax.ToXMLContentHandler;
-import org.apache.tika.utils.ExceptionUtils;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-import org.xml.sax.SAXException;
-
-public abstract class AbstractProfiler extends FileResourceConsumer {
-
-    private static final Logger LOG = LoggerFactory.getLogger(AbstractProfiler.class);
-
-    private static final String[] EXTRACT_EXTENSIONS = {
-            ".json",
-            ".txt",
-            ""
-    };
-
-    private static final String[] COMPRESSION_EXTENSIONS = {
-            "",
-            ".bz2",
-            ".gzip",
-            ".zip",
-    };
-    static final long NON_EXISTENT_FILE_LENGTH = -1l;
-
-    public static TableInfo REF_EXTRACT_EXCEPTION_TYPES = new TableInfo("ref_extract_exception_types",
-            new ColInfo(Cols.EXTRACT_EXCEPTION_ID, Types.INTEGER),
-            new ColInfo(Cols.EXTRACT_EXCEPTION_DESCRIPTION, Types.VARCHAR, 128)
-    );
-
-
-    public static TableInfo REF_PARSE_ERROR_TYPES = new TableInfo("ref_parse_error_types",
-            new ColInfo(Cols.PARSE_ERROR_ID, Types.INTEGER),
-            new ColInfo(Cols.PARSE_ERROR_DESCRIPTION, Types.VARCHAR, 128)
-    );
-
-    public static TableInfo REF_PARSE_EXCEPTION_TYPES = new TableInfo("ref_parse_exception_types",
-            new ColInfo(Cols.PARSE_EXCEPTION_ID, Types.INTEGER),
-            new ColInfo(Cols.PARSE_EXCEPTION_DESCRIPTION, Types.VARCHAR, 128)
-    );
-
-    public static final String TRUE = Boolean.toString(true);
-    public static final String FALSE = Boolean.toString(false);
-    private static final String ZERO = "0";
-
-
-    protected static final AtomicInteger ID = new AtomicInteger();
-
-    private static final String UNKNOWN_EXTENSION = "unk";
-    //make this configurable
-    private static final String DIGEST_KEY = "X-TIKA:digest:MD5";
-
-    private static final Map<String, Cols> UC_TAGS_OF_INTEREST = initTags();
-
-    private static Map<String, Cols> initTags() {
-        //simplify this mess
-        Map<String, Cols> tmp = new HashMap<>();
-        tmp.put("A", Cols.TAGS_A);
-        tmp.put("B", Cols.TAGS_B);
-        tmp.put("DIV", Cols.TAGS_DIV);
-        tmp.put("I", Cols.TAGS_I);
-        tmp.put("IMG", Cols.TAGS_IMG);
-        tmp.put("LI", Cols.TAGS_LI);
-        tmp.put("OL", Cols.TAGS_OL);
-        tmp.put("P", Cols.TAGS_P);
-        tmp.put("TABLE", Cols.TAGS_TABLE);
-        tmp.put("TD", Cols.TAGS_TD);
-        tmp.put("TITLE", Cols.TAGS_TITLE);
-        tmp.put("TR", Cols.TAGS_TR);
-        tmp.put("U", Cols.TAGS_U);
-        tmp.put("UL", Cols.TAGS_UL);
-        return Collections.unmodifiableMap(tmp);
-    }
-
-    private static CommonTokenCountManager COMMON_TOKEN_COUNT_MANAGER;
-
-    private String lastExtractExtension = null;
-
-    AnalyzerManager analyzerManager;
-
-
-    public enum EXCEPTION_TYPE {
-        RUNTIME,
-        ENCRYPTION,
-        ACCESS_PERMISSION,
-        UNSUPPORTED_VERSION,
-    }
-
-    /**
-     * If information was gathered from the log file about
-     * a parse error
-     */
-    public enum PARSE_ERROR_TYPE {
-        OOM,
-        TIMEOUT
-    }
-
-    public static TableInfo MIME_TABLE = new TableInfo("mimes",
-            new ColInfo(Cols.MIME_ID, Types.INTEGER, "PRIMARY KEY"),
-            new ColInfo(Cols.MIME_STRING, Types.VARCHAR, 256),
-            new ColInfo(Cols.FILE_EXTENSION, Types.VARCHAR, 12)
-    );
-
-    private static Pattern FILE_NAME_CLEANER = Pattern.compile("\\.(json|txt)(\\.(bz2|gz|zip))?$");
-
-
-    final static int FILE_PATH_MAX_LEN = 1024;//max len for varchar for file_path
-    int maxContentLength = 10000000;
-    int maxContentLengthForLangId = 50000;
-    int maxTokens = 200000;
-
-
-
-    private final static Pattern ACCESS_PERMISSION_EXCEPTION =
-            Pattern.compile("org\\.apache\\.tika\\.exception\\.AccessPermissionException");
-    private final static Pattern ENCRYPTION_EXCEPTION =
-            Pattern.compile("org\\.apache\\.tika.exception\\.EncryptedDocumentException");
-
-    private static LanguageIDWrapper LANG_ID = new LanguageIDWrapper();
-
-    //TODO: allow configuration
-    //private TikaConfig config = TikaConfig.getDefaultConfig();
-    CompositeTextStatsCalculator compositeTextStatsCalculator;
-    protected IDBWriter writer;
-
-    /**
-     * @param p path to the common_tokens directory.  If this is null, try to load from classPath
-     * @param defaultLangCode this is the language code to use if a common_words list doesn't exist for the
-     *                        detected language; can be <code>null</code>
-     * @throws IOException
-     */
-    public static void loadCommonTokens(Path p, String defaultLangCode) throws IOException {
-        COMMON_TOKEN_COUNT_MANAGER = new CommonTokenCountManager(p, defaultLangCode);
-    }
-
-    public AbstractProfiler(ArrayBlockingQueue<FileResource> fileQueue,
-                            IDBWriter writer) {
-        super(fileQueue);
-        this.writer = writer;
-        LanguageIDWrapper.setMaxTextLength(maxContentLengthForLangId);
-        this.compositeTextStatsCalculator = initAnalyzersAndTokenCounter(maxTokens, LANG_ID);
-    }
-
-    private CompositeTextStatsCalculator initAnalyzersAndTokenCounter(int maxTokens, LanguageIDWrapper langIder) {
-        analyzerManager = AnalyzerManager.newInstance(maxTokens);
-        List<TextStatsCalculator> calculators = new ArrayList<>();
-        calculators.add(new CommonTokens(COMMON_TOKEN_COUNT_MANAGER));
-        calculators.add(new TokenEntropy());
-        calculators.add(new TokenLengths());
-        calculators.add(new TopNTokens(10));
-        calculators.add(new BasicTokenCountStatsCalculator());
-        calculators.add(new ContentLengthCalculator());
-        calculators.add(new UnicodeBlockCounter(maxContentLengthForLangId));
-
-        return new CompositeTextStatsCalculator(calculators, analyzerManager.getGeneralAnalyzer(), langIder);
-    }
-
-    /**
-     * Truncate the content string if greater than this length to this length
-     *
-     * @param maxContentLength
-     */
-    public void setMaxContentLength(int maxContentLength) {
-        this.maxContentLength = maxContentLength;
-    }
-
-    /**
-     * Truncate content string if greater than this length to this length for lang id
-     *
-     * @param maxContentLengthForLangId
-     */
-    public void setMaxContentLengthForLangId(int maxContentLengthForLangId) {
-        this.maxContentLengthForLangId = maxContentLengthForLangId;
-        LanguageIDWrapper.setMaxTextLength(maxContentLengthForLangId);
-    }
-
-    /**
-     * Add a LimitTokenCountFilterFactory if &gt; -1
-     *
-     * @param maxTokens
-     */
-    public void setMaxTokens(int maxTokens) {
-        this.maxTokens = maxTokens;
-        initAnalyzersAndTokenCounter(maxTokens, new LanguageIDWrapper());
-    }
-
-
-    protected void writeExtractException(TableInfo extractExceptionTable, String containerId,
-                                         String filePath, ExtractReaderException.TYPE type) throws IOException {
-        Map<Cols, String> data = new HashMap<>();
-        data.put(Cols.CONTAINER_ID, containerId);
-        data.put(Cols.FILE_PATH, filePath);
-        data.put(Cols.EXTRACT_EXCEPTION_ID, Integer.toString(type.ordinal()));
-        writer.writeRow(extractExceptionTable, data);
-
-    }
-
-    protected void writeProfileData(EvalFilePaths fps, int i,
-                                    ContentTags contentTags, Metadata m,
-                                    String fileId, String containerId,
-                                    List<Integer> numAttachments, TableInfo profileTable) {
-
-        Map<Cols, String> data = new HashMap<>();
-        data.put(Cols.ID, fileId);
-        data.put(Cols.CONTAINER_ID, containerId);
-        data.put(Cols.MD5, m.get(DIGEST_KEY));
-
-        if (i < numAttachments.size()) {
-            data.put(Cols.NUM_ATTACHMENTS, Integer.toString(numAttachments.get(i)));
-        }
-        data.put(Cols.ELAPSED_TIME_MILLIS, getTime(m));
-        data.put(Cols.NUM_METADATA_VALUES,
-                Integer.toString(countMetadataValues(m)));
-
-        Integer nPages = m.getInt(PagedText.N_PAGES);
-        if (nPages != null) {
-            data.put(Cols.NUM_PAGES, Integer.toString(nPages));
-        }
-
-        //if the outer wrapper document
-        if (i == 0) {
-            data.put(Cols.IS_EMBEDDED, FALSE);
-            data.put(Cols.FILE_NAME, fps.getRelativeSourceFilePath().getFileName().toString());
-        } else {
-            data.put(Cols.IS_EMBEDDED, TRUE);
-            data.put(Cols.FILE_NAME, getFileName(m.get(AbstractRecursiveParserWrapperHandler.EMBEDDED_RESOURCE_PATH)));
-        }
-        String ext = FilenameUtils.getExtension(data.get(Cols.FILE_NAME));
-        ext = (ext == null) ? "" : ext.toLowerCase(Locale.US);
-        data.put(Cols.FILE_EXTENSION, ext);
-        long srcFileLen = getSourceFileLength(m);
-        if (srcFileLen > NON_EXISTENT_FILE_LENGTH) {
-            data.put(Cols.LENGTH, Long.toString(srcFileLen));
-        } else {
-            data.put(Cols.LENGTH, "");
-        }
-        int numMetadataValues = countMetadataValues(m);
-        data.put(Cols.NUM_METADATA_VALUES,
-                Integer.toString(numMetadataValues));
-
-        data.put(Cols.ELAPSED_TIME_MILLIS,
-                getTime(m));
-
-        String content = contentTags.getContent();
-        if (content == null || content.trim().length() == 0) {
-            data.put(Cols.HAS_CONTENT, FALSE);
-        } else {
-            data.put(Cols.HAS_CONTENT, TRUE);
-        }
-        getFileTypes(m, data);
-        try {
-            writer.writeRow(profileTable, data);
-        } catch (IOException e) {
-            throw new RuntimeException(e);
-        }
-    }
-
-    private static String getFileName(String path) {
-        if (path == null) {
-            return "";
-        }
-        //filenameUtils checks for a null byte in the path.
-        //it will throw an IllegalArgumentException if there is a null byte.
-        //given that we're recording names and not using them on a file path
-        //we should ignore this.
-        try {
-            return FilenameUtils.getName(path);
-        } catch (IllegalArgumentException e) {
-            LOG.warn("{} in {}", e.getMessage(), path);
-        }
-        path = path.replaceAll("\u0000", " ");
-        try {
-            return FilenameUtils.getName(path);
-        } catch (IllegalArgumentException e) {
-            LOG.warn("Again: {} in {}", e.getMessage(), path);
-        }
-        //give up
-        return "";
-    }
-
-    protected void writeExceptionData(String fileId, Metadata m, TableInfo exceptionTable) {
-        Map<Cols, String> data = new HashMap<>();
-        getExceptionStrings(m, data);
-        if (data.keySet().size() > 0) {
-            try {
-                data.put(Cols.ID, fileId);
-                writer.writeRow(exceptionTable, data);
-            } catch (IOException e) {
-                throw new RuntimeException(e);
-            }
-        }
-    }
-
-    protected Map<Class, Object> calcTextStats(ContentTags contentTags) {
-/*        if (contentTags == ContentTags.EMPTY_CONTENT_TAGS) {
-            return Collections.EMPTY_MAP;
-        }*/
-        Map<Cols, String> data = new HashMap<>();
-        String content = truncateContent(contentTags, maxContentLength, data);
-        if (content == null || content.trim().length() == 0) {
-            content = "";
-        }
-        return compositeTextStatsCalculator.calculate(content);
-    }
-
-    /**
-     * Checks to see if metadata is null or content is empty (null or only whitespace).
-     * If any of these, then this does no processing, and the fileId is not
-     * entered into the content table.
-     *
-     * @param fileId
-     * @param textStats
-     * @param contentsTable
-     */
-    protected void writeContentData(String fileId, Map<Class, Object> textStats, TableInfo contentsTable) throws IOException {
-        Map<Cols, String> data = new HashMap<>();
-        data.put(Cols.ID, fileId);
-        if (textStats.containsKey(ContentLengthCalculator.class)) {
-            int length = (int)textStats.get(ContentLengthCalculator.class);
-            if (length == 0) {
-                return;
-            }
-            data.put(Cols.CONTENT_LENGTH, Integer.toString(length));
-        }
-        langid(textStats, data);
-
-        writeTokenCounts(textStats, data);
-        CommonTokenResult commonTokenResult = (CommonTokenResult) textStats.get(CommonTokens.class);
-        if (commonTokenResult != null) {
-            data.put(Cols.COMMON_TOKENS_LANG, commonTokenResult.getLangCode());
-            data.put(Cols.NUM_UNIQUE_COMMON_TOKENS, Integer.toString(commonTokenResult.getUniqueCommonTokens()));
-            data.put(Cols.NUM_COMMON_TOKENS, Integer.toString(commonTokenResult.getCommonTokens()));
-            data.put(Cols.NUM_UNIQUE_ALPHABETIC_TOKENS,
-                    Integer.toString(commonTokenResult.getUniqueAlphabeticTokens()));
-            data.put(Cols.NUM_ALPHABETIC_TOKENS,
-                    Integer.toString(commonTokenResult.getAlphabeticTokens()));
-        }
-        TokenCounts tokenCounts = (TokenCounts) textStats.get(BasicTokenCountStatsCalculator.class);
-        if (tokenCounts != null) {
-
-            data.put(Cols.NUM_UNIQUE_TOKENS,
-                    Integer.toString(tokenCounts.getTotalUniqueTokens()));
-            data.put(Cols.NUM_TOKENS,
-                    Integer.toString(tokenCounts.getTotalTokens()));
-        }
-        if (textStats.get(TokenEntropy.class) != null) {
-            data.put(Cols.TOKEN_ENTROPY_RATE,
-                    Double.toString((Double) textStats.get(TokenEntropy.class)));
-        }
-
-        SummaryStatistics summStats = (SummaryStatistics) textStats.get(TokenLengths.class);
-        if (summStats != null) {
-            data.put(Cols.TOKEN_LENGTH_SUM,
-                    Integer.toString((int) summStats.getSum()));
-
-            data.put(Cols.TOKEN_LENGTH_MEAN,
-                    Double.toString(summStats.getMean()));
-
-            data.put(Cols.TOKEN_LENGTH_STD_DEV,
-                    Double.toString(summStats.getStandardDeviation()));
-        }
-        unicodeBlocks(textStats, data);
-        try {
-            writer.writeRow(contentsTable, data);
-        } catch (IOException e) {
-            throw new RuntimeException(e);
-        }
-    }
-
-    void writeTagData(String fileId, ContentTags contentTags, TableInfo tagsTable) {
-        Map<String, Integer> tags = contentTags.getTags();
-        if (tags.size() == 0 && contentTags.getParseException() == false) {
-            return;
-        }
-        Map<Cols, String> data = new HashMap<>();
-        data.put(Cols.ID, fileId);
-
-        for (Map.Entry<String, Cols> e : UC_TAGS_OF_INTEREST.entrySet()) {
-            Integer count = tags.get(e.getKey());
-            if (count == null) {
-                data.put(e.getValue(), ZERO);
-            } else {
-                data.put(e.getValue(), Integer.toString(count));
-            }
-        }
-
-        if (contentTags.getParseException()) {
-            data.put(Cols.TAGS_PARSE_EXCEPTION, TRUE);
-        } else {
-            data.put(Cols.TAGS_PARSE_EXCEPTION, FALSE);
-        }
-        try {
-            writer.writeRow(tagsTable, data);
-        } catch (IOException e) {
-            throw new RuntimeException(e);
-        }
-    }
-
-
-    String getTime(Metadata m) {
-        String elapsed = "-1";
-
-        String v = m.get(AbstractRecursiveParserWrapperHandler.PARSE_TIME_MILLIS);
-        if (v != null) {
-            return v;
-        }
-        return elapsed;
-    }
-
-    int countMetadataValues(Metadata m) {
-        if (m == null) {
-            return 0;
-        }
-        int i = 0;
-        for (String n : m.names()) {
-            i += m.getValues(n).length;
-        }
-        return i;
-    }
-
-    void getExceptionStrings(Metadata metadata, Map<Cols, String> data) {
-
-        String fullTrace = metadata.get(RecursiveParserWrapperHandler.CONTAINER_EXCEPTION);
-
-        if (fullTrace == null) {
-            fullTrace = metadata.get(AbstractRecursiveParserWrapperHandler.EMBEDDED_EXCEPTION);
-        }
-
-        if (fullTrace != null) {
-            //check for "expected" exceptions...exceptions
-            //that can't be fixed.
-            //Do not store trace for "expected" exceptions
-
-            Matcher matcher = ACCESS_PERMISSION_EXCEPTION.matcher(fullTrace);
-            if (matcher.find()) {
-                data.put(Cols.PARSE_EXCEPTION_ID,
-                        Integer.toString(EXCEPTION_TYPE.ACCESS_PERMISSION.ordinal()));
-                return;
-            }
-            matcher = ENCRYPTION_EXCEPTION.matcher(fullTrace);
-            if (matcher.find()) {
-                data.put(Cols.PARSE_EXCEPTION_ID,
-                        Integer.toString(EXCEPTION_TYPE.ENCRYPTION.ordinal()));
-                return;
-            }
-
-            data.put(Cols.PARSE_EXCEPTION_ID,
-                    Integer.toString(EXCEPTION_TYPE.RUNTIME.ordinal()));
-
-            data.put(Cols.ORIG_STACK_TRACE, fullTrace);
-            //TikaExceptions can have object ids, as in the "@2b1ea6ee" in:
-            //org.apache.tika.exception.TikaException: TIKA-198: Illegal
-            //IOException from org.apache.tika.parser.microsoft.OfficeParser@2b1ea6ee
-            //For reporting purposes, let's snip off the object id so that we can more
-            //easily count exceptions.
-            String sortTrace = EvalExceptionUtils.normalize(fullTrace);
-            data.put(Cols.SORT_STACK_TRACE, sortTrace);
-        }
-    }
-
-    /**
-     * Get the content and record in the data {@link Cols#CONTENT_TRUNCATED_AT_MAX_LEN} whether the string was truncated
-     *
-     * @param contentTags
-     * @param maxLength
-     * @param data
-     * @return
-     */
-    protected static String truncateContent(ContentTags contentTags, int maxLength, Map<Cols, String> data) {
-        data.put(Cols.CONTENT_TRUNCATED_AT_MAX_LEN, "FALSE");
-        if (contentTags == null) {
-            return "";
-        }
-        String c = contentTags.getContent();
-        if (maxLength > -1 && c.length() > maxLength) {
-            c = c.substring(0, maxLength);
-            data.put(Cols.CONTENT_TRUNCATED_AT_MAX_LEN, "TRUE");
-        }
-        return c;
-
-    }
-
-    protected static ContentTags getContent(EvalFilePaths evalFilePaths, Metadata metadata) {
-        if (metadata == null) {
-            return ContentTags.EMPTY_CONTENT_TAGS;
-        }
-        return parseContentAndTags(evalFilePaths, metadata);
-    }
-
-    void unicodeBlocks(Map<Class, Object> tokenStats, Map<Cols, String> data) {
-
-        Map<String, MutableInt> blocks = (Map<String, MutableInt>) tokenStats.get(UnicodeBlockCounter.class);
-        List<Pair<String, Integer>> pairs = new ArrayList<>();
-        for (Map.Entry<String, MutableInt> e : blocks.entrySet()) {
-            pairs.add(Pair.of(e.getKey(), e.getValue().intValue()));
-        }
-        pairs.sort((o1, o2) -> o2.getValue().compareTo(o1.getValue()));
-        StringBuilder sb = new StringBuilder();
-
-        for (int i = 0; i < 20 && i < pairs.size(); i++) {
-            if (i > 0) {
-                sb.append(" | ");
-            }
-            sb.append(pairs.get(i).getKey() + ": " + pairs.get(i).getValue());
-        }
-        data.put(Cols.UNICODE_CHAR_BLOCKS, sb.toString());
-    }
-
-    void langid(Map<Class, Object> stats, Map<Cols, String> data) {
-        List<LanguageResult> probabilities = (List<LanguageResult>) stats.get(LanguageIDWrapper.class);
-
-        if (probabilities.size() > 0) {
-            data.put(Cols.LANG_ID_1, probabilities.get(0).getLanguage());
-            data.put(Cols.LANG_ID_PROB_1,
-                    Double.toString(probabilities.get(0).getRawScore()));
-        }
-        if (probabilities.size() > 1) {
-            data.put(Cols.LANG_ID_2, probabilities.get(1).getLanguage());
-            data.put(Cols.LANG_ID_PROB_2,
-                    Double.toString(probabilities.get(1).getRawScore()));
-        }
-    }
-
-    void getFileTypes(Metadata metadata, Map<Cols, String> output) {
-        if (metadata == null) {
-            return;
-        }
-        String type = metadata.get(Metadata.CONTENT_TYPE);
-        if (type == null) {
-            return;
-        }
-        int mimeId = writer.getMimeId(type);
-        output.put(Cols.MIME_ID, Integer.toString(mimeId));
-    }
-
-    void writeTokenCounts(Map<Class, Object> textStats, Map<Cols, String> data) {
-        TokenIntPair[] tokenIntPairs = (TokenIntPair[]) textStats.get(TopNTokens.class);
-        int i = 0;
-        StringBuilder sb = new StringBuilder();
-        for (TokenIntPair t : tokenIntPairs) {
-            if (i++ > 0) {
-                sb.append(" | ");
-            }
-            sb.append(t.getToken() + ": " + t.getValue());
-        }
-
-        data.put(Cols.TOP_N_TOKENS, sb.toString());
-    }
-
-
-    public void closeWriter() throws IOException {
-        writer.close();
-    }
-
-
-    /**
-     * @param metadata
-     * @param extracts
-     * @return evalfilepaths for files if crawling an extract directory
-     */
-    protected EvalFilePaths getPathsFromExtractCrawl(Metadata metadata,
-                                                     Path extracts) {
-        String relExtractFilePath = metadata.get(FSProperties.FS_REL_PATH);
-        Matcher m = FILE_NAME_CLEANER.matcher(relExtractFilePath);
-        Path relativeSourceFilePath = Paths.get(m.replaceAll(""));
-        //just try slapping the relextractfilepath on the extractdir
-        Path extractFile = extracts.resolve(relExtractFilePath);
-        if (!Files.isRegularFile(extractFile)) {
-            //if that doesn't work, try to find the right extract file.
-            //This is necessary if crawling extractsA and trying to find a file in
-            //extractsB that is not in the same format: json vs txt or compressed
-            extractFile = findFile(extracts, relativeSourceFilePath);
-        }
-        return new EvalFilePaths(relativeSourceFilePath, extractFile);
-    }
-
-    //call this if the crawler is crawling through the src directory
-    protected EvalFilePaths getPathsFromSrcCrawl(Metadata metadata, Path srcDir,
-                                                 Path extracts) {
-        Path relativeSourceFilePath = Paths.get(metadata.get(FSProperties.FS_REL_PATH));
-        Path extractFile = findFile(extracts, relativeSourceFilePath);
-        Path inputFile = srcDir.resolve(relativeSourceFilePath);
-        long srcLen = -1l;
-        //try to get the length of the source file in case there was an error
-        //in both extracts
-        try {
-            srcLen = Files.size(inputFile);
-        } catch (IOException e) {
-            LOG.warn("Couldn't get length for: {}", inputFile.toAbsolutePath());
-        }
-        return new EvalFilePaths(relativeSourceFilePath, extractFile, srcLen);
-    }
-
-    /**
-     * @param extractRootDir
-     * @param relativeSourceFilePath
-     * @return extractFile or null if couldn't find one.
-     */
-    private Path findFile(Path extractRootDir, Path relativeSourceFilePath) {
-        String relSrcFilePathString = relativeSourceFilePath.toString();
-        if (lastExtractExtension != null) {
-            Path candidate = extractRootDir.resolve(relSrcFilePathString + lastExtractExtension);
-            if (Files.isRegularFile(candidate)) {
-                return candidate;
-            }
-        }
-        for (String ext : EXTRACT_EXTENSIONS) {
-            for (String compress : COMPRESSION_EXTENSIONS) {
-                Path candidate = extractRootDir.resolve(relSrcFilePathString + ext + compress);
-                if (Files.isRegularFile(candidate)) {
-                    lastExtractExtension = ext + compress;
-                    return candidate;
-                }
-            }
-        }
-        return null;
-    }
-
-    protected long getSourceFileLength(EvalFilePaths fps, List<Metadata> metadataList) {
-        if (fps.getSourceFileLength() > NON_EXISTENT_FILE_LENGTH) {
-            return fps.getSourceFileLength();
-        }
-        return getSourceFileLength(metadataList);
-    }
-
-    long getSourceFileLength(List<Metadata> metadataList) {
-        if (metadataList == null || metadataList.size() < 1) {
-            return NON_EXISTENT_FILE_LENGTH;
-        }
-        return getSourceFileLength(metadataList.get(0));
-    }
-
-    long getSourceFileLength(Metadata m) {
-        String lenString = m.get(Metadata.CONTENT_LENGTH);
-        if (lenString == null) {
-            return NON_EXISTENT_FILE_LENGTH;
-        }
-        try {
-            return Long.parseLong(lenString);
-        } catch (NumberFormatException e) {
-            //swallow
-        }
-        return NON_EXISTENT_FILE_LENGTH;
-    }
-
-    protected long getFileLength(Path p) {
-        if (p != null && Files.isRegularFile(p)) {
-            try {
-                return Files.size(p);
-            } catch (IOException e) {
-                //swallow
-            }
-        }
-        return NON_EXISTENT_FILE_LENGTH;
-    }
-
-    /**
-     * @param list
-     * @return empty list if input list is empty or null
-     */
-    static List<Integer> countAttachments(List<Metadata> list) {
-        List<Integer> ret = new ArrayList<>();
-        if (list == null || list.size() == 0) {
-            return ret;
-        }
-        //container document attachment count = list.size()-1
-        ret.add(list.size() - 1);
-
-        Map<String, Integer> counts = new HashMap<>();
-        for (int i = 1; i < list.size(); i++) {
-            String path = list.get(i).get(AbstractRecursiveParserWrapperHandler.EMBEDDED_RESOURCE_PATH);
-            if (path == null) {
-                //shouldn't ever happen
-                continue;
-            }
-            String[] parts = path.split("/");
-            StringBuilder parent = new StringBuilder();
-            for (int end = 1; end < parts.length - 1; end++) {
-                parent.setLength(0);
-                join("/", parent, parts, 1, end);
-                String parentPath = parent.toString();
-                Integer count = counts.get(parentPath);
-                if (count == null) {
-                    count = 1;
-                } else {
-                    count++;
-                }
-                counts.put(parentPath, count);
-            }
-        }
-
-        for (int i = 1; i < list.size(); i++) {
-            Integer count = counts.get(list.get(i).get(AbstractRecursiveParserWrapperHandler.EMBEDDED_RESOURCE_PATH));
-            if (count == null) {
-                count = 0;
-            }
-            ret.add(i, count);
-        }
-        return ret;
-
-
-    }
-
-    private static void join(String delimiter, StringBuilder sb, String[] parts, int start, int end) {
-        for (int i = start; i <= end; i++) {
-            sb.append(delimiter);
-            sb.append(parts[i]);
-        }
-    }
-
-    private static ContentTags parseContentAndTags(EvalFilePaths evalFilePaths, Metadata metadata) {
-        String s = metadata.get(RecursiveParserWrapperHandler.TIKA_CONTENT);
-        if (s == null || s.length() == 0) {
-            return ContentTags.EMPTY_CONTENT_TAGS;
-        }
-
-        String handlerClass = metadata.get(RecursiveParserWrapperHandler.TIKA_CONTENT_HANDLER);
-        if (evalFilePaths.getExtractFile().getFileName().toString().toLowerCase(Locale.ENGLISH).endsWith(".html")) {
-            try {
-                return ContentTagParser.parseHTML(s, UC_TAGS_OF_INTEREST.keySet());
-            } catch (IOException | SAXException e) {
-                LOG.warn("Problem parsing html in {}; backing off to treat string as text",
-                        evalFilePaths.getExtractFile().toAbsolutePath().toString(), e);
-
-                return new ContentTags(s, true);
-            }
-        } else if (
-                evalFilePaths.getExtractFile().getFileName().toString().toLowerCase(Locale.ENGLISH).endsWith(".xhtml") ||
-                        (handlerClass != null && handlerClass.equals(ToXMLContentHandler.class.getSimpleName()))) {
-            try {
-                return ContentTagParser.parseXML(s, UC_TAGS_OF_INTEREST.keySet());
-            } catch (TikaException | IOException | SAXException e) {
-                LOG.warn("Problem parsing xhtml in {}; backing off to html parser",
-                        evalFilePaths.getExtractFile().toAbsolutePath().toString(), e);
-                try {
-                    ContentTags contentTags = ContentTagParser.parseHTML(s, UC_TAGS_OF_INTEREST.keySet());
-                    contentTags.setParseException(true);
-                    return contentTags;
-                } catch (IOException | SAXException e2) {
-                    LOG.warn("Problem parsing html in {}; backing off to treat string as text",
-                            evalFilePaths.getExtractFile().toAbsolutePath().toString(), e2);
-                }
-                return new ContentTags(s, true);
-            }
-        }
-        return new ContentTags(s);
-    }
-
-
-}
-
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/EvalFilePaths.java b/tika-eval/src/main/java/org/apache/tika/eval/EvalFilePaths.java
deleted file mode 100644
index a760b86..0000000
--- a/tika-eval/src/main/java/org/apache/tika/eval/EvalFilePaths.java
+++ /dev/null
@@ -1,118 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.tika.eval;
-
-import static org.apache.tika.eval.AbstractProfiler.NON_EXISTENT_FILE_LENGTH;
-
-import java.io.IOException;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.util.Objects;
-
-/**
- * Simple struct to keep track of relative path of source file (
- * original binary file, e.g. /subdir/document1.doc)
- * and the extract file (e.g. /subdir/document1.doc.json).
- */
-class EvalFilePaths {
-
-    private final Path relativeSourceFilePath;
-    private final Path extractFile;
-
-    private long sourceFileLength = NON_EXISTENT_FILE_LENGTH;
-    private long extractFileLength = NON_EXISTENT_FILE_LENGTH;
-
-
-    public EvalFilePaths(Path relativeSourceFilePath, Path extractFile, long srcFileLen) {
-        this(relativeSourceFilePath, extractFile);
-        this.sourceFileLength = srcFileLen;
-    }
-
-    public EvalFilePaths(Path relativeSourceFilePath, Path extractFile) {
-        if (extractFile != null && Files.isRegularFile(extractFile)) {
-            try {
-                extractFileLength = Files.size(extractFile);
-            } catch (IOException e) {
-                //swallow ?
-            }
-        }
-        this.relativeSourceFilePath = relativeSourceFilePath;
-        this.extractFile = extractFile;
-    }
-
-    public Path getRelativeSourceFilePath() {
-        return relativeSourceFilePath;
-    }
-
-    //this path may or may not exist and it could be null!
-    public Path getExtractFile() {
-        return extractFile;
-    }
-
-    //if it doesn't exist, it'll be -1l.
-    public long getSourceFileLength() {
-        return sourceFileLength;
-    }
-
-    public long getExtractFileLength() {
-        return extractFileLength;
-    }
-
-    @Override
-    public boolean equals(Object o) {
-        if (this == o) {
-            return true;
-        }
-        if (o == null || getClass() != o.getClass()) {
-            return false;
-        }
-
-        EvalFilePaths that = (EvalFilePaths) o;
-
-        if (sourceFileLength != that.sourceFileLength) {
-            return false;
-        }
-        if (extractFileLength != that.extractFileLength) {
-            return false;
-        }
-        if (!Objects.equals(relativeSourceFilePath, that.relativeSourceFilePath)) {
-            return false;
-        }
-        return Objects.equals(extractFile, that.extractFile);
-
-    }
-
-    @Override
-    public int hashCode() {
-        int result = relativeSourceFilePath != null ? relativeSourceFilePath.hashCode() : 0;
-        result = 31 * result + (extractFile != null ? extractFile.hashCode() : 0);
-        result = 31 * result + (int) (sourceFileLength ^ (sourceFileLength >>> 32));
-        result = 31 * result + (int) (extractFileLength ^ (extractFileLength >>> 32));
-        return result;
-    }
-
-    @Override
-    public String toString() {
-        return "EvalFilePaths{" +
-                "relativeSourceFilePath=" + relativeSourceFilePath +
-                ", extractFile=" + extractFile +
-                ", sourceFileLength=" + sourceFileLength +
-                ", extractFileLength=" + extractFileLength +
-                '}';
-    }
-}
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/ExtractComparer.java b/tika-eval/src/main/java/org/apache/tika/eval/ExtractComparer.java
deleted file mode 100644
index 79df621..0000000
--- a/tika-eval/src/main/java/org/apache/tika/eval/ExtractComparer.java
+++ /dev/null
@@ -1,538 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.eval;
-
-import java.io.IOException;
-import java.nio.file.Path;
-import java.sql.Types;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
-import java.util.concurrent.ArrayBlockingQueue;
-
-import org.apache.commons.cli.HelpFormatter;
-import org.apache.commons.cli.Option;
-import org.apache.commons.cli.Options;
-import org.apache.commons.io.FilenameUtils;
-import org.apache.tika.batch.FileResource;
-import org.apache.tika.batch.fs.FSProperties;
-import org.apache.tika.config.TikaConfig;
-import org.apache.tika.eval.db.ColInfo;
-import org.apache.tika.eval.db.Cols;
-import org.apache.tika.eval.db.TableInfo;
-import org.apache.tika.eval.io.ExtractReader;
-import org.apache.tika.eval.io.ExtractReaderException;
-import org.apache.tika.eval.io.IDBWriter;
-import org.apache.tika.eval.textstats.BasicTokenCountStatsCalculator;
-import org.apache.tika.eval.tokens.ContrastStatistics;
-import org.apache.tika.eval.tokens.TokenContraster;
-import org.apache.tika.eval.tokens.TokenCounts;
-import org.apache.tika.eval.tokens.TokenIntPair;
-import org.apache.tika.eval.util.ContentTags;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler;
-
-public class ExtractComparer extends AbstractProfiler {
-
-    static Options OPTIONS;
-    static {
-        Option extractsA = new Option("extractsA", true, "directory for extractsA files");
-        extractsA.setRequired(true);
-
-        Option extractsB = new Option("extractsB", true, "directory for extractsB files");
-        extractsB.setRequired(true);
-
-        Option inputDir = new Option("inputDir", true,
-                "optional: directory of original binary input files if it exists " +
-                        "or can be the same as -extractsA or -extractsB. If not specified, -inputDir=-extractsA");
-
-
-        OPTIONS = new Options()
-                .addOption(extractsA)
-                .addOption(extractsB)
-                .addOption(inputDir)
-                .addOption("bc", "optional: tika-batch config file")
-                .addOption("numConsumers", true, "optional: number of consumer threads")
-                .addOption(new Option("alterExtract", true,
-                        "for json-formatted extract files, " +
-                                "process full metadata list ('as_is'=default), " +
-                                "take just the first/container document ('first_only'), " +
-                                "concatenate all content into the first metadata item ('concatenate_content')"))
-                .addOption("minExtractLength", true, "minimum extract length to process (in bytes)")
-                .addOption("maxExtractLength", true, "maximum extract length to process (in bytes)")
-                .addOption("db", true, "db file to which to write results")
-                .addOption("jdbc", true, "EXPERT: full jdbc connection string. Must specify this or -db <h2db>")
-                .addOption("jdbcDriver", true, "EXPERT: jdbc driver, or specify via -Djdbc.driver")
-                .addOption("tablePrefixA", true, "EXPERT: optional prefix for table names for A")
-                .addOption("tablePrefixB", true, "EXPERT: optional prefix for table names for B")
-                .addOption("drop", false, "drop tables if they exist")
-                .addOption("maxFilesToAdd", true, "maximum number of files to add to the crawler")
-                .addOption("maxTokens", true, "maximum tokens to process, default=200000")
-                .addOption("maxContentLength", true, "truncate content beyond this length for calculating 'contents' stats, default=1000000")
-                .addOption("maxContentLengthForLangId", true, "truncate content beyond this length for language id, default=50000")
-                .addOption("defaultLangCode", true, "which language to use for common words if no 'common words' file exists for the langid result")
-        ;
-    }
-
-    private static final String DIGEST_KEY_PREFIX = TikaCoreProperties.TIKA_META_PREFIX+
-            "digest"+TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER;
-
-    public static void USAGE() {
-        HelpFormatter helpFormatter = new HelpFormatter();
-        helpFormatter.printHelp(
-                80,
-                "java -jar tika-eval-x.y.jar Compare -extractsA extractsA -extractsB extractsB -db mydb",
-                "Tool: Compare",
-                ExtractComparer.OPTIONS,
-                "Note: for the default h2 db, do not include the .mv.db at the end of the db name.");
-    }
-
-    private final static String FIELD_A = "fa";
-    private final static String FIELD_B = "fb";
-
-    public static TableInfo REF_PAIR_NAMES = new TableInfo("pair_names",
-            new ColInfo(Cols.DIR_NAME_A, Types.VARCHAR, 128),
-            new ColInfo(Cols.DIR_NAME_B, Types.VARCHAR, 128)
-    );
-
-    public static TableInfo COMPARISON_CONTAINERS = new TableInfo("containers",
-            new ColInfo(Cols.CONTAINER_ID, Types.INTEGER, "PRIMARY KEY"),
-            new ColInfo(Cols.FILE_PATH, Types.VARCHAR, FILE_PATH_MAX_LEN),
-            new ColInfo(Cols.FILE_EXTENSION, Types.VARCHAR, 12),
-            new ColInfo(Cols.LENGTH, Types.BIGINT),
-            new ColInfo(Cols.EXTRACT_FILE_LENGTH_A, Types.BIGINT),
-            new ColInfo(Cols.EXTRACT_FILE_LENGTH_B, Types.BIGINT)
-    );
-
-    public static TableInfo CONTENT_COMPARISONS = new TableInfo("content_comparisons",
-            new ColInfo(Cols.ID, Types.INTEGER, "PRIMARY KEY"),
-            new ColInfo(Cols.TOP_10_UNIQUE_TOKEN_DIFFS_A, Types.VARCHAR, 1024),
-            new ColInfo(Cols.TOP_10_UNIQUE_TOKEN_DIFFS_B, Types.VARCHAR, 1024),
-            new ColInfo(Cols.TOP_10_MORE_IN_A, Types.VARCHAR, 1024),
-            new ColInfo(Cols.TOP_10_MORE_IN_B, Types.VARCHAR, 1024),
-            new ColInfo(Cols.DICE_COEFFICIENT, Types.FLOAT),
-            new ColInfo(Cols.OVERLAP, Types.FLOAT)
-    );
-
-    public static TableInfo PROFILES_A = new TableInfo( "profiles_a",
-            ExtractProfiler.PROFILE_TABLE.getColInfos());
-
-    public static TableInfo PROFILES_B = new TableInfo( "profiles_b",
-            ExtractProfiler.PROFILE_TABLE.getColInfos());
-
-    public static TableInfo EMBEDDED_FILE_PATH_TABLE_A = new TableInfo( "emb_path_a",
-            ExtractProfiler.EMBEDDED_FILE_PATH_TABLE.getColInfos());
-
-    public static TableInfo EMBEDDED_FILE_PATH_TABLE_B = new TableInfo( "emb_path_b",
-            ExtractProfiler.EMBEDDED_FILE_PATH_TABLE.getColInfos());
-
-
-    public static TableInfo CONTENTS_TABLE_A = new TableInfo( "contents_a",
-            ExtractProfiler.CONTENTS_TABLE.getColInfos());
-
-    public static TableInfo CONTENTS_TABLE_B = new TableInfo( "contents_b",
-            ExtractProfiler.CONTENTS_TABLE.getColInfos());
-
-    public static TableInfo TAGS_TABLE_A = new TableInfo( "tags_a",
-            ExtractProfiler.TAGS_TABLE.getColInfos());
-
-    public static TableInfo TAGS_TABLE_B = new TableInfo( "tags_b",
-            ExtractProfiler.TAGS_TABLE.getColInfos());
-
-
-    public static TableInfo EXCEPTION_TABLE_A = new TableInfo ("exceptions_a",
-            ExtractProfiler.EXCEPTION_TABLE.getColInfos());
-
-    public static TableInfo EXCEPTION_TABLE_B = new TableInfo ("exceptions_b",
-            ExtractProfiler.EXCEPTION_TABLE.getColInfos());
-
-    public static TableInfo EXTRACT_EXCEPTION_TABLE_A = new TableInfo("extract_exceptions_a",
-            ExtractProfiler.EXTRACT_EXCEPTION_TABLE.getColInfos());
-    public static TableInfo EXTRACT_EXCEPTION_TABLE_B = new TableInfo("extract_exceptions_b",
-            ExtractProfiler.EXTRACT_EXCEPTION_TABLE.getColInfos());
-
-
-    //need to parameterize?
-    private final TikaConfig config = TikaConfig.getDefaultConfig();
-
-    private final Path inputDir;
-    private final Path extractsA;
-    private final Path extractsB;
-
-    private final TokenContraster tokenContraster = new TokenContraster();
-    private final ExtractReader extractReader;
-
-    public ExtractComparer(ArrayBlockingQueue<FileResource> queue,
-                           Path inputDir, Path extractsA, Path extractsB, ExtractReader extractReader,
-                           IDBWriter writer) {
-        super(queue, writer);
-        this.inputDir = inputDir;
-        this.extractsA = extractsA;
-        this.extractsB = extractsB;
-        this.extractReader = extractReader;
-    }
-
-    @Override
-    public boolean processFileResource(FileResource fileResource) {
-        Metadata metadata = fileResource.getMetadata();
-        EvalFilePaths fpsA = null;
-        EvalFilePaths fpsB = null;
-
-        if (inputDir != null && (inputDir.equals(extractsA) ||
-                inputDir.equals(extractsB))) {
-            //crawling an extract dir
-            fpsA = getPathsFromExtractCrawl(metadata, extractsA);
-            fpsB = getPathsFromExtractCrawl(metadata, extractsB);
-
-        } else {
-            fpsA = getPathsFromSrcCrawl(metadata, inputDir, extractsA);
-            fpsB = getPathsFromSrcCrawl(metadata, inputDir, extractsB);
-        }
-
-        try {
-            compareFiles(fpsA, fpsB);
-        } catch (Throwable e) {
-            //this should be cataclysmic...
-            throw new RuntimeException("Exception while working on: " +
-                    metadata.get(FSProperties.FS_REL_PATH), e);
-        }
-        return true;
-    }
-
-    //protected for testing, should find better way so that this can be private!
-    protected void compareFiles(EvalFilePaths fpsA,
-                                EvalFilePaths fpsB) throws IOException {
-
-        ExtractReaderException.TYPE extractExceptionA = null;
-        ExtractReaderException.TYPE extractExceptionB = null;
-
-        List<Metadata> metadataListA = null;
-        if (extractExceptionA == null) {
-            try {
-                metadataListA = extractReader.loadExtract(fpsA.getExtractFile());
-            } catch (ExtractReaderException e) {
-                extractExceptionA = e.getType();
-            }
-        }
-
-        List<Metadata> metadataListB = null;
-        try {
-            metadataListB = extractReader.loadExtract(fpsB.getExtractFile());
-        } catch (ExtractReaderException e) {
-            extractExceptionB = e.getType();
-        }
-
-        //array indices for those metadata items handled in B
-        Set<Integer> handledB = new HashSet<>();
-        String containerID = Integer.toString(ID.getAndIncrement());
-        //container table
-        Map<Cols, String> contData = new HashMap<>();
-        contData.put(Cols.CONTAINER_ID, containerID);
-        contData.put(Cols.FILE_PATH, fpsA.getRelativeSourceFilePath().toString());
-        long srcFileLength = getSourceFileLength(metadataListA, metadataListB);
-        contData.put(Cols.LENGTH,
-                srcFileLength > NON_EXISTENT_FILE_LENGTH ?
-                    Long.toString(srcFileLength) : "");
-        contData.put(Cols.FILE_EXTENSION,
-                FilenameUtils.getExtension(fpsA.getRelativeSourceFilePath().getFileName().toString()));
-
-        long extractFileLengthA = getFileLength(fpsA.getExtractFile());
-        contData.put(Cols.EXTRACT_FILE_LENGTH_A, extractFileLengthA > NON_EXISTENT_FILE_LENGTH ?
-                Long.toString(extractFileLengthA) : "");
-
-        long extractFileLengthB = getFileLength(fpsB.getExtractFile());
-        contData.put(Cols.EXTRACT_FILE_LENGTH_B, extractFileLengthB > NON_EXISTENT_FILE_LENGTH ?
-                Long.toString(extractFileLengthB) : "");
-
-        writer.writeRow(COMPARISON_CONTAINERS, contData);
-
-        if (extractExceptionA != null) {
-            writeExtractException(EXTRACT_EXCEPTION_TABLE_A, containerID, fpsA.getRelativeSourceFilePath().toString(),
-                    extractExceptionA);
-        }
-        if (extractExceptionB != null) {
-            writeExtractException(EXTRACT_EXCEPTION_TABLE_B, containerID, fpsB.getRelativeSourceFilePath().toString(),
-                    extractExceptionB);
-        }
-
-        if (metadataListA == null && metadataListB == null) {
-            return;
-        }
-        List<Integer> numAttachmentsA = countAttachments(metadataListA);
-        List<Integer> numAttachmentsB = countAttachments(metadataListB);
-
-        String sharedDigestKey = findSharedDigestKey(metadataListA, metadataListB);
-        Map<Class, Object> tokenStatsA = null;
-        Map<Class, Object> tokenStatsB = null;
-        //now get that metadata
-        if (metadataListA != null) {
-            for (int i = 0; i < metadataListA.size(); i++) {
-                //the first file should have the same id as the container id
-                String fileId = (i == 0) ? containerID : Integer.toString(ID.getAndIncrement());
-                Metadata metadataA = metadataListA.get(i);
-                ContentTags contentTagsA = getContent(fpsA, metadataA);
-                ContentTags contentTagsB = ContentTags.EMPTY_CONTENT_TAGS;
-                Metadata metadataB = null;
-
-                //TODO: shouldn't be fileA!!!!
-                writeTagData(fileId, contentTagsA, TAGS_TABLE_A);
-
-                writeProfileData(fpsA, i, contentTagsA, metadataA, fileId, containerID, numAttachmentsA, PROFILES_A);
-                writeExceptionData(fileId, metadataA, EXCEPTION_TABLE_A);
-                int matchIndex = getMatch(i, sharedDigestKey,
-                        handledB, metadataListA, metadataListB);
-
-                if (matchIndex > -1 && ! handledB.contains(matchIndex)) {
-                    metadataB = metadataListB.get(matchIndex);
-                    handledB.add(matchIndex);
-                }
-                if (metadataB != null) {
-                    contentTagsB = getContent(fpsB, metadataB);
-                    writeTagData(fileId, contentTagsB, TAGS_TABLE_B);
-                    writeProfileData(fpsB, i, contentTagsB, metadataB, fileId, containerID, numAttachmentsB, PROFILES_B);
-                    writeExceptionData(fileId, metadataB, EXCEPTION_TABLE_B);
-                }
-                writeEmbeddedFilePathData(i, fileId, metadataA, metadataB);
-                //write content
-                try {
-                    tokenStatsA = calcTextStats(contentTagsA);
-                    writeContentData(fileId, tokenStatsA, CONTENTS_TABLE_A);
-                    tokenStatsB = calcTextStats(contentTagsB);
-                    if (metadataB != null) {
-                        writeContentData(fileId, tokenStatsB, CONTENTS_TABLE_B);
-                    }
-                } catch (IOException e) {
-                    throw new RuntimeException(e);
-                }
-                if (metadataB != null) {
-                    TokenCounts tokenCountsA = (TokenCounts) tokenStatsA.get(BasicTokenCountStatsCalculator.class);
-                    TokenCounts tokenCountsB = (TokenCounts) tokenStatsB.get(BasicTokenCountStatsCalculator.class);
-                    //arbitrary decision...only run the comparisons if there are > 10 tokens total
-                    //We may want to bump that value a bit higher?
-                    //now run comparisons
-                    if (tokenCountsA.getTotalTokens()
-                            + tokenCountsB.getTotalTokens() > 10) {
-                        Map<Cols, String> data = new HashMap<>();
-                        data.put(Cols.ID, fileId);
-
-                        ContrastStatistics contrastStatistics =
-                                tokenContraster.calculateContrastStatistics(
-                                        tokenCountsA,
-                                        tokenCountsB);
-
-                        writeContrasts(data, contrastStatistics);
-                        writer.writeRow(CONTENT_COMPARISONS, data);
-                    }
-                }
-            }
-        }
-        //now try to get any Metadata objects in B
-        //that haven't yet been handled.
-        if (metadataListB != null) {
-            for (int i = 0; i < metadataListB.size(); i++) {
-                if (handledB.contains(i)) {
-                    continue;
-                }
-                Metadata metadataB = metadataListB.get(i);
-                ContentTags contentTagsB = getContent(fpsB, metadataB);
-                //the first file should have the same id as the container id
-                String fileId = (i == 0) ? containerID : Integer.toString(ID.getAndIncrement());
-                writeTagData(fileId, contentTagsB, TAGS_TABLE_B);
-                writeProfileData(fpsB, i, contentTagsB, metadataB, fileId, containerID, numAttachmentsB, PROFILES_B);
-                writeEmbeddedFilePathData(i, fileId, null, metadataB);
-                writeExceptionData(fileId, metadataB, EXCEPTION_TABLE_B);
-
-                //write content
-                try {
-                    tokenStatsB = calcTextStats(contentTagsB);
-                    writeContentData(fileId, tokenStatsB, CONTENTS_TABLE_B);
-                } catch (IOException e) {
-                    throw new RuntimeException(e);
-                }
-            }
-        }
-    }
-
-    /**
-     * Checks only the first item in each list. Returns the first
-     * digest key shared by both, if it exists, null otherwise.
-     * @param metadataListA
-     * @param metadataListB
-     * @return
-     */
-    private String findSharedDigestKey(List<Metadata> metadataListA, List<Metadata> metadataListB) {
-        if (metadataListB == null || metadataListB.size() == 0) {
-            return null;
-        }
-        Set<String> digestA = new HashSet<>();
-        if (metadataListA != null) {
-            for (String n : metadataListA.get(0).names()) {
-                if (n.startsWith(DIGEST_KEY_PREFIX)) {
-                    digestA.add(n);
-                }
-            }
-        }
-        Metadata bMain = metadataListB.get(0);
-        for (String n : bMain.names()) {
-            if (digestA.contains(n)) {
-                return n;
-            }
-        }
-        return null;
-    }
-
-    private void writeEmbeddedFilePathData(int i, String fileId, Metadata mA, Metadata mB) {
-        //container file, don't write anything
-        if (i == 0) {
-            return;
-        }
-        String pathA = null;
-        String pathB = null;
-        if (mA != null) {
-            pathA = mA.get(AbstractRecursiveParserWrapperHandler.EMBEDDED_RESOURCE_PATH);
-        }
-        if (mB != null) {
-            pathB = mB.get(AbstractRecursiveParserWrapperHandler.EMBEDDED_RESOURCE_PATH);
-        }
-        if (pathA != null) {
-            Map<Cols, String> d = new HashMap<>();
-            d.put(Cols.ID, fileId);
-            d.put(Cols.EMBEDDED_FILE_PATH, pathA);
-            try {
-                writer.writeRow(EMBEDDED_FILE_PATH_TABLE_A, d);
-            } catch (IOException e) {
-                throw new RuntimeException(e);
-            }
-        }
-        if (pathB != null &&
-                (pathA == null || ! pathA.equals(pathB))) {
-            Map<Cols, String> d = new HashMap<>();
-            d.put(Cols.ID, fileId);
-            d.put(Cols.EMBEDDED_FILE_PATH, pathB);
-            try {
-                writer.writeRow(EMBEDDED_FILE_PATH_TABLE_B, d);
-            } catch (IOException e) {
-                throw new RuntimeException(e);
-            }
-        }
-    }
-
-    private long getSourceFileLength(List<Metadata> metadataListA, List<Metadata> metadataListB) {
-        long len = getSourceFileLength(metadataListA);
-        if (len > NON_EXISTENT_FILE_LENGTH) {
-            return len;
-        }
-        return getSourceFileLength(metadataListB);
-    }
-
-
-    /**
-     * Try to find the matching metadata based on the AbstractRecursiveParserWrapperHandler.EMBEDDED_RESOURCE_PATH
-     * If no match can be found, return -1.
-     *
-     * @param aIndex                index for match in metadataListA
-     * @param metadataListA
-     * @param metadataListB
-     * @return
-     */
-    private int getMatch(int aIndex, String sharedDigestKey, Set<Integer> handledB,
-                         List<Metadata> metadataListA,
-                         List<Metadata> metadataListB) {
-        //TODO: could make this more robust
-        if (metadataListB == null || metadataListB.size() == 0) {
-            return -1;
-        }
-        //assume first is always the container file
-        if (aIndex == 0) {
-            return 0;
-        }
-
-        if (sharedDigestKey != null) {
-            //first try to find matching digests
-            //this does not elegantly handle multiple matching digests
-            return findMatchingDigests(sharedDigestKey, handledB,
-                    metadataListA.get(aIndex), metadataListB);
-        }
-
-        //assume same embedded resource path.  Not always true!
-        Metadata thisMetadata = metadataListA.get(aIndex);
-        String embeddedPath = thisMetadata.get(AbstractRecursiveParserWrapperHandler.EMBEDDED_RESOURCE_PATH);
-        if (embeddedPath != null) {
-            for (int j = 0; j < metadataListB.size(); j++) {
-                String thatEmbeddedPath = metadataListB.get(j).get(
-                        AbstractRecursiveParserWrapperHandler.EMBEDDED_RESOURCE_PATH);
-                if (embeddedPath.equals(thatEmbeddedPath)) {
-                    return j;
-                }
-            }
-        }
-
-        //last resort, if lists are same size, guess the same index
-        if (metadataListA.size() == metadataListB.size()) {
-            //assume no rearrangements if lists are the same size
-            return aIndex;
-        }
-        return -1;
-    }
-
-    private int findMatchingDigests(String sharedDigestKey,
-                                    Set<Integer> handledB,
-                                    Metadata metadata, List<Metadata> metadataListB) {
-        String digestA = metadata.get(sharedDigestKey);
-        if (digestA == null) {
-            return -1;
-        }
-
-        for (int i = 0; i < metadataListB.size(); i++)  {
-            if (handledB.contains(i)) {
-                continue;
-            }
-            Metadata mB = metadataListB.get(i);
-            String digestB = mB.get(sharedDigestKey);
-            if (digestA.equalsIgnoreCase(digestB)) {
-                return i;
-            }
-        }
-        return -1;
-    }
-
-    private void writeContrasts(Map<Cols, String> data, ContrastStatistics contrastStatistics) {
-        writeContrastString(data, Cols.TOP_10_MORE_IN_A, contrastStatistics.getTopNMoreA());
-        writeContrastString(data, Cols.TOP_10_MORE_IN_B, contrastStatistics.getTopNMoreB());
-        writeContrastString(data, Cols.TOP_10_UNIQUE_TOKEN_DIFFS_A, contrastStatistics.getTopNUniqueA());
-        writeContrastString(data, Cols.TOP_10_UNIQUE_TOKEN_DIFFS_B, contrastStatistics.getTopNUniqueB());
-        data.put(Cols.OVERLAP, Double.toString(contrastStatistics.getOverlap()));
-        data.put(Cols.DICE_COEFFICIENT, Double.toString(contrastStatistics.getDiceCoefficient()));
-
-    }
-
-    private void writeContrastString(Map<Cols, String> data, Cols col, TokenIntPair[] tokenIntPairs) {
-
-        int i = 0;
-        StringBuilder sb = new StringBuilder();
-        for (TokenIntPair p : tokenIntPairs) {
-            if (i++ > 0) {
-                sb.append(" | ");
-            }
-            sb.append(p.getToken()).append(": ").append(p.getValue());
-        }
-        data.put(col, sb.toString());
-    }
-}
\ No newline at end of file
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/ExtractProfiler.java b/tika-eval/src/main/java/org/apache/tika/eval/ExtractProfiler.java
deleted file mode 100644
index 5fad576..0000000
--- a/tika-eval/src/main/java/org/apache/tika/eval/ExtractProfiler.java
+++ /dev/null
@@ -1,281 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.eval;
-
-import java.io.IOException;
-import java.nio.file.Path;
-import java.sql.Types;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-import java.util.concurrent.ArrayBlockingQueue;
-
-import org.apache.commons.cli.HelpFormatter;
-import org.apache.commons.cli.Option;
-import org.apache.commons.cli.Options;
-import org.apache.tika.batch.FileResource;
-import org.apache.tika.eval.db.ColInfo;
-import org.apache.tika.eval.db.Cols;
-import org.apache.tika.eval.db.TableInfo;
-import org.apache.tika.eval.io.ExtractReader;
-import org.apache.tika.eval.io.ExtractReaderException;
-import org.apache.tika.eval.io.IDBWriter;
-import org.apache.tika.eval.util.ContentTags;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler;
-
-public class ExtractProfiler extends AbstractProfiler {
-
-    static Options OPTIONS;
-    static {
-        //By the time this commandline is parsed, there should be both an extracts and an inputDir
-        Option extracts = new Option("extracts", true, "directory for extract files");
-        extracts.setRequired(true);
-
-        Option inputDir = new Option("inputDir", true,
-                "optional: directory for original binary input documents."+
-        " If not specified, -extracts is crawled as is.");
-
-        OPTIONS = new Options()
-                .addOption(extracts)
-                .addOption(inputDir)
-                .addOption("bc", "optional: tika-batch config file")
-                .addOption("numConsumers", true, "optional: number of consumer threads")
-                .addOption(new Option("alterExtract", true,
-                        "for json-formatted extract files, " +
-                                "process full metadata list ('as_is'=default), " +
-                                "take just the first/container document ('first_only'), " +
-                                "concatenate all content into the first metadata item ('concatenate_content')"))
-                .addOption("minExtractLength", true, "minimum extract length to process (in bytes)")
-                .addOption("maxExtractLength", true, "maximum extract length to process (in bytes)")
-                .addOption("db", true, "db file to which to write results")
-                .addOption("jdbc", true, "EXPERT: full jdbc connection string. Must specify this or -db <h2db>")
-                .addOption("jdbcDriver", true, "EXPERT: jdbc driver, or specify via -Djdbc.driver")
-                .addOption("tablePrefix", true, "EXPERT: optional prefix for table names")
-                .addOption("drop", false, "drop tables if they exist")
-                .addOption("maxFilesToAdd", true, "maximum number of files to add to the crawler")
-                .addOption("maxTokens", true, "maximum tokens to process, default=200000")
-                .addOption("maxContentLength", true, "truncate content beyond this length for calculating 'contents' stats, default=1000000")
-                .addOption("maxContentLengthForLangId", true, "truncate content beyond this length for language id, default=50000")
-                .addOption("defaultLangCode", true, "which language to use for common words if no 'common words' file exists for the langid result")
-
-        ;
-
-    }
-
-    public static void USAGE() {
-        HelpFormatter helpFormatter = new HelpFormatter();
-        helpFormatter.printHelp(
-                80,
-                "java -jar tika-eval-x.y.jar Profile -extracts extracts -db mydb [-inputDir input]",
-                "Tool: Profile",
-                ExtractProfiler.OPTIONS,
-                "Note: for the default h2 db, do not include the .mv.db at the end of the db name.");
-    }
-
-    private final static String FIELD = "f";
-
-    public static TableInfo EXTRACT_EXCEPTION_TABLE = new TableInfo("extract_exceptions",
-            new ColInfo(Cols.CONTAINER_ID, Types.INTEGER),
-            new ColInfo(Cols.FILE_PATH, Types.VARCHAR, FILE_PATH_MAX_LEN),
-            new ColInfo(Cols.EXTRACT_EXCEPTION_ID, Types.INTEGER),
-            new ColInfo(Cols.PARSE_ERROR_ID, Types.INTEGER)
-    );
-
-    public static TableInfo EXCEPTION_TABLE = new TableInfo("parse_exceptions",
-            new ColInfo(Cols.ID, Types.INTEGER, "PRIMARY KEY"),
-            new ColInfo(Cols.ORIG_STACK_TRACE, Types.VARCHAR, 8192),
-            new ColInfo(Cols.SORT_STACK_TRACE, Types.VARCHAR, 8192),
-            new ColInfo(Cols.PARSE_EXCEPTION_ID, Types.INTEGER)
-    );
-
-
-    public static TableInfo CONTAINER_TABLE = new TableInfo("containers",
-            new ColInfo(Cols.CONTAINER_ID, Types.INTEGER, "PRIMARY KEY"),
-            new ColInfo(Cols.FILE_PATH, Types.VARCHAR, FILE_PATH_MAX_LEN),
-            new ColInfo(Cols.LENGTH, Types.BIGINT),
-            new ColInfo(Cols.EXTRACT_FILE_LENGTH, Types.BIGINT)
-    );
-
-    public static TableInfo PROFILE_TABLE = new TableInfo("profiles",
-            new ColInfo(Cols.ID, Types.INTEGER, "PRIMARY KEY"),
-            new ColInfo(Cols.CONTAINER_ID, Types.INTEGER),
-            new ColInfo(Cols.FILE_NAME, Types.VARCHAR, 256),
-            new ColInfo(Cols.MD5, Types.CHAR, 32),
-            new ColInfo(Cols.LENGTH, Types.BIGINT),
-            new ColInfo(Cols.IS_EMBEDDED, Types.BOOLEAN),
-            new ColInfo(Cols.FILE_EXTENSION, Types.VARCHAR, 12),
-            new ColInfo(Cols.MIME_ID, Types.INTEGER),
-            new ColInfo(Cols.ELAPSED_TIME_MILLIS, Types.INTEGER),
-            new ColInfo(Cols.NUM_ATTACHMENTS, Types.INTEGER),
-            new ColInfo(Cols.NUM_METADATA_VALUES, Types.INTEGER),
-            new ColInfo(Cols.NUM_PAGES, Types.INTEGER),
-            new ColInfo(Cols.HAS_CONTENT, Types.BOOLEAN)
-    );
-
-    public static TableInfo EMBEDDED_FILE_PATH_TABLE = new TableInfo("emb_file_names",
-            new ColInfo(Cols.ID, Types.INTEGER, "PRIMARY KEY"),
-            new ColInfo(Cols.EMBEDDED_FILE_PATH, Types.VARCHAR, 1024)
-    );
-
-    public static TableInfo CONTENTS_TABLE = new TableInfo("contents",
-            new ColInfo(Cols.ID, Types.INTEGER, "PRIMARY KEY"),
-            new ColInfo(Cols.CONTENT_LENGTH, Types.INTEGER),
-            new ColInfo(Cols.NUM_UNIQUE_TOKENS, Types.INTEGER),
-            new ColInfo(Cols.NUM_TOKENS, Types.INTEGER),
-            new ColInfo(Cols.COMMON_TOKENS_LANG, Types.VARCHAR, 12),
-            new ColInfo(Cols.NUM_UNIQUE_COMMON_TOKENS, Types.INTEGER),
-            new ColInfo(Cols.NUM_COMMON_TOKENS, Types.INTEGER),
-            new ColInfo(Cols.NUM_UNIQUE_ALPHABETIC_TOKENS, Types.INTEGER),
-            new ColInfo(Cols.NUM_ALPHABETIC_TOKENS, Types.INTEGER),
-            new ColInfo(Cols.TOP_N_TOKENS, Types.VARCHAR, 1024),
-            new ColInfo(Cols.LANG_ID_1, Types.VARCHAR, 12),
-            new ColInfo(Cols.LANG_ID_PROB_1, Types.FLOAT),
-            new ColInfo(Cols.LANG_ID_2, Types.VARCHAR, 12),
-            new ColInfo(Cols.LANG_ID_PROB_2, Types.FLOAT),
-            new ColInfo(Cols.UNICODE_CHAR_BLOCKS, Types.VARCHAR, 1024),
-            new ColInfo(Cols.TOKEN_ENTROPY_RATE, Types.FLOAT),
-            new ColInfo(Cols.TOKEN_LENGTH_SUM, Types.INTEGER),
-            new ColInfo(Cols.TOKEN_LENGTH_MEAN, Types.FLOAT),
-            new ColInfo(Cols.TOKEN_LENGTH_STD_DEV, Types.FLOAT),
-            new ColInfo(Cols.CONTENT_TRUNCATED_AT_MAX_LEN, Types.BOOLEAN)
-    );
-
-    public static TableInfo TAGS_TABLE = new TableInfo("tags",
-            new ColInfo(Cols.ID, Types.INTEGER, "PRIMARY KEY"),
-            new ColInfo(Cols.TAGS_A, Types.INTEGER),
-            new ColInfo(Cols.TAGS_B, Types.INTEGER),
-            new ColInfo(Cols.TAGS_DIV, Types.INTEGER),
-            new ColInfo(Cols.TAGS_I, Types.INTEGER),
-            new ColInfo(Cols.TAGS_IMG, Types.INTEGER),
-            new ColInfo(Cols.TAGS_LI, Types.INTEGER),
-            new ColInfo(Cols.TAGS_OL, Types.INTEGER),
-            new ColInfo(Cols.TAGS_P, Types.INTEGER),
-            new ColInfo(Cols.TAGS_TABLE, Types.INTEGER),
-            new ColInfo(Cols.TAGS_TD, Types.INTEGER),
-            new ColInfo(Cols.TAGS_TITLE, Types.INTEGER),
-            new ColInfo(Cols.TAGS_TR, Types.INTEGER),
-            new ColInfo(Cols.TAGS_U, Types.INTEGER),
-            new ColInfo(Cols.TAGS_UL, Types.INTEGER),
-            new ColInfo(Cols.TAGS_PARSE_EXCEPTION, Types.BOOLEAN)
-    );
-
-    private final Path inputDir;
-    private final Path extracts;
-    private final ExtractReader extractReader;
-
-    public ExtractProfiler(ArrayBlockingQueue<FileResource> queue,
-                           Path inputDir, Path extracts,
-                           ExtractReader extractReader, IDBWriter dbWriter) {
-        super(queue, dbWriter);
-        this.inputDir = inputDir;
-        this.extracts = extracts;
-        this.extractReader = extractReader;
-    }
-
-    @Override
-    public boolean processFileResource(FileResource fileResource) {
-        Metadata metadata = fileResource.getMetadata();
-        EvalFilePaths fps = null;
-
-        if (inputDir != null && inputDir.equals(extracts)) {
-            //crawling an extract dir
-            fps = getPathsFromExtractCrawl(metadata, extracts);
-        } else {
-            fps = getPathsFromSrcCrawl(metadata, inputDir, extracts);
-        }
-        int containerId = ID.incrementAndGet();
-        String containerIdString = Integer.toString(containerId);
-
-        ExtractReaderException.TYPE extractExceptionType = null;
-
-        List<Metadata> metadataList = null;
-        try {
-            metadataList = extractReader.loadExtract(fps.getExtractFile());
-        } catch (ExtractReaderException e) {
-            extractExceptionType = e.getType();
-        }
-
-        Map<Cols, String> contOutput = new HashMap<>();
-        Long srcFileLen = getSourceFileLength(fps, metadataList);
-        contOutput.put(Cols.LENGTH,
-                srcFileLen > NON_EXISTENT_FILE_LENGTH ?
-                        Long.toString(srcFileLen): "");
-        contOutput.put(Cols.CONTAINER_ID, containerIdString);
-        contOutput.put(Cols.FILE_PATH, fps.getRelativeSourceFilePath().toString());
-
-        if (fps.getExtractFileLength() > 0) {
-            contOutput.put(Cols.EXTRACT_FILE_LENGTH,
-                    (fps.getExtractFile() == null) ?
-                            "" :
-                    Long.toString(fps.getExtractFileLength()));
-        }
-        try {
-            writer.writeRow(CONTAINER_TABLE, contOutput);
-        } catch (IOException e) {
-            throw new RuntimeException(e);
-        }
-
-
-        if (extractExceptionType != null) {
-            try {
-                writeExtractException(EXTRACT_EXCEPTION_TABLE, containerIdString,
-                        fps.getRelativeSourceFilePath().toString(), extractExceptionType);
-            } catch (IOException e) {
-                throw new RuntimeException(e);
-            }
-            return true;
-        }
-
-        List<Integer> numAttachments = countAttachments(metadataList);
-        int i = 0;
-        for (Metadata m : metadataList) {
-            ContentTags contentTags = getContent(fps, m);
-            //the first file should have the same id as the container id
-            String fileId = (i == 0) ? containerIdString : Integer.toString(ID.incrementAndGet());
-            writeTagData(fileId, contentTags, TAGS_TABLE);
-            writeProfileData(fps, i, contentTags, m, fileId, containerIdString, numAttachments, PROFILE_TABLE);
-            writeEmbeddedPathData(i, fileId, m, EMBEDDED_FILE_PATH_TABLE);
-            writeExceptionData(fileId, m, EXCEPTION_TABLE);
-            try {
-                Map<Class, Object> textStats = calcTextStats(contentTags);
-                writeContentData(fileId, textStats, CONTENTS_TABLE);
-            } catch (IOException e) {
-                throw new RuntimeException(e);
-            }
-            i++;
-        }
-        return true;
-    }
-
-
-    private void writeEmbeddedPathData(int i, String fileId, Metadata m,
-                                       TableInfo embeddedFilePathTable) {
-        if (i == 0) {
-            return;
-        }
-        Map<Cols, String> data = new HashMap<>();
-        data.put(Cols.ID, fileId);
-        data.put(Cols.EMBEDDED_FILE_PATH,
-                m.get(AbstractRecursiveParserWrapperHandler.EMBEDDED_RESOURCE_PATH));
-        try {
-            writer.writeRow(embeddedFilePathTable, data);
-        } catch (IOException e) {
-            throw new RuntimeException(e);
-        }
-    }
-}
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/FileProfiler.java b/tika-eval/src/main/java/org/apache/tika/eval/FileProfiler.java
deleted file mode 100644
index 65908b7..0000000
--- a/tika-eval/src/main/java/org/apache/tika/eval/FileProfiler.java
+++ /dev/null
@@ -1,195 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.eval;
-
-import org.apache.commons.cli.HelpFormatter;
-import org.apache.commons.cli.Option;
-import org.apache.commons.cli.Options;
-import org.apache.commons.codec.digest.DigestUtils;
-import org.apache.commons.io.FilenameUtils;
-import org.apache.tika.Tika;
-import org.apache.tika.batch.FileResource;
-import org.apache.tika.batch.fs.FSProperties;
-import org.apache.tika.detect.FileCommandDetector;
-import org.apache.tika.eval.db.ColInfo;
-import org.apache.tika.eval.db.Cols;
-import org.apache.tika.eval.db.TableInfo;
-import org.apache.tika.eval.io.IDBWriter;
-import org.apache.tika.io.TikaInputStream;
-import org.apache.tika.metadata.Metadata;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.sql.Types;
-import java.util.HashMap;
-import java.util.Map;
-import java.util.concurrent.ArrayBlockingQueue;
-
-/**
- * This class profiles actual files as opposed to extracts e.g. {@link ExtractProfiler}.
- * This does _not_ parse files, but does run file type identification and digests the
- * raw bytes.
- *
- * If the 'file' command is available on the command line, this will also run the
- * FileCommandDetector.
- */
-
-public class FileProfiler extends AbstractProfiler {
-//TODO: we should allow users to select digest type/encoding and file detector(s).
-
-    private static final boolean HAS_FILE = FileCommandDetector.checkHasFile();
-    private static final Logger LOG = LoggerFactory.getLogger(FileProfiler.class);
-
-    static Options OPTIONS;
-    static {
-
-        Option inputDir = new Option("inputDir", true,
-                "optional: directory for original binary input documents."+
-                        " If not specified, -extracts is crawled as is.");
-
-        OPTIONS = new Options()
-                .addOption(inputDir)
-                .addOption("bc", "optional: tika-batch config file")
-                .addOption("numConsumers", true, "optional: number of consumer threads")
-                .addOption("db", true, "db file to which to write results")
-                .addOption("jdbc", true, "EXPERT: full jdbc connection string. Must specify this or -db <h2db>")
-                .addOption("jdbcDriver", true, "EXPERT: jdbc driver, or specify via -Djdbc.driver")
-                .addOption("tablePrefix", true, "EXPERT: optional prefix for table names")
-                .addOption("drop", false, "drop tables if they exist")
-                .addOption("maxFilesToAdd", true, "maximum number of files to add to the crawler")
-
-        ;
-
-    }
-
-    public static void USAGE() {
-        HelpFormatter helpFormatter = new HelpFormatter();
-        helpFormatter.printHelp(
-                80,
-                "java -jar tika-eval-x.y.jar FileProfiler -inputDir docs -db mydb [-inputDir input]",
-                "Tool: Profile",
-                FileProfiler.OPTIONS,
-                "Note: for the default h2 db, do not include the .mv.db at the end of the db name.");
-    }
-
-
-
-    public static TableInfo FILE_PROFILES = HAS_FILE ?
-            new TableInfo("file_profiles",
-                new ColInfo(Cols.FILE_PATH, Types.VARCHAR, 2048, "PRIMARY KEY"),
-                new ColInfo(Cols.FILE_NAME, Types.VARCHAR, 2048),
-                new ColInfo(Cols.FILE_EXTENSION, Types.VARCHAR, 24),
-                new ColInfo(Cols.LENGTH, Types.BIGINT),
-                new ColInfo(Cols.SHA256, Types.VARCHAR, 64),
-                new ColInfo(Cols.TIKA_MIME_ID, Types.INTEGER),
-                new ColInfo(Cols.FILE_MIME_ID, Types.INTEGER))
-            :
-            new TableInfo("file_profiles",
-                    new ColInfo(Cols.FILE_PATH, Types.VARCHAR, 2048, "PRIMARY KEY"),
-                    new ColInfo(Cols.FILE_NAME, Types.VARCHAR, 2048),
-                    new ColInfo(Cols.FILE_EXTENSION, Types.VARCHAR, 24),
-                    new ColInfo(Cols.LENGTH, Types.BIGINT),
-                    new ColInfo(Cols.SHA256, Types.VARCHAR, 64),
-                    new ColInfo(Cols.TIKA_MIME_ID, Types.INTEGER));
-
-
-    public static TableInfo FILE_MIME_TABLE = new TableInfo("file_mimes",
-            new ColInfo(Cols.MIME_ID, Types.INTEGER, "PRIMARY KEY"),
-            new ColInfo(Cols.MIME_STRING, Types.VARCHAR, 256),
-            new ColInfo(Cols.FILE_EXTENSION, Types.VARCHAR, 12)
-    );
-
-    public static final String DETECT_EXCEPTION = "detect-exception";
-    private static final Tika TIKA = new Tika();
-
-    private static final FileCommandDetector FILE_COMMAND_DETECTOR = new FileCommandDetector();
-    private final Path inputDir;
-
-    public FileProfiler(ArrayBlockingQueue<FileResource> fileQueue, Path inputDir, IDBWriter dbWriter) {
-        super(fileQueue, dbWriter);
-        this.inputDir = inputDir;
-    }
-
-
-    @Override
-    public boolean processFileResource(FileResource fileResource) {
-        String relPath = fileResource.getMetadata().get(FSProperties.FS_REL_PATH);
-        try (InputStream is = fileResource.openInputStream()) {
-            try (TikaInputStream tis = TikaInputStream.get(is)) {
-                Path path = tis.getPath();
-                Map<Cols, String> data = new HashMap<>();
-                int tikaMimeId = writer.getMimeId(detectTika(tis));
-                String fileName = "";
-                String extension = "";
-                long length = -1;
-                try {
-                    fileName = FilenameUtils.getName(relPath);
-                } catch (IllegalArgumentException e) {
-                    LOG.warn("bad file name: "+relPath, e);
-                }
-
-                try {
-                    extension = FilenameUtils.getExtension(relPath);
-                } catch (IllegalArgumentException e) {
-                    LOG.warn("bad extension: "+relPath, e);
-                }
-
-                try {
-                    length = Files.size(path);
-                } catch (IOException e) {
-                    LOG.warn("problem getting size: "+relPath, e);
-                }
-
-                data.put(Cols.FILE_PATH, relPath);
-                data.put(Cols.FILE_NAME, fileName);
-                data.put(Cols.FILE_EXTENSION, extension);
-                data.put(Cols.LENGTH, Long.toString(length));
-                data.put(Cols.TIKA_MIME_ID, Integer.toString(tikaMimeId));
-                data.put(Cols.SHA256, DigestUtils.sha256Hex(tis));
-                if (HAS_FILE) {
-                    int fileMimeId = writer.getMimeId(detectFile(tis));
-                    data.put(Cols.FILE_MIME_ID, Integer.toString(fileMimeId));
-                }
-                writer.writeRow(FILE_PROFILES, data);
-            }
-        } catch (IOException e) {
-            //log at least!
-            return false;
-        }
-        return true;
-    }
-
-    private String detectFile(TikaInputStream tis) {
-        try {
-            return FILE_COMMAND_DETECTOR.detect(tis, new Metadata()).toString();
-        } catch (IOException e) {
-            return DETECT_EXCEPTION;
-        }
-    }
-
-    private String detectTika(TikaInputStream tis) {
-        try {
-            return TIKA.detect(tis);
-        } catch (IOException e) {
-            return DETECT_EXCEPTION;
-        }
-    }
-}
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/TikaEvalCLI.java b/tika-eval/src/main/java/org/apache/tika/eval/TikaEvalCLI.java
deleted file mode 100644
index d678c2b..0000000
--- a/tika-eval/src/main/java/org/apache/tika/eval/TikaEvalCLI.java
+++ /dev/null
@@ -1,326 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.eval;
-
-import java.io.InputStream;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.nio.file.StandardCopyOption;
-import java.sql.SQLException;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.List;
-
-import org.apache.commons.cli.CommandLine;
-import org.apache.commons.cli.DefaultParser;
-import org.apache.commons.cli.ParseException;
-import org.apache.tika.batch.fs.FSBatchProcessCLI;
-import org.apache.tika.eval.reports.ResultsReporter;
-import org.h2.tools.Console;
-
-public class TikaEvalCLI {
-    static final String[] tools = {"Profile", "FileProfile",
-            "Compare", "Report", "StartDB"};
-
-    private static String specifyTools() {
-        StringBuilder sb = new StringBuilder();
-        sb.append("Must specify one of the following tools in the first parameter:\n");
-        for (String s : tools) {
-            sb.append(s+"\n");
-        }
-        return sb.toString();
-
-    }
-
-    private void execute(String[] args) throws Exception {
-        String tool = args[0];
-        String[] subsetArgs = new String[args.length-1];
-        System.arraycopy(args, 1, subsetArgs, 0, args.length - 1);
-        if (tool.equals("Report")) {
-            handleReport(subsetArgs);
-        } else if (tool.equals("Compare")) {
-            handleCompare(subsetArgs);
-        } else if (tool.equals("Profile")) {
-            handleProfile(subsetArgs);
-        } else if (tool.equals("StartDB")) {
-            handleStartDB(subsetArgs);
-        } else if (tool.equals("FileProfile")){
-            handleProfileFiles(subsetArgs);
-        } else {
-            System.out.println(specifyTools());
-        }
-    }
-
-    private void handleProfileFiles(String[] subsetArgs) throws Exception {
-        List<String> argList = new ArrayList<>(Arrays.asList(subsetArgs));
-
-        boolean containsBC = false;
-        String inputDir = null;
-        //confirm there's a batch-config file
-        for (int i = 0; i < argList.size(); i++) {
-            String arg = argList.get(i);
-            if (arg.equals("-bc")) {
-                containsBC = true;
-            }
-        }
-
-        Path tmpBCConfig = null;
-        try {
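-            //create a temp file; if the user did not supply -bc, copy the default
-            //batch config from the classpath into it and pass it on to tika-batch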
-            tmpBCConfig = Files.createTempFile("tika-eval-profiler", ".xml");
-            if (! containsBC) {
-                try (InputStream is = this.getClass().getResourceAsStream("/tika-eval-file-profiler-config.xml")) {
-                    Files.copy(is, tmpBCConfig, StandardCopyOption.REPLACE_EXISTING);
-                }
-                argList.add("-bc");
-                argList.add(tmpBCConfig.toAbsolutePath().toString());
-            }
-
-            String[] updatedArgs = argList.toArray(new String[argList.size()]);
-            DefaultParser defaultCLIParser = new DefaultParser();
-            try {
-                CommandLine commandLine = defaultCLIParser.parse(FileProfiler.OPTIONS, updatedArgs);
-                if (commandLine.hasOption("db") && commandLine.hasOption("jdbc")) {
-                    System.out.println("Please specify either the default -db or the full -jdbc, not both");
-                    FileProfiler.USAGE();
-                    return;
-                }
-            } catch (ParseException e) {
-                System.out.println(e.getMessage()+"\n");
-                FileProfiler.USAGE();
-                return;
-            }
-
-            FSBatchProcessCLI.main(updatedArgs);
-        } finally {
-            if (tmpBCConfig != null && Files.isRegularFile(tmpBCConfig)) {
-                Files.delete(tmpBCConfig);
-            }
-        }
-    }
-
-    private void handleStartDB(String[] args) throws SQLException {
-        List<String> argList = new ArrayList<>();
-        argList.add("-web");
-        Console.main(argList.toArray(new String[0]));
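-        //block until interrupted so the H2 web console stays up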
-        while(true) {
-            try {
-                Thread.sleep(1000);
-            } catch (InterruptedException e){
-                break;
-            }
-        }
-    }
-
-    private void handleProfile(String[] subsetArgs) throws Exception {
-        List<String> argList = new ArrayList<>(Arrays.asList(subsetArgs));
-
-        boolean containsBC = false;
-        String inputDir = null;
-        String extracts = null;
-        String alterExtract = null;
-        //confirm there's a batch-config file
-        for (int i = 0; i < argList.size(); i++) {
-            String arg = argList.get(i);
-            if (arg.equals("-bc")) {
-                containsBC = true;
-            } else if (arg.equals("-inputDir")) {
-                if (i+1 >= argList.size()) {
-                    System.err.println("Must specify directory after -inputDir");
-                    ExtractProfiler.USAGE();
-                    return;
-                }
-                inputDir = argList.get(i+1);
-                i++;
-            } else if (arg.equals("-extracts")) {
-                if (i+1 >= argList.size()) {
-                    System.err.println("Must specify directory after -extracts");
-                    ExtractProfiler.USAGE();
-                    return;
-                }
-                extracts = argList.get(i+1);
-                i++;
-            } else if (arg.equals("-alterExtract")) {
-                if (i+1 >= argList.size()) {
-                    System.err.println("Must specify type 'as_is', 'first_only' or " +
-                            "'concatenate_content' after -alterExtract");
-                    ExtractProfiler.USAGE();
-                    return;
-                }
-                alterExtract = argList.get(i+1);
-                i++;
-            }
-        }
-
-        if (alterExtract != null && !alterExtract.equals("as_is") &&
-                !alterExtract.equals("concatenate_content") &&
-                !alterExtract.equals("first_only")) {
-            System.out.println("Sorry, I don't understand:"+alterExtract+
-                    ". The values must be one of: as_is, first_only, concatenate_content");
-            ExtractProfiler.USAGE();
-            return;
-        }
-
-        //need to specify each in this commandline
-        //if only extracts is passed to tika-batch,
-        //the crawler will see no inputDir and start crawling "input".
-        //this allows the user to specify either extracts or inputDir
-        if (extracts == null && inputDir != null) {
-            argList.add("-extracts");
-            argList.add(inputDir);
-        } else if (inputDir == null && extracts != null) {
-            argList.add("-inputDir");
-            argList.add(extracts);
-        }
-
-        Path tmpBCConfig = null;
-        try {
-            tmpBCConfig = Files.createTempFile("tika-eval-profiler", ".xml");
-            if (! containsBC) {
-                try (InputStream is = this.getClass().getResourceAsStream("/tika-eval-profiler-config.xml")) {
-                    Files.copy(is, tmpBCConfig, StandardCopyOption.REPLACE_EXISTING);
-                }
-                argList.add("-bc");
-                argList.add(tmpBCConfig.toAbsolutePath().toString());
-            }
-
-            String[] updatedArgs = argList.toArray(new String[0]);
-            DefaultParser defaultCLIParser = new DefaultParser();
-            try {
-                CommandLine commandLine = defaultCLIParser.parse(ExtractProfiler.OPTIONS, updatedArgs);
-                if (commandLine.hasOption("db") && commandLine.hasOption("jdbc")) {
-                    System.out.println("Please specify either the default -db or the full -jdbc, not both");
-                    ExtractProfiler.USAGE();
-                    return;
-                }
-            } catch (ParseException e) {
-                System.out.println(e.getMessage()+"\n");
-                ExtractProfiler.USAGE();
-                return;
-            }
-
-            FSBatchProcessCLI.main(updatedArgs);
-        } finally {
-            if (tmpBCConfig != null && Files.isRegularFile(tmpBCConfig)) {
-                Files.delete(tmpBCConfig);
-            }
-        }
-    }
-
-    private void handleCompare(String[] subsetArgs) throws Exception{
-        List<String> argList = new ArrayList<>(Arrays.asList(subsetArgs));
-
-        boolean containsBC = false;
-        String inputDir = null;
-        String extractsA = null;
-        String alterExtract = null;
-        //confirm there's a batch-config file
-        for (int i = 0; i < argList.size(); i++) {
-            String arg = argList.get(i);
-            if (arg.equals("-bc")) {
-                containsBC = true;
-            } else if (arg.equals("-inputDir")) {
-                if (i+1 >= argList.size()) {
-                    System.err.println("Must specify directory after -inputDir");
-                    ExtractComparer.USAGE();
-                    return;
-                }
-                inputDir = argList.get(i+1);
-                i++;
-            } else if (arg.equals("-extractsA")) {
-                if (i+1 >= argList.size()) {
-                    System.err.println("Must specify directory after -extractsA");
-                    ExtractComparer.USAGE();
-                    return;
-                }
-                extractsA = argList.get(i+1);
-                i++;
-            } else if (arg.equals("-alterExtract")) {
-                if (i+1 >= argList.size()) {
-                    System.err.println("Must specify type 'as_is', 'first_only' or " +
-                            "'concatenate_content' after -alterExtract");
-                    ExtractComparer.USAGE();
-                    return;
-                }
-                alterExtract = argList.get(i+1);
-                i++;
-            }
-        }
-        if (alterExtract != null && !alterExtract.equals("as_is") &&
-                !alterExtract.equals("concatenate_content") &&
-                !alterExtract.equals("first_only")) {
-            System.out.println("Sorry, I don't understand:"+alterExtract+
-            ". The values must be one of: as_is, first_only, concatenate_content");
-            ExtractComparer.USAGE();
-            return;
-        }
-
-        //need to specify each in the commandline that goes into tika-batch
-        //if only extracts is passed to tika-batch,
-        //the crawler will see no inputDir and start crawling "input".
-        //if the user doesn't specify inputDir, crawl extractsA
-        if (inputDir == null && extractsA != null) {
-            argList.add("-inputDir");
-            argList.add(extractsA);
-        }
-
-        Path tmpBCConfig = null;
-        try {
-            tmpBCConfig = Files.createTempFile("tika-eval", ".xml");
-            if (! containsBC) {
-                try (InputStream is = this.getClass().getResourceAsStream("/tika-eval-comparison-config.xml")) {
-                    Files.copy(is, tmpBCConfig, StandardCopyOption.REPLACE_EXISTING);
-                }
-                argList.add("-bc");
-                argList.add(tmpBCConfig.toAbsolutePath().toString());
-
-            }
-            String[] updatedArgs = argList.toArray(new String[0]);
-            DefaultParser defaultCLIParser = new DefaultParser();
-            try {
-                CommandLine commandLine = defaultCLIParser.parse(ExtractComparer.OPTIONS, updatedArgs);
-                if (commandLine.hasOption("db") && commandLine.hasOption("jdbc")) {
-                    System.out.println("Please specify either the default -db or the full -jdbc, not both");
-                    ExtractComparer.USAGE();
-                    return;
-                }
-            } catch (ParseException e) {
-                System.out.println(e.getMessage()+"\n");
-                ExtractComparer.USAGE();
-                return;
-            }
-
-            FSBatchProcessCLI.main(updatedArgs);
-        } finally {
-            if (tmpBCConfig != null && Files.isRegularFile(tmpBCConfig)) {
-                Files.delete(tmpBCConfig);
-            }
-        }
-    }
-
-    private void handleReport(String[] subsetArgs) throws Exception {
-        ResultsReporter.main(subsetArgs);
-    }
-
-    public static void main(String[] args) throws Exception {
-        TikaEvalCLI cli = new TikaEvalCLI();
-        if (args.length == 0) {
-            System.err.println(specifyTools());
-            return;
-        }
-        cli.execute(args);
-    }
-}
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/XMLErrorLogUpdater.java b/tika-eval/src/main/java/org/apache/tika/eval/XMLErrorLogUpdater.java
deleted file mode 100644
index 9ce25c2..0000000
--- a/tika-eval/src/main/java/org/apache/tika/eval/XMLErrorLogUpdater.java
+++ /dev/null
@@ -1,223 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.eval;
-
-
-import javax.xml.stream.XMLInputFactory;
-import javax.xml.stream.XMLStreamConstants;
-import javax.xml.stream.XMLStreamException;
-import javax.xml.stream.XMLStreamReader;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.StringReader;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.nio.file.Paths;
-import java.sql.Connection;
-import java.sql.ResultSet;
-import java.sql.SQLException;
-import java.sql.Statement;
-
-import org.apache.commons.io.IOExceptionWithCause;
-import org.apache.log4j.Level;
-import org.apache.tika.eval.db.Cols;
-import org.apache.tika.eval.db.H2Util;
-import org.apache.tika.eval.db.JDBCUtil;
-import org.apache.tika.eval.db.TableInfo;
-import org.apache.tika.eval.io.XMLLogMsgHandler;
-import org.apache.tika.eval.io.XMLLogReader;
-import org.apache.tika.eval.reports.ResultsReporter;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-/**
- * This is a very task specific class that reads a log file and updates
- * the "comparisons" table.  It should not be run in a multithreaded environment.
- */
-public class XMLErrorLogUpdater {
-    private static final Logger LOG = LoggerFactory.getLogger(ResultsReporter.class);
-
-    private Statement statement;
-
-    public static void main(String[] args) throws Exception {
-        XMLErrorLogUpdater writer = new XMLErrorLogUpdater();
-        Path xmlLogFileA = Paths.get(args[0]);
-        Path xmlLogFileB = Paths.get(args[1]);
-        Path db = Paths.get(args[2]);
-        JDBCUtil dbUtil = new H2Util(db);
-        Connection connection = dbUtil.getConnection();
-        writer.update(connection, ExtractComparer.EXTRACT_EXCEPTION_TABLE_A, xmlLogFileA);
-        writer.update(connection, ExtractComparer.EXTRACT_EXCEPTION_TABLE_B, xmlLogFileB);
-        connection.commit();
-    }
-
-    public void update(Connection connection, TableInfo tableInfo, Path xmlLogFile) throws Exception {
-        statement = connection.createStatement();
-        XMLLogReader reader = new XMLLogReader();
-        try (InputStream is = Files.newInputStream(xmlLogFile)) {
-            reader.read(is, new ErrorMsgUpdater(tableInfo.getName()));
-        } catch (IOException e) {
-            throw new RuntimeException("Problem reading: "+xmlLogFile.toAbsolutePath().toString());
-        } finally {
-            try {
-                connection.commit();
-                statement.close();
-            } catch (SQLException e) {
-                throw new RuntimeException("Failed to close db connection!", e);
-            }
-        }
-    }
-
-    private class ErrorMsgUpdater implements XMLLogMsgHandler {
-        private final String errorTablename;
-
-        private ErrorMsgUpdater(String errorTablename) {
-            this.errorTablename = errorTablename;
-        }
-
-        @Override
-        public void handleMsg(Level level, String xml) throws SQLException, IOException {
-            if (! level.equals(Level.ERROR)) {
-                return;
-            }
-            XMLStreamReader reader = null;
-            try {
-                reader = XMLInputFactory.newInstance().createXMLStreamReader(new StringReader(xml));
-            } catch (XMLStreamException e) {
-                throw new IOExceptionWithCause(e);
-            }
-            String resourceId = null;
-            try {
-                while (reader.hasNext() && resourceId == null) {
-                    reader.next();
-                    switch (reader.getEventType()) {
-                        case XMLStreamConstants.START_ELEMENT:
-                            if ("timed_out".equals(reader.getLocalName())) {
-                                resourceId = reader.getAttributeValue("", "resourceId");
-                                update(errorTablename, resourceId,
-                                        AbstractProfiler.PARSE_ERROR_TYPE.TIMEOUT);
-
-                            } else if ("oom".equals(reader.getLocalName())) {
-                                resourceId = reader.getAttributeValue("", "resourceId");
-                                update(errorTablename, resourceId, AbstractProfiler.PARSE_ERROR_TYPE.OOM);
-                            }
-                            break;
-                    }
-                }
-                reader.close();
-            } catch (XMLStreamException e) {
-                throw new IOExceptionWithCause(e);
-            }
-        }
-
-        private void update(String errorTableName,
-                            String filePath, AbstractProfiler.PARSE_ERROR_TYPE type) throws SQLException {
-            int containerId = getContainerId(filePath);
-            String sql = "SELECT count(1) from "+errorTableName +
-                    " where "+Cols.CONTAINER_ID +
-                    " = "+containerId + " or "+
-                    Cols.FILE_PATH + "='"+filePath+"'";
-            ResultSet rs = statement.executeQuery(sql);
-
-            //now try to figure out if that file already exists
-            //in parse errors
-            int hitCount = 0;
-            while (rs.next()) {
-                hitCount = rs.getInt(1);
-            }
-
-            //if it does, update all records matching that path or container id
-            if (hitCount > 0) {
-                sql = "UPDATE " + errorTableName +
-                        " SET " + Cols.PARSE_ERROR_ID +
-                        " = " + type.ordinal() + ","+
-                        Cols.FILE_PATH + "='" +filePath+"'"+
-                        " where "+Cols.CONTAINER_ID +
-                        "="+containerId + " or "+
-                        Cols.FILE_PATH + "='" + filePath + "'";
-
-            } else {
-                //if not and container id > -1
-                //insert full record
-                if (containerId > -1) {
-                    sql = "INSERT INTO " + errorTableName +
-                            " ("+Cols.CONTAINER_ID+","+Cols.FILE_PATH +","+Cols.PARSE_ERROR_ID +")"+
-                            " values (" + containerId + ", '" + filePath + "'," +
-                            type.ordinal() + ");";
-                } else {
-                    //if container id == -1, insert only file path and parse error type id
-                    sql = "INSERT INTO " + errorTableName +
-                            " ("+Cols.FILE_PATH.name()+","+Cols.PARSE_ERROR_ID +")"+
-                            "values ('" + filePath + "'," +
-                            type.ordinal() + ");";
-                }
-
-            }
-            int updated = statement.executeUpdate(sql);
-            if (updated == 0) {
-                LOG.warn("Made no updates in XMLErrorLogUpdater for: {}", filePath);
-            } else if (updated > 1) {
-                LOG.warn("Made too many updates ({}) in XMLErrorLogUpdater for: {}", updated, filePath);
-            }
-        }
-
-        private int getContainerId(String resourceId) throws SQLException {
-            int containerId = -1;
-            String sql = "SELECT " + Cols.CONTAINER_ID.name() +
-                    " from " + ExtractProfiler.CONTAINER_TABLE.getName()+
-                    " where " + Cols.FILE_PATH +
-                    " ='"+resourceId+"'";
-            ResultSet rs = statement.executeQuery(sql);
-            int resultCount = 0;
-            while (rs.next()) {
-                containerId = rs.getInt(1);
-                resultCount++;
-            }
-            rs.close();
-
-            if (resultCount == 0) {
-                LOG.warn("Should have found a container for: {}", resourceId);
-            } else if (resultCount > 1) {
-                LOG.error("Records ids should be unique: {}", resourceId);
-            }
-/*
-            if (containerId < 0) {
-                System.err.println("CONTAINER ID < 0!!!");
-                sql = "SELECT MAX("+ Cols.CONTAINER_ID.name() +
-                        ") from "+ExtractProfiler.CONTAINER_TABLE.getName();
-                rs = statement.executeQuery(sql);
-                while (rs.next()) {
-                    containerId = rs.getInt(1);
-                }
-                rs.close();
-                if (containerId < 0) {
-                    //log and abort
-                    //return -1?
-                } else {
-                    containerId++;
-                }
-
-            }*/
-            return containerId;
-        }
-
-
-    }
-
-}
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/batch/DBConsumersManager.java b/tika-eval/src/main/java/org/apache/tika/eval/batch/DBConsumersManager.java
deleted file mode 100644
index 2e3dfca..0000000
--- a/tika-eval/src/main/java/org/apache/tika/eval/batch/DBConsumersManager.java
+++ /dev/null
@@ -1,103 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.eval.batch;
-
-import java.io.IOException;
-import java.nio.file.Path;
-import java.sql.Connection;
-import java.sql.SQLException;
-import java.util.ArrayList;
-import java.util.List;
-
-import org.apache.tika.batch.ConsumersManager;
-import org.apache.tika.batch.FileResourceConsumer;
-import org.apache.tika.eval.AbstractProfiler;
-import org.apache.tika.eval.XMLErrorLogUpdater;
-import org.apache.tika.eval.db.JDBCUtil;
-import org.apache.tika.eval.db.MimeBuffer;
-import org.apache.tika.eval.db.TableInfo;
-
-
-public class DBConsumersManager extends ConsumersManager {
-
-    private final Connection conn;
-    private final MimeBuffer mimeBuffer;
-    private final List<LogTablePair> errorLogs = new ArrayList<>();
-
-    public DBConsumersManager(JDBCUtil dbUtil, MimeBuffer mimeBuffer, List<FileResourceConsumer> consumers)
-            throws SQLException {
-        super(consumers);
-        this.conn = dbUtil.getConnection();
-        this.mimeBuffer = mimeBuffer;
-    }
-
-
-    @Override
-    public void shutdown() {
-
-        for (FileResourceConsumer consumer : getConsumers()) {
-            if (consumer instanceof AbstractProfiler) {
-                try{
-                    ((AbstractProfiler)consumer).closeWriter();
-                } catch (IOException e) {
-                    throw new RuntimeException(e);
-                }
-            }
-        }
-
-        try {
-            mimeBuffer.close();
-        } catch (SQLException e) {
-            throw new RuntimeException(e);
-        }
-
-        //MUST HAPPEN AFTER consumers have closed and
-        //committed container information!!!
-        XMLErrorLogUpdater up = new XMLErrorLogUpdater();
-        for (LogTablePair p : errorLogs) {
-            try {
-                up.update(conn, p.tableInfo, p.log);
-            } catch (Exception e) {
-                throw new RuntimeException(e);
-            }
-        }
-
-
-        try {
-            conn.commit();
-        } catch (SQLException e) {
-            throw new RuntimeException(e);
-        }
-        try {
-            conn.close();
-        } catch (SQLException e) {
-            throw new RuntimeException(e);
-        }
-    }
-
-    public void addErrorLogTablePair(Path log, TableInfo tableInfo) {
-        LogTablePair p = new LogTablePair();
-        p.log = log;
-        p.tableInfo = tableInfo;
-        errorLogs.add(p);
-    }
-
-    class LogTablePair {
-        Path log;
-        TableInfo tableInfo;
-    }
-}
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/batch/EvalConsumerBuilder.java b/tika-eval/src/main/java/org/apache/tika/eval/batch/EvalConsumerBuilder.java
deleted file mode 100644
index 9953973..0000000
--- a/tika-eval/src/main/java/org/apache/tika/eval/batch/EvalConsumerBuilder.java
+++ /dev/null
@@ -1,240 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.tika.eval.batch;
-
-import java.io.IOException;
-import java.sql.Connection;
-import java.sql.ResultSet;
-import java.sql.SQLException;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-import java.util.concurrent.ArrayBlockingQueue;
-import java.util.concurrent.atomic.AtomicInteger;
-
-import org.apache.tika.batch.FileResource;
-import org.apache.tika.batch.FileResourceConsumer;
-import org.apache.tika.config.TikaConfig;
-import org.apache.tika.eval.AbstractProfiler;
-import org.apache.tika.eval.db.Cols;
-import org.apache.tika.eval.db.JDBCUtil;
-import org.apache.tika.eval.db.MimeBuffer;
-import org.apache.tika.eval.db.TableInfo;
-import org.apache.tika.eval.io.DBWriter;
-import org.apache.tika.eval.io.ExtractReader;
-import org.apache.tika.eval.io.ExtractReaderException;
-import org.apache.tika.eval.io.IDBWriter;
-import org.apache.tika.util.PropsUtil;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-public abstract class EvalConsumerBuilder {
-
-    private static final Logger LOG = LoggerFactory.getLogger(EvalConsumerBuilder.class);
-
-    protected ArrayBlockingQueue<FileResource> queue;
-    Map<String, String> localAttrs;
-    JDBCUtil dbUtil;
-    private MimeBuffer mimeBuffer;
-    AtomicInteger initialized = new AtomicInteger(0);
-
-    public MimeBuffer init(ArrayBlockingQueue<FileResource> queue, Map<String, String> localAttrs,
-                     JDBCUtil dbUtil, boolean forceDrop) throws IOException, SQLException {
-        if (initialized.getAndIncrement() > 0) {
-            throw new RuntimeException("Can only init a consumer builder once!");
-        }
-        this.queue = queue;
-        this.localAttrs = localAttrs;
-        this.dbUtil = dbUtil;
-        //the order of the following is critical
-        //step 1. update the table names with prefixes
-        updateTableInfosWithPrefixes(localAttrs);
-
-        JDBCUtil.CREATE_TABLE createRegularTable = (forceDrop) ? JDBCUtil.CREATE_TABLE.DROP_IF_EXISTS :
-                JDBCUtil.CREATE_TABLE.THROW_EX_IF_EXISTS;
-
-        JDBCUtil.CREATE_TABLE createRefTable = (forceDrop) ? JDBCUtil.CREATE_TABLE.DROP_IF_EXISTS :
-                JDBCUtil.CREATE_TABLE.SKIP_IF_EXISTS;
-
-        //step 2. create the tables
-        dbUtil.createTables(getNonRefTableInfos(), createRegularTable);
-        dbUtil.createTables(getRefTableInfos(), createRefTable);
-
-        //step 3. create mime buffer
-        this.mimeBuffer = new MimeBuffer(dbUtil.getConnection(), getMimeTable(),
-                TikaConfig.getDefaultConfig());
-
-        //step 4. populate the reference tables
-        populateRefTables();
-
-        return mimeBuffer;
-    }
-
-    public abstract FileResourceConsumer build() throws IOException, SQLException;
-
-    protected abstract void updateTableInfosWithPrefixes(Map<String, String> attrs);
-
-    /**
-     *
-     * @return only the ref tables
-     */
-    protected abstract List<TableInfo> getRefTableInfos();
-
-    /**
-     *
-     * @return the main tables, not including the ref tables
-     */
-    protected abstract List<TableInfo> getNonRefTableInfos();
-
-    protected abstract TableInfo getMimeTable();
-
-    protected abstract void addErrorLogTablePairs(DBConsumersManager manager);
-
-    public void populateRefTables() throws IOException, SQLException {
-        boolean refTablesPopulated = true;
-        try{
-            Connection connection = dbUtil.getConnection();
-            for (TableInfo tableInfo : getRefTableInfos()) {
-                int rows = 0;
-                try (ResultSet rs = connection.createStatement().executeQuery("select * from "+
-                        tableInfo.getName())) {
-                    while (rs.next()) {
-                        rows++;
-                    }
-                }
-                if (rows == 0) {
-                    refTablesPopulated = false;
-                    break;
-                }
-
-            }
-        } catch (SQLException e) {
-            //tables may not exist yet; treat the ref tables as unpopulated and populate them below
-        }
-        if (refTablesPopulated) {
-            LOG.info("ref tables are already populated");
-            return;
-        }
-
-        IDBWriter writer = getDBWriter(getRefTableInfos());
-        Map<Cols, String> m = new HashMap<>();
-        for (AbstractProfiler.PARSE_ERROR_TYPE t : AbstractProfiler.PARSE_ERROR_TYPE.values()) {
-            m.clear();
-            m.put(Cols.PARSE_ERROR_ID, Integer.toString(t.ordinal()));
-            m.put(Cols.PARSE_ERROR_DESCRIPTION, t.name());
-            writer.writeRow(AbstractProfiler.REF_PARSE_ERROR_TYPES, m);
-        }
-
-        for (AbstractProfiler.EXCEPTION_TYPE t : AbstractProfiler.EXCEPTION_TYPE.values()) {
-            m.clear();
-            m.put(Cols.PARSE_EXCEPTION_ID, Integer.toString(t.ordinal()));
-            m.put(Cols.PARSE_EXCEPTION_DESCRIPTION, t.name());
-            writer.writeRow(AbstractProfiler.REF_PARSE_EXCEPTION_TYPES, m);
-        }
-
-        for (ExtractReaderException.TYPE t :
-                ExtractReaderException.TYPE.values()) {
-            m.clear();
-            m.put(Cols.EXTRACT_EXCEPTION_ID, Integer.toString(t.ordinal()));
-            m.put(Cols.EXTRACT_EXCEPTION_DESCRIPTION, t.name());
-            writer.writeRow(AbstractProfiler.REF_EXTRACT_EXCEPTION_TYPES, m);
-        }
-        writer.close();
-    }
-
-    protected IDBWriter getDBWriter(List<TableInfo> tableInfos) throws IOException, SQLException {
-        Connection conn = dbUtil.getConnection();
-        return new DBWriter(conn, tableInfos, dbUtil, mimeBuffer);
-    }
-
-    ExtractReader.ALTER_METADATA_LIST getAlterMetadata(Map<String, String> localAttrs) {
-
-        String alterExtractString = localAttrs.get("alterExtract");
-        ExtractReader.ALTER_METADATA_LIST alterExtractList;
-        if (alterExtractString == null || alterExtractString.equalsIgnoreCase("as_is")) {
-            alterExtractList = ExtractReader.ALTER_METADATA_LIST.AS_IS;
-        } else if (alterExtractString.equalsIgnoreCase("first_only")) {
-            alterExtractList = ExtractReader.ALTER_METADATA_LIST.FIRST_ONLY;
-        } else if (alterExtractString.equalsIgnoreCase("concatenate_content")) {
-            alterExtractList = ExtractReader.ALTER_METADATA_LIST.CONCATENATE_CONTENT_INTO_FIRST;
-        } else {
-            throw new RuntimeException("options for alterExtract: as_is, first_only, concatenate_content." +
-                    " I don't understand:" + alterExtractString);
-        }
-        return alterExtractList;
-    }
-
-    protected ExtractReader buildExtractReader(Map<String, String> localAttrs) {
-        long minExtractLength = PropsUtil.getLong(localAttrs.get("minExtractLength"), -1L);
-        long maxExtractLength = PropsUtil.getLong(localAttrs.get("maxExtractLength"), -1L);
-
-        ExtractReader.ALTER_METADATA_LIST alterExtractList = getAlterMetadata(localAttrs);
-        return new ExtractReader(alterExtractList, minExtractLength, maxExtractLength);
-    }
-
-    FileResourceConsumer parameterizeProfiler(AbstractProfiler abstractProfiler) {
-
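-        //-2 is the "not specified" sentinel; only override the profiler's
-        //defaults when a property was actually set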
-        int maxContentLength = PropsUtil.getInt(localAttrs.get("maxContentLength"), -2);
-        if (maxContentLength > -2) {
-            abstractProfiler.setMaxContentLength(maxContentLength);
-        }
-
-        int maxContentLengthForLangId = PropsUtil.getInt(localAttrs.get("maxContentLengthForLangId"), -2);
-        if (maxContentLengthForLangId > -2) {
-            abstractProfiler.setMaxContentLengthForLangId(maxContentLengthForLangId);
-        }
-
-        int maxTokens = PropsUtil.getInt(localAttrs.get("maxTokens"), -2);
-        if (maxTokens > -2) {
-            abstractProfiler.setMaxTokens(maxTokens);
-        }
-
-
-        return abstractProfiler;
-    }
-
-
-/*
-    public abstract Map<String, String> getIndexInfo();
-
-    class ValueComparator implements Comparator<String> {
-
-        Map<String, ColInfo> map;
-
-        public ValueComparator(Map<String, ColInfo> base) {
-            this.map = base;
-        }
-
-        public int compare(String a, String b) {
-            Integer aVal = map.get(a).getDBColOffset();
-            Integer bVal = map.get(b).getDBColOffset();
-            if (aVal == null || bVal == null) {
-                throw new IllegalArgumentException("Column offset must be specified!");
-            }
-            if (aVal == bVal && ! map.get(a).equals(map.get(b))) {
-                throw new IllegalArgumentException("Column offsets must be unique: " + a + " and " + b + " both have: "+aVal);
-            }
-            if (aVal < bVal) {
-                return -1;
-            } else {
-                return 1;
-            }
-        }
-    }
-*/
-}
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/batch/EvalConsumersBuilder.java b/tika-eval/src/main/java/org/apache/tika/eval/batch/EvalConsumersBuilder.java
deleted file mode 100644
index 62cf348..0000000
--- a/tika-eval/src/main/java/org/apache/tika/eval/batch/EvalConsumersBuilder.java
+++ /dev/null
@@ -1,131 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.eval.batch;
-
-
-import java.io.IOException;
-import java.nio.file.Path;
-import java.nio.file.Paths;
-import java.sql.SQLException;
-import java.util.LinkedList;
-import java.util.List;
-import java.util.Map;
-import java.util.concurrent.ArrayBlockingQueue;
-
-import org.apache.tika.batch.ConsumersManager;
-import org.apache.tika.batch.FileResource;
-import org.apache.tika.batch.FileResourceConsumer;
-import org.apache.tika.batch.builders.AbstractConsumersBuilder;
-import org.apache.tika.batch.builders.BatchProcessBuilder;
-import org.apache.tika.eval.AbstractProfiler;
-import org.apache.tika.eval.db.H2Util;
-import org.apache.tika.eval.db.JDBCUtil;
-import org.apache.tika.eval.db.MimeBuffer;
-import org.apache.tika.eval.langid.LanguageIDWrapper;
-import org.apache.tika.util.ClassLoaderUtil;
-import org.apache.tika.util.PropsUtil;
-import org.apache.tika.util.XMLDOMUtil;
-import org.w3c.dom.Node;
-
-public class EvalConsumersBuilder extends AbstractConsumersBuilder {
-
-    @Override
-    public ConsumersManager build(Node node, Map<String, String> runtimeAttributes,
-                                  ArrayBlockingQueue<FileResource> queue) {
-
-        List<FileResourceConsumer> consumers = new LinkedList<>();
-        int numConsumers = BatchProcessBuilder.getNumConsumers(runtimeAttributes);
-
-        Map<String, String> localAttrs = XMLDOMUtil.mapifyAttrs(node, runtimeAttributes);
-
-
-        Path db = getPath(localAttrs, "db");
-        String jdbcConnectionString = localAttrs.get("jdbc");
-
-
-        Path commonTokens = getPath(localAttrs, "commonTokens");
-        String defaultLangCode = localAttrs.get("defaultLangCode");
-        if (defaultLangCode == null) {
-            defaultLangCode = "";
-        }
-        //can be null, in which case will load from memory
-        try {
-            AbstractProfiler.loadCommonTokens(commonTokens, defaultLangCode);
-        } catch (IOException e) {
-            throw new RuntimeException(e);
-        }
-
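-        //prefer a local H2 file (-db) if specified; otherwise fall back to a
-        //full -jdbc connection string; fail if neither was supplied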
-        JDBCUtil jdbcUtil = null;
-        if (db != null) {
-            jdbcUtil = new H2Util(db);
-        } else if (jdbcConnectionString != null) {
-            jdbcUtil = new JDBCUtil(jdbcConnectionString, localAttrs.get("jdbcDriver"));
-        } else {
-            throw new RuntimeException("Must specify: -db or -jdbc");
-        }
-        EvalConsumerBuilder consumerBuilder = ClassLoaderUtil.buildClass(EvalConsumerBuilder.class,
-                PropsUtil.getString(localAttrs.get("consumerBuilderClass"), null));
-        if (consumerBuilder == null) {
-            throw new RuntimeException("Must specify consumerBuilderClass in config file");
-        }
-
-        boolean forceDrop = PropsUtil.getBoolean(localAttrs.get("drop"), false);
-        MimeBuffer mimeBuffer = null;
-        try {
-            mimeBuffer = consumerBuilder.init(queue, localAttrs, jdbcUtil, forceDrop);
-        } catch (IOException | SQLException e) {
-            throw new RuntimeException(e);
-        }
-
-        for (int i = 0; i < numConsumers; i++) {
-            try {
-                consumers.add(consumerBuilder.build());
-            } catch (IOException | SQLException e) {
-                throw new RuntimeException(e);
-            }
-        }
-
-        DBConsumersManager manager;
-        try {
-            manager = new DBConsumersManager(jdbcUtil, mimeBuffer, consumers);
-        } catch (SQLException e) {
-            throw new RuntimeException(e);
-        }
-        consumerBuilder.addErrorLogTablePairs(manager);
-
-        return manager;
-    }
-
-    private Path getNonNullPath(Map<String, String> attrs, String key) {
-        Path p = getPath(attrs, key);
-        if (p == null) {
-            throw new RuntimeException("Must specify a file for this attribute: "+key);
-        }
-        return p;
-    }
-
-
-    protected Path getPath(Map<String, String> attrs, String key) {
-        String filePath = attrs.get(key);
-        if (filePath == null) {
-            return null;
-        }
-        return Paths.get(filePath);
-    }
-
-
-}
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/batch/ExtractComparerBuilder.java b/tika-eval/src/main/java/org/apache/tika/eval/batch/ExtractComparerBuilder.java
deleted file mode 100644
index 0ae893f..0000000
--- a/tika-eval/src/main/java/org/apache/tika/eval/batch/ExtractComparerBuilder.java
+++ /dev/null
@@ -1,165 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.eval.batch;
-
-
-import java.io.IOException;
-import java.nio.file.Path;
-import java.sql.SQLException;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.List;
-import java.util.Map;
-
-import org.apache.tika.batch.FileResourceConsumer;
-import org.apache.tika.eval.AbstractProfiler;
-import org.apache.tika.eval.ExtractComparer;
-import org.apache.tika.eval.db.TableInfo;
-import org.apache.tika.util.PropsUtil;
-
-public class ExtractComparerBuilder extends EvalConsumerBuilder {
-    public final static String TABLE_PREFIX_A_KEY = "tablePrefixA";
-    public final static String TABLE_PREFIX_B_KEY = "tablePrefixB";
-
-    private final List<TableInfo> tableInfosA;
-    private final List<TableInfo> tableInfosB;
-    private final List<TableInfo> tableInfosAandB;
-    private final List<TableInfo> refTableInfos;
-
-    public ExtractComparerBuilder() {
-        List<TableInfo> tableInfosA = new ArrayList<>();
-        List<TableInfo> tableInfosB = new ArrayList<>();
-        List<TableInfo> tableInfosAandB = new ArrayList<>();
-        tableInfosA.add(ExtractComparer.PROFILES_A);
-        tableInfosA.add(ExtractComparer.EXCEPTION_TABLE_A);
-        tableInfosA.add(ExtractComparer.TAGS_TABLE_A);
-        tableInfosA.add(ExtractComparer.CONTENTS_TABLE_A);
-        tableInfosA.add(ExtractComparer.EXTRACT_EXCEPTION_TABLE_A);
-        tableInfosA.add(ExtractComparer.EMBEDDED_FILE_PATH_TABLE_A);
-
-        tableInfosB.add(ExtractComparer.PROFILES_B);
-        tableInfosB.add(ExtractComparer.EXCEPTION_TABLE_B);
-        tableInfosB.add(ExtractComparer.EXTRACT_EXCEPTION_TABLE_B);
-        tableInfosB.add(ExtractComparer.TAGS_TABLE_B);
-        tableInfosB.add(ExtractComparer.CONTENTS_TABLE_B);
-        tableInfosB.add(ExtractComparer.EMBEDDED_FILE_PATH_TABLE_B);
-
-        tableInfosAandB.add(ExtractComparer.COMPARISON_CONTAINERS);
-        tableInfosAandB.add(ExtractComparer.CONTENT_COMPARISONS);
-        tableInfosAandB.add(AbstractProfiler.MIME_TABLE);
-
-        List<TableInfo> refTableInfos = new ArrayList<>();
-        refTableInfos.add(ExtractComparer.REF_PAIR_NAMES);
-        refTableInfos.add(AbstractProfiler.REF_PARSE_ERROR_TYPES);
-        refTableInfos.add(AbstractProfiler.REF_PARSE_EXCEPTION_TYPES);
-        refTableInfos.add(AbstractProfiler.REF_EXTRACT_EXCEPTION_TYPES);
-
-        this.tableInfosA = Collections.unmodifiableList(tableInfosA);
-        this.tableInfosB = Collections.unmodifiableList(tableInfosB);
-        this.tableInfosAandB = Collections.unmodifiableList(tableInfosAandB);
-        this.refTableInfos = Collections.unmodifiableList(refTableInfos);
-    }
-
-    @Override
-    public FileResourceConsumer build() throws IOException, SQLException {
-        Path extractsA = PropsUtil.getPath(localAttrs.get("extractsA"), null);
-        if (extractsA == null) {
-            throw new RuntimeException("Must specify \"extractsA\" -- directory for 'A' extracts");
-        }
-        Path extractsB = PropsUtil.getPath(localAttrs.get("extractsB"), null);
-        if (extractsB == null) {
-            throw new RuntimeException("Must specify \"extractsB\" -- directory for 'B' extracts");
-        }
-
-        Path inputRootDir = PropsUtil.getPath(localAttrs.get("inputDir"), null);
-
-        if (inputRootDir == null) {
-            //this is for the sake of the crawler
-            throw new RuntimeException("Must specify an -inputDir");
-        }
-
-        return parameterizeProfiler(new ExtractComparer(queue, inputRootDir, extractsA, extractsB,
-                buildExtractReader(localAttrs),
-                getDBWriter(getNonRefTableInfos())));
-    }
-
-
-    @Override
-    protected void updateTableInfosWithPrefixes(Map<String, String> attrs) {
-        String tablePrefixA = localAttrs.get(TABLE_PREFIX_A_KEY);
-
-        String tablePrefixB = localAttrs.get(TABLE_PREFIX_B_KEY);
-
-        tablePrefixA = (tablePrefixA == null || tablePrefixA.endsWith("_")) ? tablePrefixA : tablePrefixA+"_";
-        tablePrefixB = (tablePrefixB == null || tablePrefixB.endsWith("_")) ? tablePrefixB : tablePrefixB+"_";
-
-        if (tablePrefixA != null) {
-            for (TableInfo tableInfo : tableInfosA) {
-                tableInfo.setNamePrefix(tablePrefixA);
-            }
-        }
-
-        if (tablePrefixB != null) {
-            for (TableInfo tableInfo : tableInfosB) {
-                tableInfo.setNamePrefix(tablePrefixB);
-            }
-        }
-
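-        //tables shared by runs A and B get the concatenation of both prefixes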
-        if (tablePrefixA != null || tablePrefixB != null) {
-            String aAndB = (tablePrefixA == null) ? "" : tablePrefixA;
-            aAndB = (tablePrefixB == null) ? aAndB : aAndB+tablePrefixB;
-            for (TableInfo tableInfo : tableInfosAandB) {
-                tableInfo.setNamePrefix(aAndB);
-            }
-        }
-    }
-
-    @Override
-    protected List<TableInfo> getRefTableInfos() {
-        return refTableInfos;
-    }
-
-    @Override
-    protected List<TableInfo> getNonRefTableInfos() {
-        List<TableInfo> allNonRefTables = new ArrayList<>();
-        allNonRefTables.addAll(tableInfosA);
-        allNonRefTables.addAll(tableInfosB);
-        allNonRefTables.addAll(tableInfosAandB);
-        return Collections.unmodifiableList(allNonRefTables);
-    }
-
-    @Override
-    protected TableInfo getMimeTable() {
-        return AbstractProfiler.MIME_TABLE;
-    }
-
-    @Override
-    protected void addErrorLogTablePairs(DBConsumersManager manager) {
-        Path errorLogA = PropsUtil.getPath(localAttrs.get("errorLogFileA"), null);
-        if (errorLogA == null) {
-            return;
-        }
-        manager.addErrorLogTablePair(errorLogA, ExtractComparer.EXTRACT_EXCEPTION_TABLE_A);
-        Path errorLogB = PropsUtil.getPath(localAttrs.get("errorLogFileB"), null);
-        if (errorLogB == null) {
-            return;
-        }
-        manager.addErrorLogTablePair(errorLogB, ExtractComparer.EXTRACT_EXCEPTION_TABLE_B);
-
-    }
-
-}
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/batch/ExtractProfilerBuilder.java b/tika-eval/src/main/java/org/apache/tika/eval/batch/ExtractProfilerBuilder.java
deleted file mode 100644
index 20efbf7..0000000
--- a/tika-eval/src/main/java/org/apache/tika/eval/batch/ExtractProfilerBuilder.java
+++ /dev/null
@@ -1,122 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.eval.batch;
-
-import java.io.IOException;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.sql.SQLException;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.List;
-import java.util.Map;
-
-import org.apache.tika.batch.FileResourceConsumer;
-import org.apache.tika.eval.AbstractProfiler;
-import org.apache.tika.eval.ExtractProfiler;
-import org.apache.tika.eval.db.TableInfo;
-import org.apache.tika.util.PropsUtil;
-
-
-public class ExtractProfilerBuilder extends EvalConsumerBuilder {
-
-    public final static String TABLE_PREFIX_KEY = "tablePrefix";
-
-    private final List<TableInfo> tableInfos;
-    private final List<TableInfo> refTableInfos;
-    public ExtractProfilerBuilder() {
-        List<TableInfo> tableInfos = new ArrayList<>();
-        tableInfos.add(AbstractProfiler.MIME_TABLE);
-        tableInfos.add(ExtractProfiler.CONTAINER_TABLE);
-        tableInfos.add(ExtractProfiler.PROFILE_TABLE);
-        tableInfos.add(ExtractProfiler.EXTRACT_EXCEPTION_TABLE);
-        tableInfos.add(ExtractProfiler.EXCEPTION_TABLE);
-        tableInfos.add(ExtractProfiler.CONTENTS_TABLE);
-        tableInfos.add(ExtractProfiler.TAGS_TABLE);
-        tableInfos.add(ExtractProfiler.EMBEDDED_FILE_PATH_TABLE);
-        this.tableInfos = Collections.unmodifiableList(tableInfos);
-
-        List<TableInfo> refTableInfos = new ArrayList<>();
-        refTableInfos.add(AbstractProfiler.REF_PARSE_ERROR_TYPES);
-        refTableInfos.add(AbstractProfiler.REF_PARSE_EXCEPTION_TYPES);
-        refTableInfos.add(AbstractProfiler.REF_EXTRACT_EXCEPTION_TYPES);
-        this.refTableInfos = Collections.unmodifiableList(refTableInfos);
-    }
-
-    @Override
-    public FileResourceConsumer build() throws IOException, SQLException {
-        Path extracts = PropsUtil.getPath(localAttrs.get("extracts"), null);
-        if (extracts == null) {
-            throw new RuntimeException("Must specify \"extracts\" -- directory to crawl");
-        }
-        if (!Files.isDirectory(extracts)) {
-            throw new RuntimeException("ROOT DIRECTORY DOES NOT EXIST: " +
-                    extracts.toAbsolutePath());
-        }
-
-        Path inputDir = PropsUtil.getPath(localAttrs.get("inputDir"), null);
-
-        //we _could_ set this to extracts (if not null)
-        //here, but the Crawler defaults to "input" if nothing is passed
-        //so this won't work
-        if (inputDir == null) {
-            throw new RuntimeException("Must specify -inputDir");
-        }
-        return parameterizeProfiler(new ExtractProfiler(queue, inputDir, extracts,
-                buildExtractReader(localAttrs),
-                getDBWriter(tableInfos)));
-    }
-
-
-    @Override
-    protected void updateTableInfosWithPrefixes(Map<String, String> attrs) {
-        String tableNamePrefix = attrs.get(TABLE_PREFIX_KEY);
-        if (tableNamePrefix != null && !tableNamePrefix.equals("null")) {
-            for (TableInfo tableInfo : tableInfos) {
-                tableInfo.setNamePrefix(tableNamePrefix);
-            }
-        }
-    }
-
-
-    @Override
-    protected List<TableInfo> getRefTableInfos() {
-        return refTableInfos;
-    }
-
-    @Override
-    protected List<TableInfo> getNonRefTableInfos() {
-        return tableInfos;
-    }
-
-    @Override
-    protected TableInfo getMimeTable() {
-        return AbstractProfiler.MIME_TABLE;
-    }
-
-    @Override
-    protected void addErrorLogTablePairs(DBConsumersManager manager) {
-        Path errorLog = PropsUtil.getPath(localAttrs.get("errorLogFile"), null);
-        if (errorLog == null) {
-            return;
-        }
-        manager.addErrorLogTablePair(errorLog, ExtractProfiler.EXTRACT_EXCEPTION_TABLE);
-    }
-}
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/batch/FileProfilerBuilder.java b/tika-eval/src/main/java/org/apache/tika/eval/batch/FileProfilerBuilder.java
deleted file mode 100644
index a54dd55..0000000
--- a/tika-eval/src/main/java/org/apache/tika/eval/batch/FileProfilerBuilder.java
+++ /dev/null
@@ -1,97 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.eval.batch;
-
-import org.apache.tika.batch.FileResourceConsumer;
-import org.apache.tika.eval.ExtractProfiler;
-import org.apache.tika.eval.FileProfiler;
-import org.apache.tika.eval.db.TableInfo;
-import org.apache.tika.util.PropsUtil;
-
-import java.io.IOException;
-import java.nio.file.Path;
-import java.sql.SQLException;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.List;
-import java.util.Map;
-
-
-public class FileProfilerBuilder extends EvalConsumerBuilder {
-
-    public final static String TABLE_PREFIX_KEY = "tablePrefix";
-
-    private final List<TableInfo> tableInfos;
-
-    public FileProfilerBuilder() {
-        List<TableInfo> tableInfos = new ArrayList<>();
-        tableInfos.add(FileProfiler.FILE_MIME_TABLE);
-        tableInfos.add(FileProfiler.FILE_PROFILES);
-        this.tableInfos = Collections.unmodifiableList(tableInfos);
-
-    }
-
-    @Override
-    public FileResourceConsumer build() throws IOException, SQLException {
-
-        Path inputDir = PropsUtil.getPath(localAttrs.get("inputDir"), null);
-
-        //we _could_ set this to extracts (if not null)
-        //here, but the Crawler defaults to "input" if nothing is passed
-        //so this won't work
-        if (inputDir == null) {
-            throw new RuntimeException("Must specify -inputDir");
-        }
-        return parameterizeProfiler(new FileProfiler(queue, inputDir,
-                getDBWriter(tableInfos)));
-    }
-
-
-    @Override
-    protected void updateTableInfosWithPrefixes(Map<String, String> attrs) {
-        String tableNamePrefix = attrs.get(TABLE_PREFIX_KEY);
-        if (tableNamePrefix != null && !tableNamePrefix.equals("null")) {
-            for (TableInfo tableInfo : tableInfos) {
-                tableInfo.setNamePrefix(tableNamePrefix);
-            }
-        }
-    }
-
-    @Override
-    protected List<TableInfo> getRefTableInfos() {
-        return Collections.emptyList();
-    }
-
-    @Override
-    protected List<TableInfo> getNonRefTableInfos() {
-        return tableInfos;
-    }
-
-    @Override
-    protected TableInfo getMimeTable() {
-        return FileProfiler.FILE_MIME_TABLE;
-    }
-
-    @Override
-    protected void addErrorLogTablePairs(DBConsumersManager manager) {
-        Path errorLog = PropsUtil.getPath(localAttrs.get("errorLogFile"), null);
-        if (errorLog == null) {
-            return;
-        }
-        manager.addErrorLogTablePair(errorLog, ExtractProfiler.EXTRACT_EXCEPTION_TABLE);
-    }
-}
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/db/AbstractDBBuffer.java b/tika-eval/src/main/java/org/apache/tika/eval/db/AbstractDBBuffer.java
deleted file mode 100644
index baa7994..0000000
--- a/tika-eval/src/main/java/org/apache/tika/eval/db/AbstractDBBuffer.java
+++ /dev/null
@@ -1,77 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.eval.db;
-
-import java.sql.SQLException;
-import java.util.HashMap;
-import java.util.Map;
-import java.util.concurrent.locks.Lock;
-import java.util.concurrent.locks.ReentrantReadWriteLock;
-
-
-/**
- * Abstract buffer for a map of values and unique ids.
- * <p>
- * Use this for fast in-memory lookups of smallish sets of values.
- *
- */
-abstract class AbstractDBBuffer {
-
-    private final Map<String, Integer> m = new HashMap<>();
-    private final ReentrantReadWriteLock rwl = new ReentrantReadWriteLock();
-    private final Lock r = rwl.readLock();
-    private final Lock w = rwl.writeLock();
-
-    private int numWrites = 0;
-
-    public int getId(String key) {
-        r.lock();
-        try {
-            Integer v = m.get(key);
-            if (v != null) {
-                return v;
-            }
-        } finally {
-            r.unlock();
-        }
-
-        w.lock();
-        try {
-            Integer v = m.get(key);
-            if (v != null) {
-                return v;
-            }
-            v = m.size()+1;
-            m.put(key, v);
-            write(v, key);
-            numWrites++;
-            return v;
-        } finally {
-            w.unlock();
-        }
-    }
-
-    public int getNumWrites() {
-        return numWrites;
-    }
-
-    //Odd to throw RuntimeException, I know.  It should be
-    //catastrophic if this buffer can't write to the db.
-    public abstract void write(int id, String value) throws RuntimeException;
-
-    public abstract void close() throws SQLException;
-}
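
AbstractDBBuffer is a read-through cache: getId() first tries the map under the read lock and only takes the write lock (assigning the next id and calling write()) for keys it has never seen. A minimal sketch of that contract, using a hypothetical no-op subclass in the same package (the class is package-private):

    package org.apache.tika.eval.db;

    //hypothetical subclass used only to illustrate AbstractDBBuffer's caching contract
    class NoOpDBBuffer extends AbstractDBBuffer {

        @Override
        public void write(int id, String value) throws RuntimeException {
            //a real subclass (DBBuffer, MimeBuffer) inserts (id, value) into a lookup table
        }

        @Override
        public void close() {
            //nothing to release in this sketch
        }

        public static void main(String[] args) {
            NoOpDBBuffer buffer = new NoOpDBBuffer();
            int first = buffer.getId("application/pdf");   //assigns id 1 and calls write()
            int second = buffer.getId("application/pdf");  //served from the map, no second write()
            System.out.println(first == second);            //true
            System.out.println(buffer.getNumWrites());      //1
        }
    }
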
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/db/ColInfo.java b/tika-eval/src/main/java/org/apache/tika/eval/db/ColInfo.java
deleted file mode 100644
index 369cb0a..0000000
--- a/tika-eval/src/main/java/org/apache/tika/eval/db/ColInfo.java
+++ /dev/null
@@ -1,127 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.eval.db;
-
-
-import java.sql.Types;
-import java.util.Objects;
-
-public class ColInfo {
-    private final Cols name;
-    private final int type;
-    private final Integer precision;
-    private final String constraints;
-
-    public ColInfo(Cols name, int type) {
-        this(name, type, null, null);
-    }
-
-    public ColInfo(Cols name, int type, String constraints) {
-        this(name, type, null, constraints);
-    }
-
-    public ColInfo(Cols name, int type, Integer precision) {
-        this(name, type, precision, null);
-    }
-
-
-    public ColInfo(Cols name, int type, Integer precision, String constraints) {
-        this.name = name;
-        this.type = type;
-        this.precision = precision;
-        this.constraints = constraints;
-    }
-
-    public int getType() {
-        return type;
-    }
-
-    public Cols getName() {
-        return name;
-    }
-    /**
-     *
-     * @return constraints string or null
-     */
-    public String getConstraints() {
-        return constraints;
-    }
-
-    /**
-     * Gets the precision.  This can be null!
-     * @return precision or null
-     */
-    public Integer getPrecision() {
-        return precision;
-    }
-
-    public String getSqlDef() {
-        if (type == Types.VARCHAR){
-            return "VARCHAR("+precision+")";
-        } else if (type == Types.CHAR) {
-            return "CHAR("+precision+")";
-        }
-        switch (type) {
-            case Types.FLOAT :
-                return "FLOAT";
-            case Types.DOUBLE :
-                return "DOUBLE";
-            case Types.BLOB :
-                return "BLOB";
-            case Types.INTEGER :
-                return "INTEGER";
-            case Types.BIGINT :
-                return "BIGINT";
-            case Types.BOOLEAN :
-                return "BOOLEAN";
-        }
-        throw new UnsupportedOperationException("Don't yet recognize a type for: "+type);
-    }
-
-    @Override
-    public boolean equals(Object o) {
-        if (this == o) {
-            return true;
-        }
-        if (o == null || getClass() != o.getClass()) {
-            return false;
-        }
-
-        ColInfo colInfo = (ColInfo) o;
-
-        if (type != colInfo.type) {
-            return false;
-        }
-        if (name != colInfo.name) {
-            return false;
-        }
-        if (!Objects.equals(precision, colInfo.precision)) {
-            return false;
-        }
-        return Objects.equals(constraints, colInfo.constraints);
-
-    }
-
-    @Override
-    public int hashCode() {
-        int result = name != null ? name.hashCode() : 0;
-        result = 31 * result + type;
-        result = 31 * result + (precision != null ? precision.hashCode() : 0);
-        result = 31 * result + (constraints != null ? constraints.hashCode() : 0);
-        return result;
-    }
-}
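
ColInfo.getSqlDef() is what turns a column definition into the DDL fragment used when tables are created; a small sketch of the mapping (the column choices here are illustrative only):

    import java.sql.Types;

    import org.apache.tika.eval.db.ColInfo;
    import org.apache.tika.eval.db.Cols;

    public class ColInfoExample {
        public static void main(String[] args) {
            //VARCHAR and CHAR fold the precision into the type definition
            ColInfo path = new ColInfo(Cols.FILE_PATH, Types.VARCHAR, 512);
            System.out.println(path.getSqlDef());                            //VARCHAR(512)

            //constraints ride along as an opaque string and are appended by the table builder
            ColInfo id = new ColInfo(Cols.ID, Types.INTEGER, "PRIMARY KEY");
            System.out.println(id.getSqlDef() + " " + id.getConstraints());  //INTEGER PRIMARY KEY
        }
    }
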
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/db/Cols.java b/tika-eval/src/main/java/org/apache/tika/eval/db/Cols.java
deleted file mode 100644
index f0e0955..0000000
--- a/tika-eval/src/main/java/org/apache/tika/eval/db/Cols.java
+++ /dev/null
@@ -1,117 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.eval.db;
-
-import java.sql.Types;
-
-public enum Cols {
-    //container table
-    CONTAINER_ID,
-    FILE_PATH,
-    EXTRACT_FILE_LENGTH,
-
-    EXTRACT_FILE_LENGTH_A, //for comparisons
-    EXTRACT_FILE_LENGTH_B,
-
-    //profile table
-    ID,
-    LENGTH,
-    FILE_NAME,
-    FILE_EXTENSION,
-    ELAPSED_TIME_MILLIS,
-    NUM_METADATA_VALUES,
-    IS_EMBEDDED,
-    EMBEDDED_FILE_PATH,
-    MIME_ID,
-    TIKA_MIME_ID,
-    FILE_MIME_ID,
-    SHA256,
-    MD5,
-    NUM_ATTACHMENTS,
-    HAS_CONTENT,
-
-    //content
-    CONTENT_LENGTH,
-    NUM_UNIQUE_TOKENS,
-    NUM_TOKENS,
-    NUM_UNIQUE_ALPHABETIC_TOKENS,
-    NUM_ALPHABETIC_TOKENS, //alphabetic or ideographic tokens
-    COMMON_TOKENS_LANG, //which language was used for the common tokens metric?
-    NUM_UNIQUE_COMMON_TOKENS,
-    NUM_COMMON_TOKENS,
-    TOP_N_TOKENS,
-    LANG_ID_1,
-    LANG_ID_PROB_1,
-    LANG_ID_2,
-    LANG_ID_PROB_2,
-    TOKEN_ENTROPY_RATE,
-    TOKEN_LENGTH_SUM,
-    TOKEN_LENGTH_MEAN,
-    TOKEN_LENGTH_STD_DEV,
-    UNICODE_CHAR_BLOCKS,
-    NUM_PAGES, //number of pages a document alleges it has
-    CONTENT_TRUNCATED_AT_MAX_LEN, // was the string truncated at AbstractProfiler.MAX_STRING_LENGTH
-
-    //content comparisons
-    TOP_10_UNIQUE_TOKEN_DIFFS_A,
-    TOP_10_UNIQUE_TOKEN_DIFFS_B,
-    TOP_10_MORE_IN_A,
-    TOP_10_MORE_IN_B,
-    OVERLAP,
-    DICE_COEFFICIENT,
-
-    //errors
-    PARSE_ERROR_ID,
-
-    PARSE_ERROR_DESCRIPTION,
-    PARSE_EXCEPTION_DESCRIPTION,
-
-    EXTRACT_EXCEPTION_ID,
-    EXTRACT_EXCEPTION_DESCRIPTION,
-
-
-    //exceptions
-    ORIG_STACK_TRACE,
-    SORT_STACK_TRACE,
-    PARSE_EXCEPTION_ID,
-
-
-    MIME_STRING,//string representation of mime type
-
-    DIR_NAME_A,//for comparisons in REF_PAIR_NAMES
-    DIR_NAME_B,
-    
-    //structure tags
-    TAGS_A,
-    TAGS_B,
-    TAGS_DIV,
-    TAGS_I,
-    TAGS_IMG,
-    TAGS_LI,
-    TAGS_P,
-    TAGS_OL,
-    TAGS_TABLE,
-    TAGS_TD,
-    TAGS_TITLE,
-    TAGS_TR,
-    TAGS_UL,
-    TAGS_U,
-    TAGS_PARSE_EXCEPTION, //if there was a SAX|IO|TikaException while parsing the html or xhtml
-    ;
-
-}
-
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/db/DBBuffer.java b/tika-eval/src/main/java/org/apache/tika/eval/db/DBBuffer.java
deleted file mode 100644
index 8d936c2..0000000
--- a/tika-eval/src/main/java/org/apache/tika/eval/db/DBBuffer.java
+++ /dev/null
@@ -1,54 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.eval.db;
-
-
-import java.sql.Connection;
-import java.sql.PreparedStatement;
-import java.sql.SQLException;
-
-public class DBBuffer extends AbstractDBBuffer {
-
-    private final PreparedStatement st;
-
-    public DBBuffer(Connection connection, String tableName,
-                    String idColumnName, String valueColumnName) throws SQLException {
-        st = connection.prepareStatement("insert into "+tableName+ "( "+
-                idColumnName + ", " + valueColumnName+") values (?,?);");
-    }
-
-    @Override
-    public void write(int id, String value) throws RuntimeException {
-        try {
-            st.clearParameters();
-            st.setInt(1, id);
-            st.setString(2, value);
-            st.execute();
-
-        } catch (SQLException e) {
-            throw new RuntimeException(e);
-        }
-    }
-
-    @Override
-    public void close() throws SQLException {
-        st.close();
-
-    }
-
-
-}
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/db/H2Util.java b/tika-eval/src/main/java/org/apache/tika/eval/db/H2Util.java
deleted file mode 100644
index 14037be..0000000
--- a/tika-eval/src/main/java/org/apache/tika/eval/db/H2Util.java
+++ /dev/null
@@ -1,89 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.eval.db;
-
-import java.nio.file.Path;
-import java.sql.Connection;
-import java.sql.DriverManager;
-import java.sql.ResultSet;
-import java.sql.SQLException;
-import java.sql.Statement;
-import java.util.HashSet;
-import java.util.Set;
-
-import org.apache.commons.io.FilenameUtils;
-
-
-public class H2Util extends JDBCUtil {
-
-    private final static String DRIVER_CLASS = "org.h2.Driver";
-
-    private final Path db;
-
-    public H2Util(Path db) {
-        super(getConnectionString(db, true), DRIVER_CLASS);
-        this.db = db;
-    }
-
-    public static boolean databaseExists(Path db) {
-        String connString = getConnectionString(db, false);
-        try (Connection conn = DriverManager.getConnection(connString)) {
-        } catch (SQLException e) {
-            return false;
-        }
-        return true;
-    }
-    @Override
-    public String getJDBCDriverClass() {
-        return "org.h2.Driver";
-    }
-
-    @Override
-    public boolean dropTableIfExists(Connection conn, String tableName) throws SQLException {
-        try (Statement st = conn.createStatement()) {
-            return st.execute("drop table if exists " + tableName);
-        }
-    }
-
-    @Override
-    public String getConnectionString() {
-        return getConnectionString(db, true);
-    }
-
-    private static String getConnectionString(Path db, boolean createDBIfItDoesntExist) {
-        String s = "jdbc:h2:"+ FilenameUtils.separatorsToUnix(db.toAbsolutePath().toString());
-        if (! createDBIfItDoesntExist) {
-            s += ";IFEXISTS=TRUE";
-        }
-        return s;
-    }
-
-    @Override
-    public Set<String> getTables(Connection connection) throws SQLException {
-        String sql = "SHOW TABLES";
-        Set<String> tables = new HashSet<>();
-        try (Statement st = connection.createStatement();
-             ResultSet rs = st.executeQuery(sql)) {
-            while (rs.next()) {
-                tables.add(rs.getString(1));
-            }
-        }
-        return tables;
-    }
-}
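
H2Util mostly supplies the H2 connection string and H2-specific SQL; for example (the path below is made up, and the output assumes a Unix-like path):

    import java.nio.file.Paths;

    import org.apache.tika.eval.db.H2Util;

    public class H2UtilExample {
        public static void main(String[] args) {
            H2Util h2 = new H2Util(Paths.get("/tmp/tika-eval-db"));
            //jdbc:h2:/tmp/tika-eval-db  (H2 itself adds the .mv.db suffix on disk)
            System.out.println(h2.getConnectionString());

            //opens with ;IFEXISTS=TRUE, so this stays false until something creates the db
            System.out.println(H2Util.databaseExists(Paths.get("/tmp/tika-eval-db")));
        }
    }
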
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/db/JDBCUtil.java b/tika-eval/src/main/java/org/apache/tika/eval/db/JDBCUtil.java
deleted file mode 100644
index 33f1279..0000000
--- a/tika-eval/src/main/java/org/apache/tika/eval/db/JDBCUtil.java
+++ /dev/null
@@ -1,316 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.tika.eval.db;
-
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.sql.Connection;
-import java.sql.DatabaseMetaData;
-import java.sql.DriverManager;
-import java.sql.PreparedStatement;
-import java.sql.ResultSet;
-import java.sql.SQLException;
-import java.sql.Statement;
-import java.sql.Types;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Locale;
-import java.util.Map;
-import java.util.Properties;
-import java.util.Set;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-public class JDBCUtil {
-    private static final Logger LOG = LoggerFactory.getLogger(JDBCUtil.class);
-
-    public enum CREATE_TABLE {
-        DROP_IF_EXISTS,
-        SKIP_IF_EXISTS,
-        THROW_EX_IF_EXISTS,
-    }
-
-    private final String connectionString;
-    private String driverClass;
-    private Connection connection = null;
-
-    public JDBCUtil(String connectionString, String driverClass) {
-        this.connectionString = connectionString;
-        this.driverClass = driverClass;
-        if (driverClass == null || driverClass.length() == 0) {
-            if (System.getProperty("jdbc.drivers") != null) {
-                //user has specified it on the command line
-                //stop now
-            } else {
-                //try to use the mappings in db.properties to determine the class
-                try (InputStream is = JDBCUtil.class.getResourceAsStream("/db.properties")) {
-                    Properties properties = new Properties();
-                    properties.load(is);
-                    for (String k : properties.stringPropertyNames()) {
-                        Matcher m = Pattern.compile("(?i)jdbc:" + k).matcher(connectionString);
-                        if (m.find()) {
-                            this.driverClass = properties.getProperty(k);
-                        }
-                    }
-
-                } catch (IOException e) {
-                    //if db.properties can't be read, fall through and require the driver
-                    //class to be passed in explicitly or via -Djdbc.drivers
-                }
-            }
-        }
-    }
-
-    /**
-     * Returns a cached connection, creating it (with autocommit off) on first use.
-     * Override this to add any optimizations you want to do on the db
-     * before writing/reading.
-     *
-     * @return connection to the db
-     * @throws SQLException if the connection cannot be opened
-     */
-    public Connection getConnection() throws SQLException {
-        if (connection != null) {
-            return connection;
-        }
-        String connectionString = getConnectionString();
-        String jdbcDriver = getJDBCDriverClass();
-        if (jdbcDriver != null) {
-            try {
-                Class.forName(getJDBCDriverClass());
-            } catch (ClassNotFoundException e) {
-                throw new RuntimeException(e);
-            }
-        }
-        connection = DriverManager.getConnection(connectionString);
-        connection.setAutoCommit(false);
-
-        return connection;
-    }
-
-    /**
-     * JDBC driver class.  Override as necessary.
-     *
-     * @return fully qualified name of the JDBC driver class, or null if none was resolved
-     */
-    public String getJDBCDriverClass() {
-        return driverClass;
-    }
-
-
-    public boolean dropTableIfExists(Connection conn, String tableName) throws SQLException {
-        if (containsTable(tableName)) {
-            try (Statement st = conn.createStatement()) {
-                String sql = "drop table " + tableName;
-                return st.execute(sql);
-            }
-        }
-        return true;
-    }
-
-
-    public String getConnectionString() {
-        return connectionString;
-    }
-
-
-    public Set<String> getTables(Connection connection) throws SQLException {
-        Set<String> tables = new HashSet<>();
-
-        DatabaseMetaData dbMeta = connection.getMetaData();
-
-        try (ResultSet rs = dbMeta.getTables(null, null, "%", null)) {
-            while (rs.next()) {
-                tables.add(rs.getString(3).toLowerCase(Locale.US));
-            }
-        }
-        return tables;
-    }
-
-    /**
-     * @deprecated use {@link #batchInsert(PreparedStatement, TableInfo, Map)}
-     */
-    @Deprecated
-    public static int insert(PreparedStatement insertStatement,
-                             TableInfo table,
-                             Map<Cols, String> data) throws SQLException {
-
-        //clear parameters before setting
-        insertStatement.clearParameters();
-        try {
-            int i = 1;
-            for (ColInfo colInfo : table.getColInfos()) {
-                updateInsertStatement(i, insertStatement, colInfo, data.get(colInfo.getName()));
-                i++;
-            }
-            for (Cols c : data.keySet()) {
-                if (!table.containsColumn(c)) {
-                    throw new IllegalArgumentException("Can't add data to " + c +
-                            " because it doesn't exist in the table: " + table.getName());
-                }
-            }
-            return insertStatement.executeUpdate();
-        } catch (SQLException e) {
-            LOG.warn("couldn't insert data for this row: {}", e.getMessage());
-            return -1;
-        }
-    }
-
-    public static void batchInsert(PreparedStatement insertStatement,
-                                   TableInfo table,
-                                   Map<Cols, String> data) throws SQLException {
-
-        try {
-            int i = 1;
-            for (ColInfo colInfo : table.getColInfos()) {
-                updateInsertStatement(i, insertStatement, colInfo, data.get(colInfo.getName()));
-                i++;
-            }
-            for (Cols c : data.keySet()) {
-                if (!table.containsColumn(c)) {
-                    throw new IllegalArgumentException("Can't add data to " + c +
-                            " because it doesn't exist in the table: " + table.getName());
-                }
-            }
-            insertStatement.addBatch();
-        } catch (SQLException e) {
-            LOG.warn("couldn't insert data for this row: {}", e.getMessage());
-        }
-    }
-
-    public static void updateInsertStatement(int dbColOffset, PreparedStatement st,
-                                             ColInfo colInfo, String value) throws SQLException {
-        if (value == null) {
-            st.setNull(dbColOffset, colInfo.getType());
-            return;
-        }
-        try {
-            switch (colInfo.getType()) {
-                case Types.VARCHAR:
-                    if (value != null && value.length() > colInfo.getPrecision()) {
-                        value = value.substring(0, colInfo.getPrecision());
-                        LOG.warn("truncated varchar value in {} : {}", colInfo.getName(), value);
-                    }
-                    //postgres doesn't allow \0000
-                    value = value.replaceAll("\u0000", " ");
-                    st.setString(dbColOffset, value);
-                    break;
-                case Types.CHAR:
-                    //postgres doesn't allow \0000
-                    value = value.replaceAll("\u0000", " ");
-                    st.setString(dbColOffset, value);
-                    break;
-                case Types.DOUBLE:
-                    st.setDouble(dbColOffset, Double.parseDouble(value));
-                    break;
-                case Types.FLOAT:
-                    st.setDouble(dbColOffset, Float.parseFloat(value));
-                    break;
-                case Types.INTEGER:
-                    st.setInt(dbColOffset, Integer.parseInt(value));
-                    break;
-                case Types.BIGINT:
-                    st.setLong(dbColOffset, Long.parseLong(value));
-                    break;
-                case Types.BOOLEAN:
-                    st.setBoolean(dbColOffset, Boolean.parseBoolean(value));
-                    break;
-                default:
-                    throw new UnsupportedOperationException("Don't yet support type: " + colInfo.getType());
-            }
-        } catch (NumberFormatException e) {
-            if (!"".equals(value)) {
-                LOG.warn("number format exception: {} : {}", colInfo.getName(), value);
-            }
-            st.setNull(dbColOffset, colInfo.getType());
-        } catch (SQLException e) {
-            LOG.warn("sqlexception: {} : {}", colInfo, value);
-            st.setNull(dbColOffset, colInfo.getType());
-        }
-    }
-
-    public void createTables(List<TableInfo> tableInfos, CREATE_TABLE createTable) throws SQLException, IOException {
-
-        Connection conn = getConnection();
-        for (TableInfo tableInfo : tableInfos) {
-
-            if (createTable.equals(CREATE_TABLE.DROP_IF_EXISTS)) {
-                dropTableIfExists(conn, tableInfo.getName());
-            } else if (createTable.equals(CREATE_TABLE.SKIP_IF_EXISTS)) {
-                if (containsTable(tableInfo.getName())) {
-                    continue;
-                }
-            }
-            createTable(conn, tableInfo);
-        }
-        conn.commit();
-
-    }
-
-    public boolean containsTable(String tableName) throws SQLException {
-        Connection connection = getConnection();
-        Set<String> tables = getTables(connection);
-        return tables.contains(normalizeTableName(tableName));
-    }
-
-    /**
-     * Override for custom behavior
-     *
-     * @param tableName
-     * @return
-     */
-    String normalizeTableName(String tableName) {
-        tableName = tableName.toLowerCase(Locale.US);
-        return tableName;
-    }
-
-    //does not close the connection
-    private void createTable(Connection conn, TableInfo tableInfo) throws SQLException {
-        StringBuilder createSql = new StringBuilder();
-        createSql.append("CREATE TABLE " + tableInfo.getName());
-        createSql.append("(");
-
-        int last = 0;
-        for (ColInfo col : tableInfo.getColInfos()) {
-            last++;
-            if (last > 1) {
-                createSql.append(", ");
-            }
-            createSql.append(col.getName());
-            createSql.append(" ");
-            createSql.append(col.getSqlDef());
-            String constraints = col.getConstraints();
-            if (constraints != null) {
-                createSql.append(" ");
-                createSql.append(constraints);
-            }
-        }
-        createSql.append(")");
-        try (Statement st = conn.createStatement()) {
-            st.execute(createSql.toString());
-        }
-        conn.commit();
-    }
-}
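
Putting JDBCUtil together end to end, create a table from a TableInfo and then batch-insert a row keyed by Cols, looks roughly like this. The in-memory H2 url and table name are illustrative; it assumes the H2 driver is on the classpath and that db.properties maps the h2 url prefix to it:

    import java.sql.Connection;
    import java.sql.PreparedStatement;
    import java.sql.Types;
    import java.util.Collections;
    import java.util.HashMap;
    import java.util.Map;

    import org.apache.tika.eval.db.ColInfo;
    import org.apache.tika.eval.db.Cols;
    import org.apache.tika.eval.db.JDBCUtil;
    import org.apache.tika.eval.db.TableInfo;

    public class JDBCUtilExample {
        public static void main(String[] args) throws Exception {
            JDBCUtil util = new JDBCUtil("jdbc:h2:mem:evalexample;DB_CLOSE_DELAY=-1", null);

            TableInfo profiles = new TableInfo("example_profiles",
                    new ColInfo(Cols.ID, Types.INTEGER, "PRIMARY KEY"),
                    new ColInfo(Cols.FILE_PATH, Types.VARCHAR, 512));

            util.createTables(Collections.singletonList(profiles),
                    JDBCUtil.CREATE_TABLE.SKIP_IF_EXISTS);

            Connection conn = util.getConnection();
            try (PreparedStatement st = conn.prepareStatement(
                    "insert into example_profiles (ID, FILE_PATH) values (?, ?)")) {
                Map<Cols, String> row = new HashMap<>();
                row.put(Cols.ID, "1");
                row.put(Cols.FILE_PATH, "docs/report.pdf");
                JDBCUtil.batchInsert(st, profiles, row);    //queues the row
                st.executeBatch();                          //flushes it
            }
            conn.commit();    //getConnection() turns autocommit off
        }
    }
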
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/db/MimeBuffer.java b/tika-eval/src/main/java/org/apache/tika/eval/db/MimeBuffer.java
deleted file mode 100644
index eba731b..0000000
--- a/tika-eval/src/main/java/org/apache/tika/eval/db/MimeBuffer.java
+++ /dev/null
@@ -1,148 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.eval.db;
-
-import java.sql.Connection;
-import java.sql.PreparedStatement;
-import java.sql.SQLException;
-import java.sql.Types;
-
-import org.apache.tika.config.TikaConfig;
-import org.apache.tika.eval.AbstractProfiler;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.mime.MimeType;
-import org.apache.tika.mime.MimeTypeException;
-import org.apache.tika.mime.MimeTypes;
-
-
-public class MimeBuffer extends AbstractDBBuffer {
-
-    private final PreparedStatement st;
-    private final TikaConfig config;
-    private final Connection connection;
-
-
-    public MimeBuffer(Connection connection, TableInfo mimeTable, TikaConfig config) throws SQLException {
-        st = connection.prepareStatement("insert into " + mimeTable.getName() + "( " +
-                Cols.MIME_ID.name() + ", " +
-                Cols.MIME_STRING.name() + ", " +
-                Cols.FILE_EXTENSION.name() + ") values (?,?,?)");
-        this.config = config;
-        this.connection = connection;
-    }
-
-    @Override
-    public void write(int id, String value) throws RuntimeException {
-        try {
-            st.clearParameters();
-            st.setInt(1, id);
-            st.setString(2, value);
-            try {
-                String ext = MimeUtil.getExtension(value, config);
-                if (ext == null || ext.length() == 0) {
-                    st.setNull(3, Types.VARCHAR);
-                } else {
-                    st.setString(3, ext);
-                }
-            } catch (MimeTypeException e) {
-                st.setNull(3, Types.VARCHAR);
-            }
-            st.execute();
-
-        } catch (SQLException e) {
-            throw new RuntimeException(e);
-        }
-    }
-
-    @Override
-    public void close() throws SQLException {
-        st.close();
-        connection.commit();
-    }
-
-    private static class MimeUtil {
-        //TODO: see if MimeType now works for these
-        private static final String APPLICATION = "application";
-        private static final String TEXT = "text";
-        private static final String HTML = "html";
-        private static final String XML = "xml";
-        private static final String XHTML_XML = "xhtml+xml";
-        private static final String CSS = "css";
-        private static final String CSV = "csv";
-        private static final String PLAIN = "plain";
-        private static final String EMPTY_STRING = "";
-
-        /**
-         * Utility method to convert from a string value representing a content type
-         * (e.g. "application/pdf") into the most common extension for that file type
-         * (e.g. "pdf").
-         * <p>
-         * This has special handling for texty filetypes whose MimeTypes
-         * don't currently return anything for {@link MimeType#getExtension}.
-         *
-         * @param contentType string representing a content type, for example: "application/pdf"
-         * @param config      config from which to get MimeRepository
-         * @return extension or empty string
-         * @throws MimeTypeException thrown if MimeTypes can't parse the contentType
-         */
-        public static String getExtension(String contentType, TikaConfig config)
-                throws MimeTypeException {
-            MimeTypes types = config.getMimeRepository();
-            MimeType mime = types.forName(contentType);
-            return getExtension(mime);
-        }
-
-        public static String getExtension(MimeType mime) {
-
-            String ext = mime.getExtension();
-            if (ext.startsWith(".")) {
-                ext = ext.substring(1);
-            }
-
-            //special handling for text/html/xml
-            if (ext.length() == 0) {
-                ext = tryTextyTypes(mime.getType());
-            }
-            return ext;
-        }
-
-        private static String tryTextyTypes(MediaType mediaType) {
-
-            String type = mediaType.getType();
-            String subtype = mediaType.getSubtype();
-            if (type.equals(TEXT)) {
-                if (subtype.equals(HTML)) {
-                    return HTML;
-                } else if (subtype.equals(PLAIN)) {
-                    return "txt";
-                } else if (subtype.equals(CSS)) {
-                    return CSS;
-                } else if (subtype.equals(CSV)) {
-                    return CSV;
-                }
-            } else if (type.equals(APPLICATION)) {
-                if (subtype.equals(XML)) {
-                    return XML;
-                } else if (subtype.equals(XHTML_XML)) {
-                    return "html";
-                }
-            }
-            return EMPTY_STRING;
-        }
-    }
-
-}
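
The extension lookup that MimeBuffer caches is the standard MimeTypes lookup plus the texty-type fallback above; the core call it wraps can be reproduced directly (output depends on the mime database in use):

    import org.apache.tika.config.TikaConfig;
    import org.apache.tika.mime.MimeType;
    import org.apache.tika.mime.MimeTypes;

    public class MimeExtensionExample {
        public static void main(String[] args) throws Exception {
            MimeTypes types = TikaConfig.getDefaultConfig().getMimeRepository();

            MimeType pdf = types.forName("application/pdf");
            System.out.println(pdf.getExtension());    //.pdf

            //one of the "texty" types the buffer special-cases in case
            //getExtension() comes back empty
            MimeType plain = types.forName("text/plain");
            System.out.println(plain.getExtension());
        }
    }
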
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/db/TableInfo.java b/tika-eval/src/main/java/org/apache/tika/eval/db/TableInfo.java
deleted file mode 100644
index 181a028..0000000
--- a/tika-eval/src/main/java/org/apache/tika/eval/db/TableInfo.java
+++ /dev/null
@@ -1,73 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.eval.db;
-
-
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Set;
-
-public class TableInfo {
-
-    private final String name;
-    private final List<ColInfo> colInfos = new ArrayList<>();
-    private final Set<Cols> colNames = new HashSet<>();
-
-    private String prefix;
-
-    public TableInfo(String name, ColInfo... cols) {
-        Collections.addAll(colInfos, cols);
-        this.name = name;
-        for (ColInfo c : colInfos) {
-            assert (!colNames.contains(c.getName()));
-            colNames.add(c.getName());
-        }
-    }
-
-    public TableInfo(String name, List<ColInfo> cols) {
-        colInfos.addAll(cols);
-        this.name = name;
-        for (ColInfo c : colInfos) {
-            assert (!colNames.contains(c.getName()));
-            colNames.add(c.getName());
-        }
-    }
-
-    public String getName() {
-        if (prefix == null) {
-            return name;
-        }
-        return prefix+name;
-    }
-
-    public void setNamePrefix(String prefix) {
-        this.prefix = prefix;
-    }
-
-    public List<ColInfo> getColInfos() {
-        return Collections.unmodifiableList(colInfos);
-    }
-
-    public boolean containsColumn(Cols cols) {
-        return colNames.contains(cols);
-    }
-}
-
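
TableInfo is just a named, ordered set of ColInfos; the prefix hook is what the builders use (via the "tablePrefix" attribute) to point one run's tables away from another's. A quick sketch with illustrative names:

    import java.sql.Types;

    import org.apache.tika.eval.db.ColInfo;
    import org.apache.tika.eval.db.Cols;
    import org.apache.tika.eval.db.TableInfo;

    public class TableInfoExample {
        public static void main(String[] args) {
            TableInfo mimes = new TableInfo("mimes",
                    new ColInfo(Cols.MIME_ID, Types.INTEGER, "PRIMARY KEY"),
                    new ColInfo(Cols.MIME_STRING, Types.VARCHAR, 256));

            System.out.println(mimes.getName());    //mimes
            mimes.setNamePrefix("a_");
            System.out.println(mimes.getName());    //a_mimes
        }
    }
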
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/io/DBWriter.java b/tika-eval/src/main/java/org/apache/tika/eval/io/DBWriter.java
deleted file mode 100644
index 5d81016..0000000
--- a/tika-eval/src/main/java/org/apache/tika/eval/io/DBWriter.java
+++ /dev/null
@@ -1,162 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.eval.io;
-
-import java.io.IOException;
-import java.sql.Connection;
-import java.sql.PreparedStatement;
-import java.sql.SQLException;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-import java.util.concurrent.atomic.AtomicInteger;
-import java.util.concurrent.atomic.AtomicLong;
-
-import org.apache.commons.io.IOExceptionWithCause;
-import org.apache.tika.eval.db.ColInfo;
-import org.apache.tika.eval.db.Cols;
-import org.apache.tika.eval.db.JDBCUtil;
-import org.apache.tika.eval.db.MimeBuffer;
-import org.apache.tika.eval.db.TableInfo;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-/**
- * This is still in its early stages.  The idea is to
- * get something working with h2 and then add to that
- * as necessary.
- *
- * Beware, this deletes the db file with each initialization.
- *
- * Each thread must construct its own DBWriter because each
- * DBWriter creates its own PreparedStatements at initialization.
- */
-public class DBWriter implements IDBWriter {
-
-    private static final Logger LOG = LoggerFactory.getLogger(DBWriter.class);
-
-    private static final AtomicInteger WRITER_ID = new AtomicInteger();
-    private final Long commitEveryXRows = 10000L;
-    //private final Long commitEveryXMS = 60000L;
-
-    private final Connection conn;
-    private final JDBCUtil dbUtil;
-    private final MimeBuffer mimeBuffer;
-    private final int myId = WRITER_ID.getAndIncrement();
-
-    //<tableName, preparedStatement>
-    private final Map<String, PreparedStatement> inserts = new HashMap<>();
-    private final Map<String, LastInsert> lastInsertMap = new HashMap<>();
-    public DBWriter(Connection connection, List<TableInfo> tableInfos, JDBCUtil dbUtil, MimeBuffer mimeBuffer)
-            throws IOException, SQLException {
-
-        this.conn = connection;
-        this.mimeBuffer = mimeBuffer;
-        this.dbUtil = dbUtil;
-        for (TableInfo tableInfo : tableInfos) {
-            try {
-                PreparedStatement st = createPreparedInsert(tableInfo);
-                inserts.put(tableInfo.getName(), st);
-                lastInsertMap.put(tableInfo.getName(), new LastInsert());
-            } catch (SQLException e) {
-                throw new RuntimeException(e);
-            }
-        }
-    }
-
-    public int getMimeId(String mimeString) {
-        return mimeBuffer.getId(mimeString);
-    }
-
-    private PreparedStatement createPreparedInsert(TableInfo tableInfo) throws SQLException {
-        StringBuilder sb = new StringBuilder();
-        sb.append("INSERT INTO ").append(tableInfo.getName());
-        sb.append("(");
-        int i = 0;
-        for (ColInfo c : tableInfo.getColInfos()) {
-            if (i++ > 0) {
-                sb.append(", ");
-            }
-            sb.append(c.getName());
-        }
-        sb.append(") ");
-
-        sb.append("VALUES");
-        sb.append("(");
-        for (int j = 0; j < i; j++) {
-            if (j > 0) {
-                sb.append(", ");
-            }
-            sb.append("?");
-        }
-        sb.append(")");
-
-        return conn.prepareStatement(sb.toString());
-    }
-
-
-    public void writeRow(TableInfo table, Map<Cols, String> data) throws IOException {
-        try {
-            PreparedStatement p = inserts.get(table.getName());
-            if (p == null) {
-                throw new RuntimeException("Failed to create prepared statement for: "+
-                        table.getName());
-            }
-            dbUtil.batchInsert(p, table, data);
-            LastInsert lastInsert = lastInsertMap.get(table.getName());
-            lastInsert.rowCount++;
-            long elapsed = System.currentTimeMillis()-lastInsert.lastInsert;
-            if (
-                    //elapsed > commitEveryXMS ||
-                lastInsert.rowCount % commitEveryXRows == 0) {
-                LOG.info("writer ({}) on table ({}) is committing after {} rows and {} ms", myId,
-                        table.getName(),
-                        lastInsert.rowCount, elapsed);
-                p.executeBatch();
-                conn.commit();
-                lastInsert.lastInsert = System.currentTimeMillis();
-            }
-        } catch (SQLException e) {
-            throw new IOException(e);
-        }
-    }
-
-    /**
-     * This closes the writer by executing batch and
-     * committing changes.  This DOES NOT close the connection
-     * @throws IOException
-     */
-    public void close() throws IOException {
-        for (PreparedStatement p : inserts.values()) {
-            try {
-                p.executeBatch();
-            } catch (SQLException e) {
-                throw new IOExceptionWithCause(e);
-            }
-        }
-        try {
-            conn.commit();
-        } catch (SQLException e){
-            throw new IOExceptionWithCause(e);
-        }
-    }
-
-    private class LastInsert {
-        private long lastInsert = System.currentTimeMillis();
-        private long rowCount = 0;
-    }
-}
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/io/ExtractReader.java b/tika-eval/src/main/java/org/apache/tika/eval/io/ExtractReader.java
deleted file mode 100644
index d48a4de..0000000
--- a/tika-eval/src/main/java/org/apache/tika/eval/io/ExtractReader.java
+++ /dev/null
@@ -1,243 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.eval.io;
-
-import java.io.BufferedReader;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.InputStreamReader;
-import java.io.Reader;
-import java.nio.charset.StandardCharsets;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.Locale;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
-import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream;
-import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
-import org.apache.commons.compress.compressors.z.ZCompressorInputStream;
-import org.apache.commons.io.IOUtils;
-import org.apache.tika.config.TikaConfig;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.metadata.serialization.JsonMetadataList;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler;
-import org.apache.tika.sax.RecursiveParserWrapperHandler;
-import org.apache.tika.sax.ToTextContentHandler;
-import org.apache.tika.sax.ToXMLContentHandler;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-
-public class ExtractReader {
-    private static final Logger LOG = LoggerFactory.getLogger(ExtractReader.class);
-
-    public static final long IGNORE_LENGTH = -1L;
-
-    public enum ALTER_METADATA_LIST {
-        AS_IS,  //leave the metadata list as is
-        FIRST_ONLY, //take only the metadata list for the "container" document
-        CONCATENATE_CONTENT_INTO_FIRST // concatenate all of the content into the first
-    }
-
-    private TikaConfig tikaConfig = TikaConfig.getDefaultConfig();
-
-    private final ALTER_METADATA_LIST alterMetadataList;
-    private final long minExtractLength;
-    private final long maxExtractLength;
-
-    /**
-     * Reads full extract, no modification of metadata list, no min or max extract length checking
-     */
-    public ExtractReader() {
-        this(ALTER_METADATA_LIST.AS_IS, IGNORE_LENGTH, IGNORE_LENGTH);
-    }
-
-    public ExtractReader(ALTER_METADATA_LIST alterMetadataList) {
-        this(alterMetadataList, IGNORE_LENGTH, IGNORE_LENGTH);
-    }
-
-    public ExtractReader(ALTER_METADATA_LIST alterMetadataList, long minExtractLength, long maxExtractLength) {
-        this.alterMetadataList = alterMetadataList;
-        this.minExtractLength = minExtractLength;
-        this.maxExtractLength = maxExtractLength;
-        if (maxExtractLength > IGNORE_LENGTH && minExtractLength >= maxExtractLength) {
-            throw new IllegalArgumentException("minExtractLength("+minExtractLength+
-                    ") must be < maxExtractLength("+maxExtractLength+")");
-        }
-    }
-    public List<Metadata> loadExtract(Path extractFile) throws ExtractReaderException {
-
-        List<Metadata> metadataList = null;
-        if (extractFile == null || !Files.isRegularFile(extractFile)) {
-            throw new ExtractReaderException(ExtractReaderException.TYPE.NO_EXTRACT_FILE);
-        }
-
-        FileSuffixes fileSuffixes = parseSuffixes(extractFile.getFileName().toString());
-        if (fileSuffixes.format == null) {
-            throw new ExtractReaderException(ExtractReaderException.TYPE.INCORRECT_EXTRACT_FILE_SUFFIX);
-        }
-
-        long length = -1L;
-        try {
-            length = Files.size(extractFile);
-        } catch (IOException e) {
-            throw new ExtractReaderException(ExtractReaderException.TYPE.IO_EXCEPTION);
-        }
-
-        if (length == 0L) {
-            throw new ExtractReaderException(ExtractReaderException.TYPE.ZERO_BYTE_EXTRACT_FILE);
-        }
-
-        if (minExtractLength > IGNORE_LENGTH && length < minExtractLength) {
-            throw new ExtractReaderException(ExtractReaderException.TYPE.EXTRACT_FILE_TOO_SHORT);
-        }
-        if (maxExtractLength > IGNORE_LENGTH && length > maxExtractLength) {
-            throw new ExtractReaderException(ExtractReaderException.TYPE.EXTRACT_FILE_TOO_LONG);
-        }
-
-        Reader reader = null;
-        InputStream is = null;
-        try {
-            is = Files.newInputStream(extractFile);
-            if (fileSuffixes.compression != null) {
-                if (fileSuffixes.compression.equals("bz2")) {
-                    is = new BZip2CompressorInputStream(is);
-                } else if (fileSuffixes.compression.equals("gz")
-                        || fileSuffixes.compression.equals("gzip")) {
-                    is = new GzipCompressorInputStream(is);
-                } else if (fileSuffixes.compression.equals("zip")) {
-                    is = new ZCompressorInputStream(is);
-                } else {
-                    LOG.warn("Can't yet process compression of type: {}", fileSuffixes.compression);
-                    return metadataList;
-                }
-            }
-            reader = new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8));
-        } catch (IOException e) {
-            throw new ExtractReaderException(ExtractReaderException.TYPE.IO_EXCEPTION);
-        }
-
-        try {
-            if (fileSuffixes.format == FileSuffixes.FORMAT.JSON) {
-                metadataList = JsonMetadataList.fromJson(reader);
-                if (alterMetadataList.equals(ALTER_METADATA_LIST.FIRST_ONLY) && metadataList.size() > 1) {
-                    while (metadataList.size() > 1) {
-                        metadataList.remove(metadataList.size()-1);
-                    }
-                } else if (alterMetadataList.equals(ALTER_METADATA_LIST.CONCATENATE_CONTENT_INTO_FIRST) &&
-                        metadataList.size() > 1) {
-                    StringBuilder sb = new StringBuilder();
-                    Metadata containerMetadata = metadataList.get(0);
-                    for (Metadata m : metadataList) {
-                        String c = m.get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT);
-                        if (c != null) {
-                            sb.append(c);
-                            sb.append(" ");
-                        }
-                    }
-                    containerMetadata.set(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT, sb.toString());
-                    while (metadataList.size() > 1) {
-                        metadataList.remove(metadataList.size()-1);
-                    }
-                }
-            } else {
-                metadataList = generateListFromTextFile(reader, fileSuffixes);
-            }
-        } catch (IOException e) {
-            throw new ExtractReaderException(ExtractReaderException.TYPE.IO_EXCEPTION);
-        } catch (TikaException e) {
-            throw new ExtractReaderException(ExtractReaderException.TYPE.EXTRACT_PARSE_EXCEPTION);
-        } finally {
-            IOUtils.closeQuietly(reader);
-            IOUtils.closeQuietly(is);
-        }
-        return metadataList;
-    }
-
-    private List<Metadata> generateListFromTextFile(Reader reader,
-                                                           FileSuffixes fileSuffixes) throws IOException {
-        List<Metadata> metadataList = new ArrayList<>();
-        String content = IOUtils.toString(reader);
-        Metadata m = new Metadata();
-        m.set(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT, content);
-        if (fileSuffixes.format == FileSuffixes.FORMAT.HTML) {
-            m.set(RecursiveParserWrapperHandler.TIKA_CONTENT_HANDLER, ToXMLContentHandler.class.getSimpleName());
-        } else if (fileSuffixes.format == FileSuffixes.FORMAT.TXT) {
-            m.set(RecursiveParserWrapperHandler.TIKA_CONTENT_HANDLER, ToTextContentHandler.class.getSimpleName());
-        }
-        //Let's hope the file name has a suffix that can
-        //be used to determine the mime.  Could be wrong or missing,
-        //but better than nothing.
-        m.set(TikaCoreProperties.RESOURCE_NAME_KEY, fileSuffixes.originalFileName);
-
-        MediaType mimeType = tikaConfig.getMimeRepository().detect(null, m);
-        if (mimeType != null) {
-            m.set(Metadata.CONTENT_TYPE, mimeType.toString());
-        }
-        metadataList.add(m);
-        return metadataList;
-
-    }
-
-    protected static FileSuffixes parseSuffixes(String fName) {
-        FileSuffixes fileSuffixes = new FileSuffixes();
-        if (fName == null) {
-            return fileSuffixes;
-        }
-        Matcher m = Pattern.compile("(?i)^(.*?)\\.(json|txt|x?html)(?:\\.(bz2|gz(?:ip)?|zip))?$").matcher(fName);
-        if (m.find()) {
-            fileSuffixes.originalFileName = m.group(1);
-            fileSuffixes.setFormat(m.group(2));
-            fileSuffixes.compression = m.group(3);
-        }
-        return fileSuffixes;
-    }
-
-    private static class FileSuffixes {
-
-        enum FORMAT {
-            TXT,
-            HTML,
-            JSON
-        }
-        String compression;
-        FORMAT format;
-        String originalFileName;
-
-        public void setFormat(String fmt) {
-            String lc = fmt.toLowerCase(Locale.ENGLISH);
-            if (lc.equals("json")) {
-                format = FORMAT.JSON;
-            } else if (lc.equals("txt")) {
-                format = FORMAT.TXT;
-            } else if (lc.contains("html")) {
-                format = FORMAT.HTML;
-            } else {
-                throw new IllegalArgumentException("extract must end in .json, .txt or .xhtml");
-            }
-        }
-    }
-}
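
In practice the profilers hand ExtractReader a path and let the file name do the work: the suffix chain picks the format and compression, and the ALTER_METADATA_LIST mode decides how embedded-document metadata is folded in. A minimal sketch (the extract path is hypothetical):

    import java.nio.file.Paths;
    import java.util.List;

    import org.apache.tika.eval.io.ExtractReader;
    import org.apache.tika.metadata.Metadata;

    public class ExtractReaderExample {
        public static void main(String[] args) throws Exception {
            //keep only the container document's metadata; reject extracts
            //under 10 bytes or over 10 MB
            ExtractReader reader = new ExtractReader(
                    ExtractReader.ALTER_METADATA_LIST.FIRST_ONLY, 10, 10_000_000);

            //"report.pdf.json.gz" = gzipped JSON extract of an original "report.pdf"
            List<Metadata> metadataList =
                    reader.loadExtract(Paths.get("extracts/report.pdf.json.gz"));
            System.out.println(metadataList.size());
        }
    }
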
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/io/ExtractReaderException.java b/tika-eval/src/main/java/org/apache/tika/eval/io/ExtractReaderException.java
deleted file mode 100644
index 5aa2f3f..0000000
--- a/tika-eval/src/main/java/org/apache/tika/eval/io/ExtractReaderException.java
+++ /dev/null
@@ -1,48 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.eval.io;
-
-import java.io.IOException;
-
-/**
- * Exception when trying to read extract
- */
-public class ExtractReaderException extends IOException {
-
-    public enum TYPE {
-        //what do you see when you look at the extract file
-        NO_EXTRACT_FILE,
-        ZERO_BYTE_EXTRACT_FILE,
-        IO_EXCEPTION,
-        EXTRACT_PARSE_EXCEPTION,
-        EXTRACT_FILE_TOO_SHORT,
-        EXTRACT_FILE_TOO_LONG,
-        INCORRECT_EXTRACT_FILE_SUFFIX;//extract file must have a suffix of .json, .txt, .html or .xhtml,
-        // optionally followed by .gz, .gzip, .zip or .bz2
-    }
-
-    private final TYPE type;
-
-    public ExtractReaderException(TYPE exceptionType) {
-        this.type = exceptionType;
-    }
-
-    public TYPE getType() {
-        return type;
-    }
-
-}
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/io/IDBWriter.java b/tika-eval/src/main/java/org/apache/tika/eval/io/IDBWriter.java
deleted file mode 100644
index b2b76ab..0000000
--- a/tika-eval/src/main/java/org/apache/tika/eval/io/IDBWriter.java
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.tika.eval.io;
-
-
-import java.io.IOException;
-import java.util.Map;
-
-import org.apache.tika.eval.db.Cols;
-import org.apache.tika.eval.db.TableInfo;
-
-public interface IDBWriter {
-    void writeRow(TableInfo table, Map<Cols, String> data) throws IOException;
-    void close() throws IOException;
-    int getMimeId(String mimeString);
-}
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/io/XMLLogMsgHandler.java b/tika-eval/src/main/java/org/apache/tika/eval/io/XMLLogMsgHandler.java
deleted file mode 100644
index 2d509f7..0000000
--- a/tika-eval/src/main/java/org/apache/tika/eval/io/XMLLogMsgHandler.java
+++ /dev/null
@@ -1,26 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.eval.io;
-
-import java.io.IOException;
-import java.sql.SQLException;
-
-import org.apache.log4j.Level;
-
-public interface XMLLogMsgHandler {
-    void handleMsg(Level level, String xml) throws IOException, SQLException;
-}
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/io/XMLLogReader.java b/tika-eval/src/main/java/org/apache/tika/eval/io/XMLLogReader.java
deleted file mode 100644
index 82c1b8b..0000000
--- a/tika-eval/src/main/java/org/apache/tika/eval/io/XMLLogReader.java
+++ /dev/null
@@ -1,115 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.eval.io;
-
-import javax.xml.stream.XMLInputFactory;
-import javax.xml.stream.XMLStreamConstants;
-import javax.xml.stream.XMLStreamException;
-import javax.xml.stream.XMLStreamReader;
-import java.io.ByteArrayInputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.nio.charset.StandardCharsets;
-import java.sql.SQLException;
-
-import org.apache.commons.io.IOUtils;
-import org.apache.log4j.Level;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.utils.XMLReaderUtils;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-
-public class XMLLogReader {
-    private static final Logger LOG = LoggerFactory.getLogger(XMLLogReader.class);
-    //class that wraps a logger's xml output
-    //into a single xml parseable input stream.
-
-    public void read(InputStream xmlLogFileIs, XMLLogMsgHandler handler) throws XMLStreamException {
-        InputStream is = new LogXMLWrappingInputStream(xmlLogFileIs);
-        XMLInputFactory factory = XMLReaderUtils.getXMLInputFactory();
-        XMLStreamReader reader = factory.createXMLStreamReader(is);
-
-        Level level = null;
-        while (reader.hasNext()) {
-            reader.next();
-            switch (reader.getEventType()) {
-                case XMLStreamConstants.START_ELEMENT :
-                    if ("event".equals(reader.getLocalName())) {
-                        level = Level.toLevel(reader.getAttributeValue("", "level"), Level.DEBUG);
-                    } else if ("message".equals(reader.getLocalName())) {
-                        try {
-                            handler.handleMsg(level, reader.getElementText());
-                        } catch (IOException e) {
-                            LOG.warn("Error parsing: {}", reader.getElementText());
-                        } catch (SQLException e) {
-                            LOG.warn("SQLException: {}", e.getMessage());
-                        }
-                    }
-                    break;
-                case XMLStreamConstants.END_ELEMENT :
-                    if ("event".equals(reader.getLocalName())) {
-                        level = null;
-                    } else if ("message".equals(reader.getLocalName())) {
-                        //do we care any more?
-                    }
-                    break;
-            };
-        }
-    }
-
-
-
-    class LogXMLWrappingInputStream extends InputStream {
-        //plagiarized from log4j's chainsaw
-        private final static String HEADER =
-                "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>"
-                        + "<log4j:eventSet version=\"1.2\" "
-                        + "xmlns:log4j=\"http://jakarta.apache.org/log4j/\">";
-        private static final String FOOTER = "</log4j:eventSet>";
-
-        private InputStream[] streams;
-        int currentStreamIndex = 0;
-
-        private LogXMLWrappingInputStream(InputStream xmlLogFileIs){
-            streams = new InputStream[3];
-            streams[0] = new ByteArrayInputStream(HEADER.getBytes(StandardCharsets.UTF_8));
-            streams[1] = xmlLogFileIs;
-            streams[2] = new ByteArrayInputStream(FOOTER.getBytes(StandardCharsets.UTF_8));
-
-        }
-
-        @Override
-        public int read() throws IOException {
-            int c = streams[currentStreamIndex].read();
-            if (c < 0) {
-                IOUtils.closeQuietly(streams[currentStreamIndex]);
-                while (currentStreamIndex < streams.length-1) {
-                    currentStreamIndex++;
-                    int tmpC = streams[currentStreamIndex].read();
-                    if (tmpC < 0) {
-                        IOUtils.closeQuietly(streams[currentStreamIndex]);
-                    } else {
-                        return tmpC;
-                    }
-                }
-                return -1;
-            }
-            return c;
-        }
-    }
-}
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/reports/Report.java b/tika-eval/src/main/java/org/apache/tika/eval/reports/Report.java
deleted file mode 100644
index 41a1424..0000000
--- a/tika-eval/src/main/java/org/apache/tika/eval/reports/Report.java
+++ /dev/null
@@ -1,198 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.eval.reports;
-
-import java.io.IOException;
-import java.io.OutputStream;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.sql.Connection;
-import java.sql.ResultSet;
-import java.sql.ResultSetMetaData;
-import java.sql.SQLException;
-import java.sql.Statement;
-import java.sql.Types;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.Map;
-import java.util.Set;
-
-import org.apache.poi.ss.usermodel.Cell;
-import org.apache.poi.ss.usermodel.CellStyle;
-import org.apache.poi.ss.usermodel.Row;
-import org.apache.poi.ss.usermodel.VerticalAlignment;
-import org.apache.poi.xssf.streaming.SXSSFSheet;
-import org.apache.poi.xssf.streaming.SXSSFWorkbook;
-import org.apache.poi.xssf.usermodel.XSSFWorkbook;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-/**
- * This class represents a single report.
- */
-public class Report {
-    private static final Logger LOG = LoggerFactory.getLogger(Report.class);
-
-    final String NULL_VALUE = "";//TODO: make this configurable!!!
-    Map<String, XSLXCellFormatter> cellFormatters = new HashMap<>();
-    private XLSXNumFormatter defaultDoubleFormatter = new XLSXNumFormatter("0.000");
-    private XLSXNumFormatter defaultIntegerFormatter = new XLSXNumFormatter("0");
-    private CellStyle sqlCellStyle;
-
-    String sql;
-    String reportFilename;
-    boolean includeSql = true;
-
-    String reportName;
-
-    public void writeReport(Connection c, Path reportsRoot) throws SQLException, IOException {
-        LOG.info("Writing report: {} to {}", reportName, reportFilename);
-        dumpXLSX(c, reportsRoot);
-    }
-
-    private void dumpXLSX(Connection c, Path reportsRoot) throws IOException, SQLException {
-        Statement st = c.createStatement();
-        Path out = reportsRoot.resolve(reportFilename);
-        Files.createDirectories(out.getParent());
-
-        SXSSFWorkbook wb = new SXSSFWorkbook(new XSSFWorkbook(), 100, true, true);
-        wb.setCompressTempFiles(true);
-        defaultIntegerFormatter.reset(wb.getXSSFWorkbook());
-        defaultDoubleFormatter.reset(wb.getXSSFWorkbook());
-        sqlCellStyle = wb.createCellStyle();
-        sqlCellStyle.setVerticalAlignment(VerticalAlignment.TOP);
-        sqlCellStyle.setWrapText(true);
-
-
-        try {
-            dumpReportToWorkbook(st, wb);
-        } finally {
-            try (OutputStream os = Files.newOutputStream(out)) {
-                wb.write(os);
-            } finally {
-                wb.dispose();
-            }
-        }
-    }
-
-    private void dumpReportToWorkbook(Statement st, SXSSFWorkbook wb) throws IOException, SQLException {
-        ResultSet rs = st.executeQuery(sql);
-
-        SXSSFSheet sheet = wb.createSheet("tika-eval Report");
-        sheet.trackColumnForAutoSizing(0);
-
-        int rowCount = 0;
-        ResultSetMetaData meta = rs.getMetaData();
-        Set<String> colNames = new HashSet<>();
-
-        Row xssfRow = sheet.createRow(rowCount++);
-        //write headers and cache them to check against styles
-        for (int i = 1; i <= meta.getColumnCount(); i++) {
-            Cell cell = xssfRow.createCell(i-1);
-            cell.setCellValue(meta.getColumnLabel(i));
-            colNames.add(meta.getColumnLabel(i));
-        }
-
-        ResultSetMetaData resultSetMetaData = rs.getMetaData();
-        while (rs.next()) {
-            xssfRow = sheet.createRow(rowCount++);
-            for (int i = 1; i <= meta.getColumnCount(); i++) {
-                Cell cell = xssfRow.createCell(i-1);
-                XSLXCellFormatter formatter = cellFormatters.get(meta.getColumnLabel(i));
-                if (formatter == null) {
-                    formatter = getDefaultFormatter(resultSetMetaData.getColumnType(i));
-                }
-                if (formatter != null) {
-                    formatter.applyStyleAndValue(i, rs, cell);
-                } else {
-                    writeCell(meta, i, rs, cell);
-                }
-            }
-        }
-        sheet.autoSizeColumn(0);
-
-        if (!includeSql) {
-            return;
-        }
-
-        SXSSFSheet sqlSheet = wb.createSheet("tika-eval SQL");
-        sqlSheet.setColumnWidth(0, 100*250);
-        Row sqlRow = sqlSheet.createRow(0);
-        short height = 5000;
-        sqlRow.setHeight(height);
-        Cell cell = sqlRow.createCell(0);
-        cell.setCellStyle(sqlCellStyle);
-
-        cell.setCellValue(sql.trim());//.replaceAll("[\r\n]+", "\r\n"));
-    }
-
-    private XSLXCellFormatter getDefaultFormatter(int columnType) {
-        switch (columnType) {
-            case Types.INTEGER :
-                return defaultIntegerFormatter;
-            case Types.DOUBLE:
-            case Types.FLOAT:
-            case Types.DECIMAL:
-                return defaultDoubleFormatter;
-            default:
-                return null;
-        }
-    }
-
-    private void writeCell(ResultSetMetaData meta, int colIndex, ResultSet rs,
-                           Cell cell) throws SQLException {
-
-        switch(meta.getColumnType(colIndex)) {
-            //fall through on numerics
-            case Types.BIGINT:
-            case Types.SMALLINT:
-            case Types.INTEGER:
-            case Types.DOUBLE:
-            case Types.FLOAT:
-            case Types.DECIMAL:
-            case Types.REAL:
-            case Types.NUMERIC:
-                double dbl = rs.getDouble(colIndex);
-                if (rs.wasNull()) {
-                    cell.setCellValue(NULL_VALUE);
-                } else {
-                    cell.setCellValue(dbl);
-                }
-                break;
-            //fall through strings
-            case Types.BOOLEAN:
-            case Types.CHAR:
-            case Types.VARCHAR:
-            case Types.LONGNVARCHAR:
-                String val = rs.getString(colIndex);
-                if (rs.wasNull()) {
-                    cell.setCellValue(NULL_VALUE);
-                } else {
-                    cell.setCellValue(val);
-                }
-                break;
-            default:
-                if (rs.wasNull()) {
-                    cell.setCellValue(NULL_VALUE);
-                } else {
-                    cell.setCellValue(rs.getString(colIndex));
-                }
-                LOG.warn("Couldn't find type for: {}. Defaulting to String", meta.getColumnType(colIndex));
-        }
-    }
-
-}
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/reports/ResultsReporter.java b/tika-eval/src/main/java/org/apache/tika/eval/reports/ResultsReporter.java
deleted file mode 100644
index 3ede4e6..0000000
--- a/tika-eval/src/main/java/org/apache/tika/eval/reports/ResultsReporter.java
+++ /dev/null
@@ -1,313 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.eval.reports;
-
-
-import javax.xml.parsers.DocumentBuilder;
-import java.io.IOException;
-import java.io.InputStream;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.nio.file.Paths;
-import java.nio.file.StandardCopyOption;
-import java.sql.Connection;
-import java.sql.DatabaseMetaData;
-import java.sql.ResultSet;
-import java.sql.SQLException;
-import java.sql.Statement;
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-
-import org.apache.commons.cli.CommandLine;
-import org.apache.commons.cli.DefaultParser;
-import org.apache.commons.cli.HelpFormatter;
-import org.apache.commons.cli.Options;
-import org.apache.commons.cli.ParseException;
-import org.apache.poi.common.usermodel.HyperlinkType;
-import org.apache.tika.eval.ExtractComparer;
-import org.apache.tika.eval.ExtractProfiler;
-import org.apache.tika.eval.db.H2Util;
-import org.apache.tika.eval.db.JDBCUtil;
-import org.apache.tika.utils.XMLReaderUtils;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-import org.w3c.dom.Document;
-import org.w3c.dom.NamedNodeMap;
-import org.w3c.dom.Node;
-import org.w3c.dom.NodeList;
-
-public class ResultsReporter {
-    private static final Logger LOG = LoggerFactory.getLogger(ResultsReporter.class);
-
-    private static Options OPTIONS;
-
-    static {
-        OPTIONS = new Options();
-        OPTIONS.addOption("rd", "reportsDir", true, "directory for the reports. " +
-                "If not specified, will write to 'reports'" +
-                "BEWARE: Will overwrite existing reports without warning!")
-                .addOption("rf", "reportsFile", true, "xml specifying sql to call for the reports." +
-                        "If not specified, will use default reports in resources/tika-eval-*-config.xml")
-                .addOption("db", true, "default database (in memory H2). Specify a file name for the H2 database.")
-                .addOption("jdbc", true, "EXPERT: full jdbc connection string. Specify this or use -db <h2db_name>")
-                .addOption("jdbcdriver", true, "EXPERT: specify the jdbc driver class if all else fails")
-                .addOption("tablePrefix", true, "EXPERT: if not using the default tables, specify your table name prefix");
-
-    }
-
-    public static void USAGE() {
-        HelpFormatter helpFormatter = new HelpFormatter();
-        helpFormatter.printHelp(
-                80,
-                "java -jar tika-eval-x.y.jar Report -db mydb [-rd myreports] [-rf myreports.xml]",
-                "Tool: Report",
-                ResultsReporter.OPTIONS,
-                "Note: for h2 db, do not include the .mv.db at the end of the db name.");
-
-    }
-
-
-    List<String> before = new ArrayList<>();
-    List<String> after = new ArrayList<>();
-    List<Report> reports = new ArrayList<>();
-
-
-    private void addBefore(String b) {
-        before.add(b);
-    }
-
-    private void addAfter(String a) {
-        after.add(a);
-    }
-
-    private void addReport(Report r) {
-        reports.add(r);
-    }
-
-    public static ResultsReporter build(Path p) throws Exception {
-
-        ResultsReporter r = new ResultsReporter();
-
-        DocumentBuilder docBuilder = XMLReaderUtils.getDocumentBuilder();
-        Document doc;
-        try (InputStream is = Files.newInputStream(p)) {
-            doc = docBuilder.parse(is);
-        }
-        Node docElement = doc.getDocumentElement();
-        assert (docElement.getNodeName().equals("reports"));
-        NodeList children = docElement.getChildNodes();
-        for (int i = 0; i < children.getLength(); i++) {
-            Node n = children.item(i);
-            if ("before".equals(n.getNodeName())) {
-                for (String before : getSql(n)) {
-                    r.addBefore(before);
-                }
-            } else if ("after".equals(n.getNodeName())) {
-                for (String after : getSql(n)) {
-                    r.addAfter(after);
-                }
-            } else if ("report".equals(n.getNodeName())) {
-                Report report = buildReport(n);
-                r.addReport(report);
-            }
-        }
-
-        return r;
-    }
-
-    private static Report buildReport(Node n) {
-        NodeList children = n.getChildNodes();
-        Report r = new Report();
-        NamedNodeMap attrs = n.getAttributes();
-
-        r.includeSql = Boolean.parseBoolean(attrs.getNamedItem("includeSql").getNodeValue());
-        r.reportFilename = attrs.getNamedItem("reportFilename").getNodeValue();
-        r.reportName = attrs.getNamedItem("reportName").getNodeValue();
-
-        for (int i = 0; i < children.getLength(); i++) {
-            Node child = children.item(i);
-            if (child.getNodeType() != 1) {
-                continue;
-            }
-            if ("sql".equals(child.getNodeName())) {
-                if (r.sql != null) {
-                    throw new IllegalArgumentException("Can only have one sql statement per report");
-                }
-                r.sql = child.getTextContent();
-            } else if ("colformats".equals(child.getNodeName())) {
-                r.cellFormatters = getCellFormatters(child);
-            } else {
-                throw new IllegalArgumentException("Not expecting to see:" + child.getNodeName());
-            }
-        }
-        return r;
-    }
-
-    private static Map<String, XSLXCellFormatter> getCellFormatters(Node n) {
-        NodeList children = n.getChildNodes();
-        Map<String, XSLXCellFormatter> ret = new HashMap<>();
-        for (int i = 0; i < children.getLength(); i++) {
-            Node child = children.item(i);
-            if (child.getNodeType() != 1) {
-                continue;
-            }
-            NamedNodeMap attrs = child.getAttributes();
-            String columnName = attrs.getNamedItem("name").getNodeValue();
-            assert (!ret.containsKey(columnName));
-            String type = attrs.getNamedItem("type").getNodeValue();
-            if ("numberFormatter".equals(type)) {
-                String format = attrs.getNamedItem("format").getNodeValue();
-                XSLXCellFormatter f = new XLSXNumFormatter(format);
-                ret.put(columnName, f);
-            } else if ("urlLink".equals(type)) {
-                String base = "";
-                Node baseNode = attrs.getNamedItem("base");
-                if (baseNode != null) {
-                    base = baseNode.getNodeValue();
-                }
-                XLSXHREFFormatter f = new XLSXHREFFormatter(base, HyperlinkType.URL);
-                ret.put(columnName, f);
-            } else if ("fileLink".equals(type)) {
-                String base = "";
-                Node baseNode = attrs.getNamedItem("base");
-                if (baseNode != null) {
-                    base = baseNode.getNodeValue();
-                }
-                XLSXHREFFormatter f = new XLSXHREFFormatter(base, HyperlinkType.FILE);
-                ret.put(columnName, f);
-            }
-        }
-        return ret;
-    }
-
-    private static List<String> getSql(Node n) {
-        List<String> ret = new ArrayList<>();
-
-        NodeList children = n.getChildNodes();
-
-        for (int i = 0; i < children.getLength(); i++) {
-            Node child = children.item(i);
-            if (child.getNodeType() != 1) {
-                continue;
-            }
-            ret.add(child.getTextContent());
-        }
-        return ret;
-    }
-
-    public static void main(String[] args) throws Exception {
-
-        DefaultParser defaultCLIParser = new DefaultParser();
-        CommandLine commandLine = null;
-        try {
-            commandLine = defaultCLIParser.parse(OPTIONS, args);
-        } catch (ParseException e) {
-            System.out.println(e.getMessage());
-            USAGE();
-            return;
-        }
-        JDBCUtil dbUtil = null;
-        if (commandLine.hasOption("db")) {
-            String dbString = commandLine.getOptionValue("db");
-            if (dbString.endsWith(".mv.db")) {
-                dbString = dbString.substring(0, dbString.length()-6);
-                LOG.debug("trimming .mv.db from db name");
-            }
-            Path db = Paths.get(dbString);
-            if (!H2Util.databaseExists(db)) {
-                throw new RuntimeException("I'm sorry, but I couldn't find this h2 database: " + db);
-            }
-            dbUtil = new H2Util(db);
-        } else if (commandLine.hasOption("jdbc")) {
-            String driverClass = null;
-            if (commandLine.hasOption("jdbcdriver")) {
-                driverClass = commandLine.getOptionValue("jdbcdriver");
-            }
-            dbUtil = new JDBCUtil(commandLine.getOptionValue("jdbc"), driverClass);
-        } else {
-            System.err.println("Must specify either -db for the default in-memory h2 database\n" +
-                    "or -jdbc for a full jdbc connection string");
-            USAGE();
-            return;
-        }
-        try (Connection c = dbUtil.getConnection()) {
-            Path tmpReportsFile = null;
-            try {
-                ResultsReporter resultsReporter = null;
-                String reportsFile = commandLine.getOptionValue("rf");
-                if (reportsFile == null) {
-                    tmpReportsFile = getDefaultReportsConfig(c);
-                    resultsReporter = ResultsReporter.build(tmpReportsFile);
-                } else {
-                    resultsReporter = ResultsReporter.build(Paths.get(reportsFile));
-                }
-
-                Path reportsRootDirectory = Paths.get(commandLine.getOptionValue("rd", "reports"));
-                if (Files.isDirectory(reportsRootDirectory)) {
-                    LOG.warn("'Reports' directory exists.  Will overwrite existing reports.");
-                }
-
-                resultsReporter.execute(c, reportsRootDirectory);
-            } finally {
-                if (tmpReportsFile != null) {
-                    Files.delete(tmpReportsFile);
-                }
-            }
-        }
-    }
-
-    private static Path getDefaultReportsConfig(Connection c) throws IOException, SQLException {
-        DatabaseMetaData md = c.getMetaData();
-        String internalPath = null;
-        try (ResultSet rs = md.getTables(null, null, "%", null)) {
-            while (rs.next()) {
-                String tName = rs.getString(3);
-                if (ExtractComparer.CONTENTS_TABLE_B.getName().equalsIgnoreCase(tName)) {
-                    internalPath = "/comparison-reports.xml";
-                    break;
-                } else if (ExtractProfiler.PROFILE_TABLE.getName().equalsIgnoreCase(tName)) {
-                    internalPath = "/profile-reports.xml";
-                    break;
-                }
-            }
-        }
-
-        if (internalPath == null) {
-            throw new RuntimeException("Couldn't determine if this database was a 'profiler' or 'comparison' db");
-        }
-        Path tmp = Files.createTempFile("tmp-tika-reports", ".xml");
-        Files.copy(ResultsReporter.class.getResourceAsStream(internalPath), tmp, StandardCopyOption.REPLACE_EXISTING);
-        return tmp;
-    }
-
-    public void execute(Connection c, Path reportsDirectory) throws IOException, SQLException {
-        Statement st = c.createStatement();
-        for (String sql : before) {
-            LOG.info("processing before: {}", sql);
-            st.execute(sql);
-        }
-        for (Report r : reports) {
-            r.writeReport(c, reportsDirectory);
-        }
-        for (String sql : after) {
-            LOG.info("processing after: {}", sql);
-            st.execute(sql);
-        }
-    }
-}
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/reports/XLSXHREFFormatter.java b/tika-eval/src/main/java/org/apache/tika/eval/reports/XLSXHREFFormatter.java
deleted file mode 100644
index 11b72f4..0000000
--- a/tika-eval/src/main/java/org/apache/tika/eval/reports/XLSXHREFFormatter.java
+++ /dev/null
@@ -1,80 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.eval.reports;
-
-import java.nio.file.Paths;
-import java.sql.ResultSet;
-import java.sql.SQLException;
-
-import org.apache.poi.common.usermodel.HyperlinkType;
-import org.apache.poi.ss.usermodel.Cell;
-import org.apache.poi.ss.usermodel.CellStyle;
-import org.apache.poi.ss.usermodel.Font;
-import org.apache.poi.ss.usermodel.Hyperlink;
-import org.apache.poi.ss.usermodel.IndexedColors;
-import org.apache.poi.xssf.usermodel.XSSFWorkbook;
-
-import javax.swing.event.HyperlinkListener;
-
-
-public class XLSXHREFFormatter implements XSLXCellFormatter {
-    //xlsx files can only have this many hyperlinks
-    //if they have more Excel can't read the file
-    private static final int MAX_HYPERLINKS = 65000;
-
-
-    private final String urlBase;
-    private final HyperlinkType linkType;
-    private XSSFWorkbook workbook;
-    private CellStyle style;
-    private int links = 0;
-
-    public XLSXHREFFormatter(String urlBase,
-                             HyperlinkType hyperlinkType) {
-        this.urlBase = urlBase;
-        this.linkType = hyperlinkType;
-    }
-
-    @Override
-    public void reset(XSSFWorkbook workbook) {
-        this.workbook = workbook;
-        style = workbook.createCellStyle();
-        Font hlinkFont = workbook.createFont();
-        hlinkFont.setUnderline(Font.U_SINGLE);
-        hlinkFont.setColor(IndexedColors.BLUE.getIndex());
-        style.setFont(hlinkFont);
-        links = 0;
-
-    }
-
-    @Override
-    public void applyStyleAndValue(int dbColNum, ResultSet resultSet, Cell cell) throws SQLException {
-        if (links < MAX_HYPERLINKS) {
-            Hyperlink hyperlink = workbook.getCreationHelper().createHyperlink(linkType);
-            String path = resultSet.getString(dbColNum);
-            String address = urlBase+path;
-            hyperlink.setAddress(address);
-            cell.setHyperlink(hyperlink);
-            cell.setCellStyle(style);
-            String fName = Paths.get(path).getFileName().toString();
-            cell.setCellValue(fName);
-            links++;
-        } else {
-            //silently stop adding hyperlinks
-        }
-    }
-}
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/reports/XLSXNumFormatter.java b/tika-eval/src/main/java/org/apache/tika/eval/reports/XLSXNumFormatter.java
deleted file mode 100644
index b2ba5b0..0000000
--- a/tika-eval/src/main/java/org/apache/tika/eval/reports/XLSXNumFormatter.java
+++ /dev/null
@@ -1,54 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.tika.eval.reports;
-
-import java.sql.ResultSet;
-import java.sql.SQLException;
-
-import org.apache.poi.ss.usermodel.Cell;
-import org.apache.poi.ss.usermodel.CellStyle;
-import org.apache.poi.xssf.usermodel.XSSFWorkbook;
-
-class XLSXNumFormatter implements XSLXCellFormatter {
-
-    private final String formatString;
-    private CellStyle style;
-
-    XLSXNumFormatter(String formatString) {
-        this.formatString = formatString;
-    }
-
-
-    @Override
-    public void reset(XSSFWorkbook workbook) {
-        style = workbook.createCellStyle();
-        style.setDataFormat(workbook.getCreationHelper()
-                .createDataFormat().getFormat(formatString));
-    }
-
-    @Override
-    public void applyStyleAndValue(int dbColNum, ResultSet resultSet, Cell cell) throws SQLException {
-        double d = resultSet.getDouble(dbColNum);
-        if (resultSet.wasNull()) {
-
-        } else {
-            cell.setCellStyle(style);
-        }
-        cell.setCellValue(resultSet.getDouble(dbColNum));
-    }
-}
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/reports/XSLXCellFormatter.java b/tika-eval/src/main/java/org/apache/tika/eval/reports/XSLXCellFormatter.java
deleted file mode 100644
index eb0e024..0000000
--- a/tika-eval/src/main/java/org/apache/tika/eval/reports/XSLXCellFormatter.java
+++ /dev/null
@@ -1,30 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.eval.reports;
-
-
-import java.sql.ResultSet;
-import java.sql.SQLException;
-
-import org.apache.poi.ss.usermodel.Cell;
-import org.apache.poi.xssf.usermodel.XSSFWorkbook;
-
-interface XSLXCellFormatter {
-
-    public void reset(XSSFWorkbook workbook);
-    public void applyStyleAndValue(int dbColNum, ResultSet resultSet, Cell cell) throws SQLException;
-}
\ No newline at end of file
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/tools/BatchTopCommonTokenCounter.java b/tika-eval/src/main/java/org/apache/tika/eval/tools/BatchTopCommonTokenCounter.java
deleted file mode 100644
index 70d828e..0000000
--- a/tika-eval/src/main/java/org/apache/tika/eval/tools/BatchTopCommonTokenCounter.java
+++ /dev/null
@@ -1,54 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.eval.tools;
-
-import java.nio.file.Path;
-import java.nio.file.Paths;
-import java.util.List;
-import java.util.Map;
-
-import org.apache.tika.utils.ProcessUtils;
-
-/**
- * Utility class that runs TopCommonTokenCounter against a directory
- * of table files (named {lang}_table.gz or leipzip-like afr_...-sentences.txt)
- * and outputs common tokens files for each input table file in the output directory.
- */
-public class BatchTopCommonTokenCounter {
-
-    public static void main(String[] args) throws Exception {
-
-        Path commonTokensDir = Paths.get(args[0]);
-        Path tableFileDir = Paths.get(args[1]);
-        Map<String, List<Path>> langFiles = LeipzigHelper.getFiles(tableFileDir);
-
-        for (Map.Entry<String, List<Path>> e : langFiles.entrySet()) {
-
-            String[] cmd = new String[e.getValue().size()+1];
-            Path commonTokensFile = commonTokensDir.resolve(e.getKey());
-            cmd[0] = ProcessUtils.escapeCommandLine(commonTokensFile.toAbsolutePath().toString());
-            for (int i = 0; i < e.getValue().size(); i++) {
-                cmd[i+1] =
-                        ProcessUtils.escapeCommandLine(
-                                e.getValue().get(i).toAbsolutePath().toString());
-            }
-            TopCommonTokenCounter.main(
-                cmd
-            );
-        }
-    }
-}
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/tools/CommonTokenOverlapCounter.java b/tika-eval/src/main/java/org/apache/tika/eval/tools/CommonTokenOverlapCounter.java
deleted file mode 100644
index 3bd8198..0000000
--- a/tika-eval/src/main/java/org/apache/tika/eval/tools/CommonTokenOverlapCounter.java
+++ /dev/null
@@ -1,69 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.eval.tools;
-
-import java.io.File;
-import java.io.IOException;
-import java.nio.file.Path;
-import java.nio.file.Paths;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.Locale;
-import java.util.Set;
-
-import org.apache.tika.eval.tokens.CommonTokenCountManager;
-
-public class CommonTokenOverlapCounter {
-
-    public static void main(String[] args) throws Exception {
-        Path commonTokensDir = Paths.get(args[0]);
-        CommonTokenOverlapCounter counter = new CommonTokenOverlapCounter();
-        counter.execute(commonTokensDir);
-    }
-
-    private void execute(Path commonTokensDir) throws IOException {
-        List<String> langs = new ArrayList<>();
-        for (File f : commonTokensDir.toFile().listFiles()) {
-            langs.add(f.getName());
-        }
-        CommonTokenCountManager mgr = new CommonTokenCountManager(commonTokensDir, "");
-        for (int i = 0; i < langs.size()-1; i++) {
-            for (int j = i+1; j < langs.size(); j++) {
-                compare(langs.get(i), langs.get(j), mgr);
-            }
-        }
-    }
-
-    private void compare(String langA, String langB, CommonTokenCountManager mgr) {
-        int overlap = 0;
-        int denom = 0;
-        Set<String> setA = mgr.getTokens(langA);
-        Set<String> setB = mgr.getTokens(langB);
-        for (String a : setA) {
-            if (setB.contains(a)) {
-                overlap += 2;
-            }
-        }
-        denom = setA.size()+setB.size();
-        double percent = (double)overlap/(double)denom;
-        if (percent > 0.01) {
-            System.out.println(String.format(Locale.US, "%s %s %.2f", langA, langB, percent));
-        }
-    }
-
-
-}
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/tools/LeipzigHelper.java b/tika-eval/src/main/java/org/apache/tika/eval/tools/LeipzigHelper.java
deleted file mode 100644
index daddd25..0000000
--- a/tika-eval/src/main/java/org/apache/tika/eval/tools/LeipzigHelper.java
+++ /dev/null
@@ -1,57 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.eval.tools;
-
-import java.io.File;
-import java.io.IOException;
-import java.nio.file.Path;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.Map;
-import java.util.TreeMap;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
-public class LeipzigHelper {
-
-    static Map<String, List<Path>> getFiles(Path leipzigDir) throws IOException {
-        Matcher tableMatcher = Pattern.compile("([a-z]+)_table(\\.txt)?(\\.gz)?$").matcher("");
-        Matcher leipzigMatcher = Pattern.compile("([a-z]{3,3})[-_].*$").matcher("");
-
-        Map<String, List<Path>> m = new TreeMap<>();
-        for (File f : leipzigDir.toFile().listFiles()) {
-            System.err.println(f);
-            String lang = null;
-            if (tableMatcher.reset(f.getName()).find()) {
-                lang = tableMatcher.group(1);
-            } else if (leipzigMatcher.reset(f.getName()).find()) {
-                lang = leipzigMatcher.group(1);
-            }
-            if (lang == null) {
-                System.err.println("couldn't find a lang: " + f);
-                continue;
-            }
-            List<Path> files = m.get(lang);
-            if (files == null) {
-                files = new ArrayList<>();
-            }
-            files.add(f.toPath());
-            m.put(lang, files);
-        }
-        return m;
-    }
-}
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/tools/LeipzigSampler.java b/tika-eval/src/main/java/org/apache/tika/eval/tools/LeipzigSampler.java
deleted file mode 100644
index 5250505..0000000
--- a/tika-eval/src/main/java/org/apache/tika/eval/tools/LeipzigSampler.java
+++ /dev/null
@@ -1,76 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.eval.tools;
-
-import java.io.BufferedReader;
-import java.io.BufferedWriter;
-import java.io.IOException;
-import java.nio.charset.StandardCharsets;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.nio.file.Paths;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.List;
-import java.util.Map;
-
-public class LeipzigSampler {
-
-    public static void main(String[] args) throws Exception {
-        int sentsPerLanguage = 10;
-        Path leipzigDir = Paths.get(args[0]);
-        Path sampleFile = Paths.get(args[1]);
-        LeipzigSampler leipzigSampler = new LeipzigSampler();
-        try (BufferedWriter writer = Files.newBufferedWriter(sampleFile, StandardCharsets.UTF_8)) {
-            leipzigSampler.execute(leipzigDir, sentsPerLanguage, writer);
-        }
-    }
-
-    private void execute(Path leipzigDir, int sentsPerLang, BufferedWriter writer) throws IOException {
-        Map<String, List<Path>> fileMap = LeipzigHelper.getFiles(leipzigDir);
-        for (Map.Entry<String, List<Path>> e : fileMap.entrySet()) {
-            List<String> sentences = new ArrayList<>();
-            for (Path p : e.getValue()) {
-                addSentences(p, sentences);
-            }
-            Collections.shuffle(sentences);
-            String lang = e.getKey();
-            for (int i = 0; i < sentsPerLang; i++) {
-                writer.write(row(lang, sentences.get(i)));
-            }
-        }
-    }
-
-    private void addSentences(Path p, List<String> sentences) throws IOException {
-        try (BufferedReader reader = Files.newBufferedReader(p, StandardCharsets.UTF_8)) {
-            String line = reader.readLine();
-            while (line != null) {
-                int tab = line.indexOf("\t");
-                if (tab > -1) {
-                    line = line.substring(tab + 1);
-                }
-                sentences.add(line);
-                line = reader.readLine();
-            }
-        }
-    }
-
-    private String row(String lang, String s) {
-        s = s.replaceAll("\\s+", " ");
-        return lang+"\t"+s+"\n";
-    }
-}
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/tools/SlowCompositeReaderWrapper.java b/tika-eval/src/main/java/org/apache/tika/eval/tools/SlowCompositeReaderWrapper.java
deleted file mode 100644
index 66283ff..0000000
--- a/tika-eval/src/main/java/org/apache/tika/eval/tools/SlowCompositeReaderWrapper.java
+++ /dev/null
@@ -1,327 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.eval.tools;
-
-import java.io.IOException;
-import java.util.HashMap;
-import java.util.Map;
-import java.util.concurrent.ConcurrentHashMap;
-
-
-import org.apache.lucene.index.BinaryDocValues;
-import org.apache.lucene.index.CompositeReader;
-import org.apache.lucene.index.DirectoryReader;
-import org.apache.lucene.index.DocValues;
-import org.apache.lucene.index.DocValuesType;
-import org.apache.lucene.index.FieldInfo;
-import org.apache.lucene.index.FieldInfos;
-import org.apache.lucene.index.Fields;
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.LeafMetaData;
-import org.apache.lucene.index.LeafReader;
-import org.apache.lucene.index.LeafReaderContext;
-import org.apache.lucene.index.MultiBits;
-import org.apache.lucene.index.MultiDocValues;
-import org.apache.lucene.index.MultiDocValues.MultiSortedDocValues;
-import org.apache.lucene.index.MultiReader;
-import org.apache.lucene.index.MultiTerms;
-import org.apache.lucene.index.NumericDocValues;
-import org.apache.lucene.index.OrdinalMap;
-import org.apache.lucene.index.PointValues;
-import org.apache.lucene.index.SortedDocValues;
-import org.apache.lucene.index.SortedNumericDocValues;
-import org.apache.lucene.index.SortedSetDocValues;
-import org.apache.lucene.index.StoredFieldVisitor;
-import org.apache.lucene.index.Terms;
-import org.apache.lucene.util.Bits;
-import org.apache.lucene.util.Version;
-
-/**
- * <b>COPIED VERBATIM FROM LUCENE</b>
- * This class forces a composite reader (eg a {@link
- * MultiReader} or {@link DirectoryReader}) to emulate a
- * {@link LeafReader}.  This requires implementing the postings
- * APIs on-the-fly, using the static methods in {@link
- * MultiTerms}, {@link MultiDocValues}, by stepping through
- * the sub-readers to merge fields/terms, appending docs, etc.
- *
- * <p><b>NOTE</b>: this class almost always results in a
- * performance hit.  If this is important to your use case,
- * you'll get better performance by gathering the sub readers using
- * {@link IndexReader#getContext()} to get the
- * leaves and then operate per-LeafReader,
- * instead of using this class.
- */
-
-public final class SlowCompositeReaderWrapper extends LeafReader {
-
-    private final CompositeReader in;
-    private final LeafMetaData metaData;
-
-    // Cached copy of FieldInfos to prevent it from being re-created on each
-    // getFieldInfos call.  Most (if not all) other LeafReader implementations
-    // also have a cached FieldInfos instance so this is consistent. SOLR-12878
-    private final FieldInfos fieldInfos;
-
-    final Map<String, Terms> cachedTerms = new ConcurrentHashMap<>();
-
-    // TODO: consider ConcurrentHashMap ?
-    // TODO: this could really be a weak map somewhere else on the coreCacheKey,
-    // but do we really need to optimize slow-wrapper any more?
-    final Map<String, OrdinalMap> cachedOrdMaps = new HashMap<>();
-
-    /** This method is sugar for getting an {@link LeafReader} from
-     * an {@link IndexReader} of any kind. If the reader is already atomic,
-     * it is returned unchanged, otherwise wrapped by this class.
-     */
-    public static LeafReader wrap(IndexReader reader) throws IOException {
-        if (reader instanceof CompositeReader) {
-            return new SlowCompositeReaderWrapper((CompositeReader) reader);
-        } else {
-            assert reader instanceof LeafReader;
-            return (LeafReader) reader;
-        }
-    }
-
-    SlowCompositeReaderWrapper(CompositeReader reader) throws IOException {
-        in = reader;
-        in.registerParentReader(this);
-        if (reader.leaves().isEmpty()) {
-            metaData = new LeafMetaData(Version.LATEST.major, Version.LATEST, null);
-        } else {
-            Version minVersion = Version.LATEST;
-            for (LeafReaderContext leafReaderContext : reader.leaves()) {
-                Version leafVersion = leafReaderContext.reader().getMetaData().getMinVersion();
-                if (leafVersion == null) {
-                    minVersion = null;
-                    break;
-                } else if (minVersion.onOrAfter(leafVersion)) {
-                    minVersion = leafVersion;
-                }
-            }
-            metaData = new LeafMetaData(reader.leaves().get(0).reader().getMetaData().getCreatedVersionMajor(), minVersion, null);
-        }
-        fieldInfos = FieldInfos.getMergedFieldInfos(in);
-    }
-
-    @Override
-    public String toString() {
-        return "SlowCompositeReaderWrapper(" + in + ")";
-    }
-
-    @Override
-    public CacheHelper getReaderCacheHelper() {
-        return in.getReaderCacheHelper();
-    }
-
-    @Override
-    public CacheHelper getCoreCacheHelper() {
-        // TODO: this is trappy as the expectation is that core keys live for a long
-        // time, but here we need to bound it to the lifetime of the wrapped
-        // composite reader? Unfortunately some features seem to rely on this...
-        return in.getReaderCacheHelper();
-    }
-
-    @Override
-    public Terms terms(String field) throws IOException {
-        ensureOpen();
-        try {
-            return cachedTerms.computeIfAbsent(field, f -> {
-                try {
-                    return MultiTerms.getTerms(in, f);
-                } catch (IOException e) { // yuck!  ...sigh... checked exceptions with built-in lambdas are a pain
-                    throw new RuntimeException("unwrapMe", e);
-                }
-            });
-        } catch (RuntimeException e) {
-            if (e.getMessage().equals("unwrapMe") && e.getCause() instanceof IOException) {
-                throw (IOException) e.getCause();
-            }
-            throw e;
-        }
-    }
-
-    @Override
-    public NumericDocValues getNumericDocValues(String field) throws IOException {
-        ensureOpen();
-        return MultiDocValues.getNumericValues(in, field); // TODO cache?
-    }
-
-    @Override
-    public BinaryDocValues getBinaryDocValues(String field) throws IOException {
-        ensureOpen();
-        return MultiDocValues.getBinaryValues(in, field); // TODO cache?
-    }
-
-    @Override
-    public SortedNumericDocValues getSortedNumericDocValues(String field) throws IOException {
-        ensureOpen();
-        return MultiDocValues.getSortedNumericValues(in, field); // TODO cache?
-    }
-
-    @Override
-    public SortedDocValues getSortedDocValues(String field) throws IOException {
-        ensureOpen();
-        OrdinalMap map = null;
-        synchronized (cachedOrdMaps) {
-            map = cachedOrdMaps.get(field);
-            if (map == null) {
-                // uncached, or not a multi dv
-                SortedDocValues dv = MultiDocValues.getSortedValues(in, field);
-                if (dv instanceof MultiSortedDocValues) {
-                    map = ((MultiSortedDocValues)dv).mapping;
-                    CacheHelper cacheHelper = getReaderCacheHelper();
-                    if (cacheHelper != null && map.owner == cacheHelper.getKey()) {
-                        cachedOrdMaps.put(field, map);
-                    }
-                }
-                return dv;
-            }
-        }
-        int size = in.leaves().size();
-        final SortedDocValues[] values = new SortedDocValues[size];
-        final int[] starts = new int[size+1];
-        long totalCost = 0;
-        for (int i = 0; i < size; i++) {
-            LeafReaderContext context = in.leaves().get(i);
-            final LeafReader reader = context.reader();
-            final FieldInfo fieldInfo = reader.getFieldInfos().fieldInfo(field);
-            if (fieldInfo != null && fieldInfo.getDocValuesType() != DocValuesType.SORTED) {
-                return null;
-            }
-            SortedDocValues v = reader.getSortedDocValues(field);
-            if (v == null) {
-                v = DocValues.emptySorted();
-            }
-            totalCost += v.cost();
-            values[i] = v;
-            starts[i] = context.docBase;
-        }
-        starts[size] = maxDoc();
-        return new MultiSortedDocValues(values, starts, map, totalCost);
-    }
-
-    @Override
-    public SortedSetDocValues getSortedSetDocValues(String field) throws IOException {
-        ensureOpen();
-        OrdinalMap map = null;
-        synchronized (cachedOrdMaps) {
-            map = cachedOrdMaps.get(field);
-            if (map == null) {
-                // uncached, or not a multi dv
-                SortedSetDocValues dv = MultiDocValues.getSortedSetValues(in, field);
-                if (dv instanceof MultiDocValues.MultiSortedSetDocValues) {
-                    map = ((MultiDocValues.MultiSortedSetDocValues)dv).mapping;
-                    CacheHelper cacheHelper = getReaderCacheHelper();
-                    if (cacheHelper != null && map.owner == cacheHelper.getKey()) {
-                        cachedOrdMaps.put(field, map);
-                    }
-                }
-                return dv;
-            }
-        }
-
-        assert map != null;
-        int size = in.leaves().size();
-        final SortedSetDocValues[] values = new SortedSetDocValues[size];
-        final int[] starts = new int[size+1];
-        long cost = 0;
-        for (int i = 0; i < size; i++) {
-            LeafReaderContext context = in.leaves().get(i);
-            final LeafReader reader = context.reader();
-            final FieldInfo fieldInfo = reader.getFieldInfos().fieldInfo(field);
-            if(fieldInfo != null && fieldInfo.getDocValuesType() != DocValuesType.SORTED_SET){
-                return null;
-            }
-            SortedSetDocValues v = reader.getSortedSetDocValues(field);
-            if (v == null) {
-                v = DocValues.emptySortedSet();
-            }
-            values[i] = v;
-            starts[i] = context.docBase;
-            cost += v.cost();
-        }
-        starts[size] = maxDoc();
-        return new MultiDocValues.MultiSortedSetDocValues(values, starts, map, cost);
-    }
-
-    @Override
-    public NumericDocValues getNormValues(String field) throws IOException {
-        ensureOpen();
-        return MultiDocValues.getNormValues(in, field); // TODO cache?
-    }
-
-    @Override
-    public Fields getTermVectors(int docID) throws IOException {
-        ensureOpen();
-        return in.getTermVectors(docID);
-    }
-
-    @Override
-    public int numDocs() {
-        // Don't call ensureOpen() here (it could affect performance)
-        return in.numDocs();
-    }
-
-    @Override
-    public int maxDoc() {
-        // Don't call ensureOpen() here (it could affect performance)
-        return in.maxDoc();
-    }
-
-    @Override
-    public void document(int docID, StoredFieldVisitor visitor) throws IOException {
-        ensureOpen();
-        in.document(docID, visitor);
-    }
-
-    @Override
-    public Bits getLiveDocs() {
-        ensureOpen();
-        return MultiBits.getLiveDocs(in); // TODO cache?
-    }
-
-    @Override
-    public PointValues getPointValues(String field) {
-        ensureOpen();
-        return null; // because not supported.  Throw UOE?
-    }
-
-    @Override
-    public FieldInfos getFieldInfos() {
-        return fieldInfos;
-    }
-
-    @Override
-    protected void doClose() throws IOException {
-        // TODO: as this is a wrapper, should we really close the delegate?
-        in.close();
-    }
-
-    @Override
-    public void checkIntegrity() throws IOException {
-        ensureOpen();
-        for (LeafReaderContext ctx : in.leaves()) {
-            ctx.reader().checkIntegrity();
-        }
-    }
-
-    @Override
-    public LeafMetaData getMetaData() {
-        return metaData;
-    }
-}
\ No newline at end of file
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/tools/TopCommonTokenCounter.java b/tika-eval/src/main/java/org/apache/tika/eval/tools/TopCommonTokenCounter.java
deleted file mode 100644
index b63546b..0000000
--- a/tika-eval/src/main/java/org/apache/tika/eval/tools/TopCommonTokenCounter.java
+++ /dev/null
@@ -1,401 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.eval.tools;
-
-import java.io.BufferedReader;
-import java.io.BufferedWriter;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.InputStreamReader;
-import java.nio.charset.StandardCharsets;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.nio.file.Paths;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Objects;
-import java.util.Set;
-
-import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
-import org.apache.commons.io.FileUtils;
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field;
-import org.apache.lucene.document.TextField;
-import org.apache.lucene.index.DirectoryReader;
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.IndexWriter;
-import org.apache.lucene.index.IndexWriterConfig;
-import org.apache.lucene.index.LeafReader;
-import org.apache.lucene.index.Terms;
-import org.apache.lucene.index.TermsEnum;
-import org.apache.lucene.store.Directory;
-import org.apache.lucene.store.FSDirectory;
-import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.PriorityQueue;
-import org.apache.tika.eval.tokens.AnalyzerManager;
-import org.apache.tika.eval.tokens.URLEmailNormalizingFilterFactory;
-import org.apache.tika.utils.ProcessUtils;
-
-/**
- * Utility class that reads in a UTF-8 input file with one document per row
- * and outputs the 30,000 (TOP_N) tokens with the highest document frequencies.
- *
- * The CommonTokensAnalyzer intentionally drops tokens shorter than 4 characters,
- * but includes bigrams for CJK.
- *
- * It also has an include list for __email__ and __url__ and a skip list
- * for common HTML markup terms.
- */
-public class TopCommonTokenCounter {
-
-    private static String LICENSE =
-            "# Licensed to the Apache Software Foundation (ASF) under one or more\n" +
-            "# contributor license agreements.  See the NOTICE file distributed with\n" +
-            "# this work for additional information regarding copyright ownership.\n" +
-            "# The ASF licenses this file to You under the Apache License, Version 2.0\n" +
-            "# (the \"License\"); you may not use this file except in compliance with\n" +
-            "# the License.  You may obtain a copy of the License at\n" +
-            "#\n" +
-            "#     http://www.apache.org/licenses/LICENSE-2.0\n" +
-            "#\n" +
-            "# Unless required by applicable law or agreed to in writing, software\n" +
-            "# distributed under the License is distributed on an \"AS IS\" BASIS,\n" +
-            "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n" +
-            "# See the License for the specific language governing permissions and\n" +
-            "# limitations under the License.\n"+
-            "#\n";
-
-    private static final String FIELD = "f";
-    private static int TOP_N = 30000;
-    private static int MIN_DOC_FREQ = 10;
-    //these should exist in every list
-    static Set<String> INCLUDE_LIST = new HashSet<>(Arrays.asList(
-            new String[] {
-                    URLEmailNormalizingFilterFactory.URL,
-                    URLEmailNormalizingFilterFactory.EMAIL
-            }
-    ));
-
-    //words to ignore
-    //these are common HTML markup words that we do not want
-    //to count in case markup processing failed.
-    //see: https://issues.apache.org/jira/browse/TIKA-2267?focusedCommentId=15872055&page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel#comment-15872055
-    static Set<String> SKIP_LIST = new HashSet<>(Arrays.asList(
-            "span",
-            "table",
-            "href",
-            "head",
-            "title",
-            "body",
-            "html",
-            "tagname",
-            "lang",
-            "style",
-            "script",
-            "strong",
-            "blockquote",
-            "form",
-            "iframe",
-            "section",
-            "colspan",
-            "rowspan"
-    ));
-
-    public static void main(String[] args) throws Exception {
-        Path commonTokensFile = Paths.get(args[0]);
-        List<Path> inputFiles = new ArrayList<>();
-        for (int i = 1; i < args.length; i++) {
-            inputFiles.add(Paths.get(
-                    ProcessUtils.unescapeCommandLine(args[i])));
-        }
-        TopCommonTokenCounter counter = new TopCommonTokenCounter();
-        if (Files.exists(commonTokensFile)) {
-            System.err.println(commonTokensFile.getFileName().toString()+
-                    " exists. I'm skipping this.");
-            return;
-        }
-        counter.execute(commonTokensFile, inputFiles);
-    }
-
-    private void execute(Path commonTokensFile, List<Path> inputFiles) throws Exception {
-        Path luceneDir = Files.createTempDirectory("tika-eval-lucene-");
-        AbstractTokenTFDFPriorityQueue queue = new TokenDFPriorityQueue(TOP_N);
-        long totalDocs = -1;
-        long sumDocFreqs = -1;
-        long sumTotalTermFreqs = -1;
-        long uniqueTerms = -1;
-        try (Directory directory = FSDirectory.open(luceneDir)) {
-
-            AnalyzerManager analyzerManager = AnalyzerManager.newInstance(-1);
-
-            Analyzer analyzer = analyzerManager.getCommonTokensAnalyzer();
-            IndexWriterConfig indexWriterConfig = new IndexWriterConfig(analyzer);
-            int maxLen = 1000000;
-            int len = 0;
-            try (IndexWriter writer = new IndexWriter(directory, indexWriterConfig)) {
-                List<Document> docs = new ArrayList<>();
-                for (Path inputFile : inputFiles) {
-                    //total hack
-                    boolean isLeipzig = false;
-                    if (inputFile.getFileName().toString().contains("-sentences.txt")) {
-                        isLeipzig = true;
-                    }
-                    int lines = 0;
-                    try (BufferedReader reader = getReader(inputFile)) {
-                        String line = reader.readLine();
-                        while (line != null) {
-                            if (isLeipzig) {
-                                int tab = line.indexOf("\t");
-                                if (tab > -1) {
-                                    line = line.substring(tab+1);
-                                }
-                            }
-                            len += line.length();
-                            Document document = new Document();
-                            document.add(new TextField(FIELD, line, Field.Store.NO));
-                            docs.add(document);
-                            if (len > maxLen) {
-                                writer.addDocuments(docs);
-                                docs.clear();
-                                len = 0;
-                            }
-                            line = reader.readLine();
-                            if (++lines % 100000 == 0) {
-                                System.out.println("processed "+lines +
-                                        " for "+inputFile.getFileName()
-                                + " :: "+ commonTokensFile.toAbsolutePath());
-                            }
-                        }
-                    }
-                }
-                if (docs.size() > 0) {
-                    writer.addDocuments(docs);
-                }
-                writer.commit();
-                writer.flush();
-            }
-
-            try (IndexReader reader = DirectoryReader.open(directory)) {
-                LeafReader wrappedReader = SlowCompositeReaderWrapper.wrap(reader);
-                totalDocs = wrappedReader.getDocCount(FIELD);
-                sumDocFreqs = wrappedReader.getSumDocFreq(FIELD);
-                sumTotalTermFreqs = wrappedReader.getSumTotalTermFreq(FIELD);
-
-                Terms terms = wrappedReader.terms(FIELD);
-                TermsEnum termsEnum = terms.iterator();
-                BytesRef bytesRef = termsEnum.next();
-                int docsWThisField = wrappedReader.getDocCount(FIELD);
-                while (bytesRef != null) {
-                    uniqueTerms++;
-                    int df = termsEnum.docFreq();
-                    long tf = termsEnum.totalTermFreq();
-                    if (MIN_DOC_FREQ > -1 && df < MIN_DOC_FREQ) {
-                        bytesRef = termsEnum.next();
-                        continue;
-                    }
-
-                    if (queue.top() == null || queue.size() < TOP_N ||
-                            df >= queue.top().df) {
-                        String t = bytesRef.utf8ToString();
-                        if (! SKIP_LIST.contains(t)) {
-                            queue.insertWithOverflow(new TokenDFTF(t, df, tf));
-                        }
-
-                    }
-                    bytesRef = termsEnum.next();
-                }
-            }
-        } finally {
-            FileUtils.deleteDirectory(luceneDir.toFile());
-        }
-
-        writeTopN(commonTokensFile, totalDocs,
-                sumDocFreqs, sumTotalTermFreqs, uniqueTerms, queue);
-
-
-    }
-
-    private BufferedReader getReader(Path inputFile) throws IOException {
-        InputStream is = Files.newInputStream(inputFile);
-        if (inputFile.toString().endsWith(".gz")) {
-            is = new GzipCompressorInputStream(is);
-        }
-        return new BufferedReader(
-                new InputStreamReader(is, StandardCharsets.UTF_8)
-        );
-    }
-
-    private static void writeTopN(Path path,
-                                  long totalDocs, long sumDocFreqs,
-                                  long sumTotalTermFreqs,
-                                  long uniqueTerms, AbstractTokenTFDFPriorityQueue queue) throws IOException {
-        if (Files.isRegularFile(path)) {
-            System.err.println("File "+path.getFileName() + " already exists. Skipping.");
-            return;
-        }
-        Files.createDirectories(path.getParent());
-        BufferedWriter writer =
-                Files.newBufferedWriter(path, StandardCharsets.UTF_8);
-        StringBuilder sb = new StringBuilder();
-        writer.write(LICENSE);
-        writer.write("#DOC_COUNT\t"+totalDocs+"\n");
-        writer.write("#SUM_DOC_FREQS\t"+sumDocFreqs+"\n");
-        writer.write("#SUM_TERM_FREQS\t"+sumTotalTermFreqs+"\n");
-        writer.write("#UNIQUE_TERMS\t"+uniqueTerms+"\n");
-        writer.write("#TOKEN\tDOCFREQ\tTERMFREQ\n");
-        //add these tokens no matter what
-        for (String t : INCLUDE_LIST) {
-            writer.write(t);
-            writer.newLine();
-        }
-        for (TokenDFTF tp : queue.getArray()) {
-            writer.write(getRow(sb, tp)+"\n");
-
-        }
-        writer.flush();
-        writer.close();
-    }
-
-    private static String getRow(StringBuilder sb, TokenDFTF tp) {
-        sb.setLength(0);
-        sb.append(clean(tp.token));
-        sb.append("\t").append(tp.df);
-        sb.append("\t").append(tp.tf);
-        return sb.toString();
-    }
-
-    private static String clean(String s) {
-        if (s == null) {
-            return "";
-        }
-        return s.replaceAll("\\s+", " ").trim();
-    }
-
-    private abstract class AbstractTokenTFDFPriorityQueue extends PriorityQueue<TokenDFTF> {
-
-        AbstractTokenTFDFPriorityQueue(int maxSize) {
-            super(maxSize);
-        }
-
-        public TokenDFTF[] getArray() {
-            TokenDFTF[] topN = new TokenDFTF[size()];
-            //now we reverse the queue
-            TokenDFTF term = pop();
-            int i = topN.length-1;
-            while (term != null && i > -1) {
-                topN[i--] = term;
-                term = pop();
-            }
-            return topN;
-        }
-    }
-
-    private class TokenDFTF {
-
-        final String token;
-        final int df;
-        final long tf;
-
-        public TokenDFTF(String token, int df, long tf) {
-            this.token = token;
-            this.df = df;
-            this.tf = tf;
-        }
-
-
-        public long getTF() {
-            return tf;
-        }
-
-        public int getDF() {
-            return df;
-        }
-
-        public String getToken() {
-            return token;
-        }
-
-        @Override
-        public boolean equals(Object o) {
-            if (this == o) {
-                return true;
-            }
-            if (o == null || getClass() != o.getClass()) {
-                return false;
-            }
-
-            TokenDFTF tokenDFTF = (TokenDFTF) o;
-
-            if (df != tokenDFTF.df) {
-                return false;
-            }
-            if (tf != tokenDFTF.tf) {
-                return false;
-            }
-            return Objects.equals(token, tokenDFTF.token);
-        }
-
-        @Override
-        public int hashCode() {
-            int result = token != null ? token.hashCode() : 0;
-            result = 31 * result + df;
-            result = 31 * result + (int) (tf ^ (tf >>> 32));
-            return result;
-        }
-
-        @Override
-        public String toString() {
-            return "TokenDFTF{" +
-                    "token='" + token + '\'' +
-                    ", df=" + df +
-                    ", tf=" + tf +
-                    '}';
-        }
-    }
-
-    private class TokenDFPriorityQueue extends AbstractTokenTFDFPriorityQueue {
-
-        TokenDFPriorityQueue(int maxSize) {
-            super(maxSize);
-        }
-
-        @Override
-        protected boolean lessThan(TokenDFTF arg0, TokenDFTF arg1) {
-            if (arg0.df < arg1.df) {
-                return true;
-            } else if (arg0.df > arg1.df) {
-                return false;
-            }
-            return arg1.token.compareTo(arg0.token) < 0;
-        }
-
-        public TokenDFTF[] getArray() {
-            TokenDFTF[] topN = new TokenDFTF[size()];
-            //now we reverse the queue
-            TokenDFTF term = pop();
-            int i = topN.length-1;
-            while (term != null && i > -1) {
-                topN[i--] = term;
-                term = pop();
-            }
-            return topN;
-        }
-    }
-}
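
For reference, the removed TopCommonTokenCounter is driven from the command line: the first argument is the common-tokens output file (skipped if it already exists) and the remaining arguments are input corpora, one document per line, optionally gzipped. A minimal sketch of invoking it, with hypothetical file names:

    // Drives the tool exactly as its main() expects; the file names are illustrative only.
    public class TopCommonTokenCounterExample {
        public static void main(String[] args) throws Exception {
            org.apache.tika.eval.tools.TopCommonTokenCounter.main(new String[] {
                    "common_tokens_en",       // output; the tool skips it if the file already exists
                    "eng-sentences.txt.gz",   // Leipzig-style corpus; text after the first tab is indexed
                    "extra_docs.txt"          // plain UTF-8, one document per row
            });
        }
    }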
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/tools/TrainTestSplit.java b/tika-eval/src/main/java/org/apache/tika/eval/tools/TrainTestSplit.java
deleted file mode 100644
index d6a70df..0000000
--- a/tika-eval/src/main/java/org/apache/tika/eval/tools/TrainTestSplit.java
+++ /dev/null
@@ -1,114 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.eval.tools;
-
-import java.io.BufferedOutputStream;
-import java.io.BufferedReader;
-import java.io.BufferedWriter;
-import java.io.File;
-import java.io.IOException;
-import java.io.OutputStream;
-import java.io.OutputStreamWriter;
-import java.io.Writer;
-import java.nio.charset.StandardCharsets;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.nio.file.Paths;
-import java.util.HashMap;
-import java.util.Map;
-import java.util.Random;
-
-import org.apache.commons.compress.compressors.gzip.GzipCompressorOutputStream;
-
-public class TrainTestSplit {
-
-    private static String TRAINING = "train";
-    private static String TESTING = "test";
-    private static String DEVTEST = "devtest";
-
-    private float trainingP = 0.7f;
-    private float devTestP = 0.1f;
-    private float testP = 0.2f;
-    private Random random = new Random();
-
-    public static void main(String[] args) throws Exception {
-        Path leipzigDir = Paths.get(args[0]);
-        Path outputDir = Paths.get(args[1]);
-        TrainTestSplit splitter = new TrainTestSplit();
-        splitter.execute(leipzigDir, outputDir);
-    }
-
-    private void execute(Path leipzigDir, Path outputDir) throws Exception {
-        initOutDirs(outputDir);
-        for (File f : leipzigDir.toFile().listFiles()) {
-            if (f.isDirectory()) {
-                continue;
-            }
-            processFile(f, outputDir);
-        }
-    }
-
-    private void initOutDirs(Path outputDir) throws Exception {
-        for (String which : new String[] { TRAINING, DEVTEST, TESTING}) {
-            Path target = outputDir.resolve(which);
-            if (! Files.isDirectory(target)) {
-                Files.createDirectories(target);
-            }
-        }
-
-    }
-
-    private void processFile(File f, Path outputDir) throws Exception {
-        Map<String, BufferedWriter> writers = getWriters(outputDir, f);
-        System.err.println("working on "+f);
-        try (BufferedReader reader = Files.newBufferedReader(f.toPath(), StandardCharsets.UTF_8)) {
-            String line = reader.readLine();
-            while (line != null) {
-                float r = random.nextFloat();
-                if (r <= trainingP) {
-                    writers.get(TRAINING).write(line + "\n");
-                } else if (r < trainingP + devTestP) {
-                    writers.get(DEVTEST).write(line + "\n");
-                } else {
-                    writers.get(TESTING).write(line+"\n");
-                }
-                line = reader.readLine();
-            }
-        }
-
-
-        for (Writer w : writers.values()) {
-            w.flush();
-            w.close();
-        }
-    }
-
-    private Map<String, BufferedWriter> getWriters(Path outputDir, File f) throws IOException {
-        Map<String, BufferedWriter> writers = new HashMap<>();
-        for (String which : new String[] { TRAINING, DEVTEST, TESTING}) {
-            writers.put(which, getWriter(outputDir, which, f));
-        }
-        return writers;
-    }
-
-    private BufferedWriter getWriter(Path outputDir, String which, File f) throws IOException {
-        OutputStream os = new GzipCompressorOutputStream(
-                new BufferedOutputStream(
-                Files.newOutputStream(outputDir.resolve(which).resolve(f.getName()+".gz"))));
-        return new BufferedWriter(new OutputStreamWriter(os, StandardCharsets.UTF_8));
-    }
-}
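
The removed TrainTestSplit assigns each line of every corpus file to train/devtest/test with probabilities 0.7/0.1/0.2 and writes gzipped copies into the matching output subdirectories. A minimal sketch of just the bucketing step, with hard-coded sample lines standing in for a corpus file:

    import java.util.List;
    import java.util.Random;

    // Reproduces the 0.7/0.1/0.2 random split; prints the bucket each line would land in.
    public class SplitSketch {
        public static void main(String[] args) {
            Random random = new Random();
            for (String line : List.of("sentence one", "sentence two", "sentence three")) {
                float r = random.nextFloat();
                String bucket = (r <= 0.7f) ? "train" : (r < 0.8f) ? "devtest" : "test";
                System.out.println(bucket + "\t" + line);
            }
        }
    }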
diff --git a/tika-eval/src/main/resources/comparison-reports-pg.xml b/tika-eval/src/main/resources/comparison-reports-pg.xml
deleted file mode 100644
index 5bcf88e..0000000
--- a/tika-eval/src/main/resources/comparison-reports-pg.xml
+++ /dev/null
@@ -1,1753 +0,0 @@
-<?xml version="1.0" encoding="UTF-8" standalone="no" ?>
-
-<!--
-  Licensed to the Apache Software Foundation (ASF) under one
-  or more contributor license agreements.  See the NOTICE file
-  distributed with this work for additional information
-  regarding copyright ownership.  The ASF licenses this file
-  to you under the Apache License, Version 2.0 (the
-  "License"); you may not use this file except in compliance
-  with the License.  You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-  Unless required by applicable law or agreed to in writing,
-  software distributed under the License is distributed on an
-  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-  KIND, either express or implied.  See the License for the
-  specific language governing permissions and limitations
-  under the License.
--->
-
-<reports>
-
-
-    <before>
-
-        <sql>drop table if exists md5_multiples_tmp_a</sql>
-        <sql>create table md5_multiples_tmp_a (MD5, cnt)
-            as
-            select md5, count(1) as cnt
-            from profiles_a
-            where md5 is not null
-            group by md5
-            having count(1) &gt; 1
-            order by cnt desc
-        </sql>
-
-        <sql>drop table if exists md5_multiples_tmp_b</sql>
-        <sql>create table md5_multiples_tmp_b (MD5, cnt)
-            as
-            select md5, count(1) cnt
-            from profiles_b
-            where md5 is not null
-            group by md5
-            having count(1) &gt; 1
-            order by cnt desc
-        </sql>
-        <!-- build mime indexes -->
-
-        <sql>create index if not exists pa_m_idx
-            on profiles_a (mime_id);
-        </sql>
-
-        <sql>
-            create index if not exists pb_m_idx
-            on profiles_b (mime_id);
-        </sql>
-
-        <!-- build exceptions comparison table -->
-        <sql>drop table if exists exceptions_compared</sql>
-        <sql>
-            create table exceptions_compared (
-            mime_id_a integer,
-            mime_id_b integer,
-            total integer,
-            exc_cnt_a integer,
-            exc_cnt_b integer,
-            exc_prcnt_a float,
-            exc_prcnt_b float,
-            notes varchar(12)
-            );
-        </sql>
-        <sql>
-            insert into exceptions_compared (
-            select ma.mime_id, mb.mime_id, count(1) as total, 0, 0, 0.0, 0.0, ''
-            from profiles_a pa
-            join profiles_b pb on pa.id=pb.id
-            join mimes ma on pa.mime_id = ma.mime_id
-            join mimes mb on pb.mime_id = mb.mime_id
-            group by ma.mime_id, mb.mime_id
-            order by total desc );
-        </sql>
-
-        <sql>
-            update exceptions_compared ec set
-            exc_cnt_a = (
-            select count(1) as cnt
-            from exceptions_a ea
-            join profiles_a pa on ea.id=pa.id
-            join profiles_b pb on pb.id=pa.id
-            join mimes ma on pa.mime_id=ma.mime_id
-            join mimes mb on pb.mime_id=mb.mime_id
-            where ma.mime_id= ec.mime_id_a and mb.mime_id=ec.mime_id_b
-            group by ma.mime_id, mb.mime_id);
-        </sql>
-        <sql>
-            update exceptions_compared ec set
-            exc_cnt_b = (
-            select count(1) as cnt
-            from exceptions_b eb
-            join profiles_b pb on eb.id=pb.id
-            join profiles_a pa on pa.id=pb.id
-            join mimes ma on pa.mime_id=ma.mime_id
-            join mimes mb on pb.mime_id=mb.mime_id
-            where ma.mime_id= ec.mime_id_a and mb.mime_id=ec.mime_id_b
-            group by mb.mime_id, ma.mime_id);
-        </sql>
-        <sql>
-            update exceptions_compared
-            set exc_prcnt_a = cast(exc_cnt_a as decimal)/cast(total as decimal)
-            where total > 0;
-        </sql>
-        <sql>
-            update exceptions_compared
-            set exc_prcnt_b = cast(exc_cnt_b as decimal)/cast(total as decimal)
-            where total > 0;
-        </sql>
-
-        <sql>
-            update exceptions_compared
-            set notes = 'YAY!'
-            where total > 100 and (exc_prcnt_a-exc_prcnt_b) > 0.10;
-        </sql>
-        <sql>
-            update exceptions_compared
-            set notes = 'YIKES!'
-            where total > 100 and (exc_prcnt_b-exc_prcnt_a) > 0.10;
-        </sql>
-
-        <!-- build tmp common words table -->
-        <sql>drop table if exists token_counts_compared</sql>
-        <sql>
-            create table token_counts_compared
-            (mime_id_a integer,
-            mime_id_b integer,
-            num_tokens_a bigint default 0,
-            num_tokens_b bigint default 0,
-            num_alphabetic_tokens_a bigint default 0,
-            num_alphabetic_tokens_b bigint default 0,
-            num_common_tokens_a bigint default 0,
-            num_common_tokens_b bigint default 0
-            );
-        </sql>
-        <sql>
-            insert into token_counts_compared (mime_id_a, mime_id_b)
-            select ma.mime_id, mb.mime_id
-            from profiles_a a
-            join profiles_b b on a.id=b.id
-            join mimes ma on ma.mime_id=a.mime_id
-            join mimes mb on mb.mime_id=b.mime_id
-            group by ma.mime_id, mb.mime_id
-
-        </sql>
-
-        <sql>
-            update token_counts_compared tcc set num_tokens_a=(
-            select sum(num_tokens) as cnt from profiles_a pa
-            join profiles_b pb on pa.id=pb.id
-            join contents_a c on c.id = pa.id
-            where pb.mime_id= tcc.mime_id_b
-            and pa.mime_id=tcc.mime_id_a
-            group by mime_id_a, mime_id_b
-            );
-        </sql>
-
-        <sql>
-            update token_counts_compared tcc set num_tokens_b=(
-            select sum(num_tokens) as cnt from profiles_b pb
-            join profiles_a pa on pa.id=pb.id
-            join contents_b c on c.id = pb.id
-            where pb.mime_id= tcc.mime_id_b
-            and pa.mime_id=tcc.mime_id_a
-            group by mime_id_a, mime_id_b
-            );
-        </sql>
-
-        <sql>
-            update token_counts_compared tcc set num_alphabetic_tokens_a=(
-            select sum(num_alphabetic_tokens) as cnt from profiles_a pa
-            join profiles_b pb on pa.id=pb.id
-            join contents_a c on c.id = pa.id
-            where pb.mime_id= tcc.mime_id_b
-            and pa.mime_id=tcc.mime_id_a
-            group by mime_id_a, mime_id_b
-            );
-        </sql>
-
-        <sql>
-            update token_counts_compared tcc set num_alphabetic_tokens_b=(
-            select sum(num_alphabetic_tokens) as cnt from profiles_b pb
-            join profiles_a pa on pb.id=pa.id
-            join contents_b c on c.id = pb.id
-            where pb.mime_id= tcc.mime_id_b
-            and pa.mime_id=tcc.mime_id_a
-            group by mime_id_a, mime_id_b
-            );
-        </sql>
-
-        <sql>
-            update token_counts_compared tcc set num_common_tokens_a=(
-            select sum(num_common_tokens) as cnt from profiles_a pa
-            join profiles_b pb on pa.id=pb.id
-            join contents_a c on c.id = pa.id
-            where pb.mime_id= tcc.mime_id_b
-            and pa.mime_id=tcc.mime_id_a
-            group by mime_id_a, mime_id_b
-            );
-        </sql>
-
-        <sql>
-            update token_counts_compared tcc set num_common_tokens_b=(
-            select sum(num_common_tokens) as cnt from profiles_b pb
-            join profiles_a pa on pa.id=pb.id
-            join contents_b c on c.id = pb.id
-            where pb.mime_id= tcc.mime_id_b
-            and pa.mime_id=tcc.mime_id_a
-            group by mime_id_a, mime_id_b
-            );
-        </sql>
-
-        <sql>drop table if exists tags_by_mime</sql>
-        <sql>create table tags_by_mime (
-                mime_id_a integer,
-                mime_id_b integer,
-                tags_a_a integer,
-                tags_b_a integer,
-                tags_div_a integer,
-                tags_i_a integer,
-                tags_img_a integer,
-                tags_li_a integer,
-                tags_ol_a integer,
-                tags_p_a integer,
-                tags_table_a integer,
-                tags_td_a integer,
-                tags_title_a integer,
-                tags_tr_a integer,
-                tags_u_a integer,
-                tags_ul_a integer,
-                tags_a_b integer,
-                tags_b_b integer,
-                tags_div_b integer,
-                tags_i_b integer,
-                tags_img_b integer,
-                tags_li_b integer,
-                tags_ol_b integer,
-                tags_p_b integer,
-                tags_table_b integer,
-                tags_td_b integer,
-                tags_title_b integer,
-                tags_tr_b integer,
-                tags_u_b integer,
-                tags_ul_b integer
-            );
-        </sql>
-        <sql>
-            insert into tags_by_mime (mime_id_a, mime_id_b)
-            select ma.mime_id, mb.mime_id
-            from profiles_a a
-            join profiles_b b on a.id=b.id
-            join mimes ma on ma.mime_id=a.mime_id
-            join mimes mb on mb.mime_id=b.mime_id
-            group by ma.mime_id, mb.mime_id
-        </sql>
-        <sql>
-            update tags_by_mime tbm set tags_a_a=(
-            select sum(ta.tags_a) as cnt from tags_a ta
-            join tags_b tb on ta.id=tb.id
-            join profiles_a pa on pa.id=ta.id
-            join profiles_b pb on pa.id=pb.id
-            where pb.mime_id= tbm.mime_id_b
-            and pa.mime_id=tbm.mime_id_a
-            and ta.tags_parse_exception=false
-            and tb.tags_parse_exception=false
-            group by mime_id_a, mime_id_b
-            );
-        </sql>
-        <sql>
-            update tags_by_mime tbm set tags_b_a=(
-            select sum(ta.tags_b) as cnt from tags_a ta
-            join tags_b tb on ta.id=tb.id
-            join profiles_a pa on pa.id=ta.id
-            join profiles_b pb on pa.id=pb.id
-            where pb.mime_id= tbm.mime_id_b
-            and pa.mime_id=tbm.mime_id_a
-            and ta.tags_parse_exception=false
-            and tb.tags_parse_exception=false
-            group by mime_id_a, mime_id_b
-            );
-        </sql>
-        <sql>
-            update tags_by_mime tbm set tags_div_a=(
-            select sum(ta.tags_div) as cnt from tags_a ta
-            join tags_b tb on ta.id=tb.id
-            join profiles_a pa on pa.id=ta.id
-            join profiles_b pb on pa.id=pb.id
-            where pb.mime_id= tbm.mime_id_b
-            and pa.mime_id=tbm.mime_id_a
-            and ta.tags_parse_exception=false
-            and tb.tags_parse_exception=false
-            group by mime_id_a, mime_id_b
-            );
-        </sql>
-        <sql>
-            update tags_by_mime tbm set tags_i_a=(
-            select sum(ta.tags_i) as cnt from tags_a ta
-            join tags_b tb on ta.id=tb.id
-            join profiles_a pa on pa.id=ta.id
-            join profiles_b pb on pa.id=pb.id
-            where pb.mime_id= tbm.mime_id_b
-            and pa.mime_id=tbm.mime_id_a
-            and ta.tags_parse_exception=false
-            and tb.tags_parse_exception=false
-            group by mime_id_a, mime_id_b
-            );
-        </sql>
-        <sql>
-            update tags_by_mime tbm set tags_img_a=(
-            select sum(ta.tags_img) as cnt from tags_a ta
-            join tags_b tb on ta.id=tb.id
-            join profiles_a pa on pa.id=ta.id
-            join profiles_b pb on pa.id=pb.id
-            where pb.mime_id= tbm.mime_id_b
-            and pa.mime_id=tbm.mime_id_a
-            and ta.tags_parse_exception=false
-            and tb.tags_parse_exception=false
-            group by mime_id_a, mime_id_b
-            );
-        </sql>
-        <sql>
-            update tags_by_mime tbm set tags_li_a=(
-            select sum(ta.tags_li) as cnt from tags_a ta
-            join tags_b tb on ta.id=tb.id
-            join profiles_a pa on pa.id=ta.id
-            join profiles_b pb on pa.id=pb.id
-            where pb.mime_id= tbm.mime_id_b
-            and pa.mime_id=tbm.mime_id_a
-            and ta.tags_parse_exception=false
-            and tb.tags_parse_exception=false
-            group by mime_id_a, mime_id_b
-            );
-        </sql>
-        <sql>
-            update tags_by_mime tbm set tags_ol_a=(
-            select sum(ta.tags_ol) as cnt from tags_a ta
-            join tags_b tb on ta.id=tb.id
-            join profiles_a pa on pa.id=ta.id
-            join profiles_b pb on pa.id=pb.id
-            where pb.mime_id= tbm.mime_id_b
-            and pa.mime_id=tbm.mime_id_a
-            and ta.tags_parse_exception=false
-            and tb.tags_parse_exception=false
-            group by mime_id_a, mime_id_b
-            );
-        </sql>
-        <sql>
-            update tags_by_mime tbm set tags_p_a=(
-            select sum(ta.tags_p) as cnt from tags_a ta
-            join tags_b tb on ta.id=tb.id
-            join profiles_a pa on pa.id=ta.id
-            join profiles_b pb on pa.id=pb.id
-            where pb.mime_id= tbm.mime_id_b
-            and pa.mime_id=tbm.mime_id_a
-            and ta.tags_parse_exception=false
-            and tb.tags_parse_exception=false
-            group by mime_id_a, mime_id_b
-            );
-        </sql>
-        <sql>
-            update tags_by_mime tbm set tags_table_a=(
-            select sum(ta.tags_table) as cnt from tags_a ta
-            join tags_b tb on ta.id=tb.id
-            join profiles_a pa on pa.id=ta.id
-            join profiles_b pb on pa.id=pb.id
-            where pb.mime_id= tbm.mime_id_b
-            and pa.mime_id=tbm.mime_id_a
-            and ta.tags_parse_exception=false
-            and tb.tags_parse_exception=false
-            group by mime_id_a, mime_id_b
-            );
-        </sql>
-        <sql>
-            update tags_by_mime tbm set tags_td_a=(
-            select sum(ta.tags_td) as cnt from tags_a ta
-            join tags_b tb on ta.id=tb.id
-            join profiles_a pa on pa.id=ta.id
-            join profiles_b pb on pa.id=pb.id
-            where pb.mime_id= tbm.mime_id_b
-            and pa.mime_id=tbm.mime_id_a
-            and ta.tags_parse_exception=false
-            and tb.tags_parse_exception=false
-            group by mime_id_a, mime_id_b
-            );
-        </sql>
-        <sql>
-            update tags_by_mime tbm set tags_title_a=(
-            select sum(ta.tags_title) as cnt from tags_a ta
-            join tags_b tb on ta.id=tb.id
-            join profiles_a pa on pa.id=ta.id
-            join profiles_b pb on pa.id=pb.id
-            where pb.mime_id= tbm.mime_id_b
-            and pa.mime_id=tbm.mime_id_a
-            and ta.tags_parse_exception=false
-            and tb.tags_parse_exception=false
-            group by mime_id_a, mime_id_b
-            );
-        </sql>
-        <sql>
-            update tags_by_mime tbm set tags_tr_a=(
-            select sum(ta.tags_tr) as cnt from tags_a ta
-            join tags_b tb on ta.id=tb.id
-            join profiles_a pa on pa.id=ta.id
-            join profiles_b pb on pa.id=pb.id
-            where pb.mime_id= tbm.mime_id_b
-            and pa.mime_id=tbm.mime_id_a
-            and ta.tags_parse_exception=false
-            and tb.tags_parse_exception=false
-            group by mime_id_a, mime_id_b
-            );
-        </sql>
-        <sql>
-            update tags_by_mime tbm set tags_u_a=(
-            select sum(ta.tags_u) as cnt from tags_a ta
-            join tags_b tb on ta.id=tb.id
-            join profiles_a pa on pa.id=ta.id
-            join profiles_b pb on pa.id=pb.id
-            where pb.mime_id= tbm.mime_id_b
-            and pa.mime_id=tbm.mime_id_a
-            and ta.tags_parse_exception=false
-            and tb.tags_parse_exception=false
-            group by mime_id_a, mime_id_b
-            );
-        </sql>
-        <sql>
-            update tags_by_mime tbm set tags_ul_a=(
-            select sum(ta.tags_ul) as cnt from tags_a ta
-            join tags_b tb on ta.id=tb.id
-            join profiles_a pa on pa.id=ta.id
-            join profiles_b pb on pa.id=pb.id
-            where pb.mime_id= tbm.mime_id_b
-            and pa.mime_id=tbm.mime_id_a
-            and ta.tags_parse_exception=false
-            and tb.tags_parse_exception=false
-            group by mime_id_a, mime_id_b
-            );
-        </sql>
-        <!-- now update tags_b counts -->
-        <sql>
-            update tags_by_mime tbm set tags_a_b=(
-            select sum(tb.tags_a) as cnt from tags_b tb
-            join tags_a ta on ta.id=tb.id
-            join profiles_a pa on pa.id=ta.id
-            join profiles_b pb on pa.id=pb.id
-            where pb.mime_id= tbm.mime_id_b
-            and pa.mime_id=tbm.mime_id_a
-            and ta.tags_parse_exception=false
-            and tb.tags_parse_exception=false
-            group by mime_id_a, mime_id_b
-            );
-        </sql>
-        <sql>
-            update tags_by_mime tbm set tags_b_b=(
-            select sum(tb.tags_b) as cnt from tags_b tb
-            join tags_a ta on ta.id=tb.id
-            join profiles_a pa on pa.id=ta.id
-            join profiles_b pb on pa.id=pb.id
-            where pb.mime_id= tbm.mime_id_b
-            and pa.mime_id=tbm.mime_id_a
-            and ta.tags_parse_exception=false
-            and tb.tags_parse_exception=false
-            group by mime_id_a, mime_id_b
-            );
-        </sql>
-        <sql>
-            update tags_by_mime tbm set tags_div_b=(
-            select sum(tb.tags_div) as cnt from tags_b tb
-            join tags_a ta on tb.id=ta.id
-            join profiles_a pa on pa.id=ta.id
-            join profiles_b pb on pa.id=pb.id
-            where pb.mime_id= tbm.mime_id_b
-            and pa.mime_id=tbm.mime_id_a
-            and ta.tags_parse_exception=false
-            and tb.tags_parse_exception=false
-            group by mime_id_a, mime_id_b
-            );
-        </sql>
-        <sql>
-            update tags_by_mime tbm set tags_i_b=(
-            select sum(tb.tags_i) as cnt from tags_b tb
-            join tags_a ta on tb.id=ta.id
-            join profiles_a pa on pa.id=ta.id
-            join profiles_b pb on pa.id=pb.id
-            where pb.mime_id= tbm.mime_id_b
-            and pa.mime_id=tbm.mime_id_a
-            and ta.tags_parse_exception=false
-            and tb.tags_parse_exception=false
-            group by mime_id_a, mime_id_b
-            );
-        </sql>
-        <sql>
-            update tags_by_mime tbm set tags_img_b=(
-            select sum(tb.tags_img) as cnt from tags_b tb
-            join tags_a ta on tb.id=ta.id
-            join profiles_a pa on pa.id=ta.id
-            join profiles_b pb on pa.id=pb.id
-            where pb.mime_id= tbm.mime_id_b
-            and pa.mime_id=tbm.mime_id_a
-            and ta.tags_parse_exception=false
-            and tb.tags_parse_exception=false
-            group by mime_id_a, mime_id_b
-            );
-        </sql>
-        <sql>
-            update tags_by_mime tbm set tags_li_b=(
-            select sum(tb.tags_li) as cnt from tags_b tb
-            join tags_a ta on ta.id=tb.id
-            join profiles_a pa on pa.id=ta.id
-            join profiles_b pb on pa.id=pb.id
-            where pb.mime_id= tbm.mime_id_b
-            and pa.mime_id=tbm.mime_id_a
-            and ta.tags_parse_exception=false
-            and tb.tags_parse_exception=false
-            group by mime_id_a, mime_id_b
-            );
-        </sql>
-        <sql>
-            update tags_by_mime tbm set tags_ol_b=(
-            select sum(tb.tags_ol) as cnt from tags_b tb
-            join tags_a ta on tb.id=ta.id
-            join profiles_a pa on pa.id=ta.id
-            join profiles_b pb on pa.id=pb.id
-            where pb.mime_id= tbm.mime_id_b
-            and pa.mime_id=tbm.mime_id_a
-            and ta.tags_parse_exception=false
-            and tb.tags_parse_exception=false
-            group by mime_id_a, mime_id_b
-            );
-        </sql>
-        <sql>
-            update tags_by_mime tbm set tags_p_b=(
-            select sum(tb.tags_p) as cnt from tags_b tb
-            join tags_a ta on tb.id=ta.id
-            join profiles_a pa on pa.id=ta.id
-            join profiles_b pb on pa.id=pb.id
-            where pb.mime_id= tbm.mime_id_b
-            and pa.mime_id=tbm.mime_id_a
-            and ta.tags_parse_exception=false
-            and tb.tags_parse_exception=false
-            group by mime_id_a, mime_id_b
-            );
-        </sql>
-        <sql>
-            update tags_by_mime tbm set tags_table_b=(
-            select sum(tb.tags_table) as cnt from tags_b tb
-            join tags_a ta on tb.id=ta.id
-            join profiles_a pa on pa.id=ta.id
-            join profiles_b pb on pa.id=pb.id
-            where pb.mime_id= tbm.mime_id_b
-            and pa.mime_id=tbm.mime_id_a
-            and ta.tags_parse_exception=false
-            and tb.tags_parse_exception=false
-            group by mime_id_a, mime_id_b
-            );
-        </sql>
-        <sql>
-            update tags_by_mime tbm set tags_td_b=(
-            select sum(tb.tags_td) as cnt from tags_b tb
-            join tags_a ta on tb.id=ta.id
-            join profiles_a pa on pa.id=ta.id
-            join profiles_b pb on pa.id=pb.id
-            where pb.mime_id= tbm.mime_id_b
-            and pa.mime_id=tbm.mime_id_a
-            and ta.tags_parse_exception=false
-            and tb.tags_parse_exception=false
-            group by mime_id_a, mime_id_b
-            );
-        </sql>
-        <sql>
-            update tags_by_mime tbm set tags_title_b=(
-            select sum(tb.tags_title) as cnt from tags_b tb
-            join tags_a ta on tb.id=ta.id
-            join profiles_a pa on pa.id=ta.id
-            join profiles_b pb on pa.id=pb.id
-            where pb.mime_id= tbm.mime_id_b
-            and pa.mime_id=tbm.mime_id_a
-            and ta.tags_parse_exception=false
-            and tb.tags_parse_exception=false
-            group by mime_id_a, mime_id_b
-            );
-        </sql>
-        <sql>
-            update tags_by_mime tbm set tags_tr_b=(
-            select sum(tb.tags_tr) as cnt from tags_b tb
-            join tags_a ta on tb.id=ta.id
-            join profiles_a pa on pa.id=ta.id
-            join profiles_b pb on pa.id=pb.id
-            where pb.mime_id= tbm.mime_id_b
-            and pa.mime_id=tbm.mime_id_a
-            and ta.tags_parse_exception=false
-            and tb.tags_parse_exception=false
-            group by mime_id_a, mime_id_b
-            );
-        </sql>
-        <sql>
-            update tags_by_mime tbm set tags_u_b=(
-            select sum(tb.tags_u) as cnt from tags_b tb
-            join tags_a ta on tb.id=ta.id
-            join profiles_a pa on pa.id=ta.id
-            join profiles_b pb on pa.id=pb.id
-            where pb.mime_id= tbm.mime_id_b
-            and pa.mime_id=tbm.mime_id_a
-            and ta.tags_parse_exception=false
-            and tb.tags_parse_exception=false
-            group by mime_id_a, mime_id_b
-            );
-        </sql>
-        <sql>
-            update tags_by_mime tbm set tags_ul_b=(
-            select sum(tb.tags_ul) as cnt from tags_b tb
-            join tags_a ta on ta.id=tb.id
-            join profiles_a pa on pa.id=ta.id
-            join profiles_b pb on pa.id=pb.id
-            where pb.mime_id= tbm.mime_id_b
-            and pa.mime_id=tbm.mime_id_a
-            and ta.tags_parse_exception=false
-            and tb.tags_parse_exception=false
-            group by mime_id_a, mime_id_b
-            );
-        </sql>
-        <sql>drop table if exists tag_exceptions_by_mime</sql>
-        <sql>create table tag_exceptions_by_mime (
-            mime_id_a integer,
-            mime_id_b integer,
-            tag_exceptions_a integer,
-            tag_exceptions_b integer)
-        </sql>
-        <sql>
-            insert into tag_exceptions_by_mime (mime_id_a, mime_id_b,
-                tag_exceptions_a, tag_exceptions_b)
-            select ma.mime_id, mb.mime_id,0,0
-            from profiles_a a
-            join profiles_b b on a.id=b.id
-            join mimes ma on ma.mime_id=a.mime_id
-            join mimes mb on mb.mime_id=b.mime_id
-            group by ma.mime_id, mb.mime_id
-        </sql>
-        <sql>
-            update tag_exceptions_by_mime tebm set tag_exceptions_a=(
-            select count(1) as cnt from tags_a ta
-            join profiles_a pa on pa.id=ta.id
-            join profiles_b pb on pa.id=pb.id
-            where pb.mime_id= tebm.mime_id_b
-            and pa.mime_id=tebm.mime_id_a
-            and ta.tags_parse_exception=true
-            group by mime_id_a, mime_id_b
-            );
-        </sql>
-        <sql>
-            update tag_exceptions_by_mime tebm set tag_exceptions_b=(
-            select count(1) as cnt from tags_b tb
-            join profiles_a pa on pa.id=tb.id
-            join profiles_b pb on pa.id=pb.id
-            where pb.mime_id= tebm.mime_id_b
-            and pa.mime_id=tebm.mime_id_a
-            and tb.tags_parse_exception=true
-            group by mime_id_a, mime_id_b
-            );
-        </sql>
-        <sql>
-            drop table if exists parse_time_compared;
-        </sql>
-        <sql>
-            create table parse_time_compared (
-            mime_id_a integer,
-            mime_id_b integer,
-            total_a bigint,
-            total_b bigint,
-            prcnt_increase double precision
-            );
-        </sql>
-            <sql>
-                insert into parse_time_compared (mime_id_a, mime_id_b,
-                total_a, total_b, prcnt_increase)
-                select ma.mime_id, mb.mime_id,0,0,0.0
-                from profiles_a a
-                join profiles_b b on a.id=b.id
-                join mimes ma on ma.mime_id=a.mime_id
-                join mimes mb on mb.mime_id=b.mime_id
-                group by ma.mime_id, mb.mime_id
-            </sql>
-        <sql>
-            update parse_time_compared ptc set total_a=(
-            select sum(pa.elapsed_time_millis) as total_a from profiles_a pa
-            join profiles_b pb on pa.id=pb.id
-            where pa.mime_id= ptc.mime_id_a
-            and pb.mime_id=ptc.mime_id_b
-            group by mime_id_a, mime_id_b)
-        </sql>
-        <sql>
-            update parse_time_compared ptc set total_b=(
-            select sum(pb.elapsed_time_millis) as total_b from profiles_b pb
-            join profiles_a pa on pa.id=pb.id
-            where pa.mime_id= ptc.mime_id_a
-            and pb.mime_id=ptc.mime_id_b
-            group by mime_id_a, mime_id_b)
-        </sql>
-        <sql>
-            update parse_time_compared ptc set prcnt_increase=(100.0 *
-            cast(total_b as decimal)/cast(total_a as decimal))
-            where total_a > 0;
-        </sql>
-    </before>
-
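The before block above is intended to run once, statement by statement, to materialize the comparison tables (md5_multiples_tmp_*, exceptions_compared, token_counts_compared, tags_by_mime, tag_exceptions_by_mime, parse_time_compared) before any report query executes. A minimal JDBC sketch of running the first pair of statements by hand; the connection URL is an assumption (an H2 results database at the local path ./comparisons -- for this PostgreSQL variant of the config, substitute a PostgreSQL URL and driver):

    import java.sql.Connection;
    import java.sql.DriverManager;
    import java.sql.ResultSet;
    import java.sql.Statement;

    // Runs the first two "before" statements against a results database and
    // reports how many md5 values occur more than once in run A.
    public class RunBeforeSql {
        public static void main(String[] args) throws Exception {
            try (Connection conn = DriverManager.getConnection("jdbc:h2:./comparisons", "", "");
                 Statement st = conn.createStatement()) {
                st.execute("drop table if exists md5_multiples_tmp_a");
                st.execute("create table md5_multiples_tmp_a (MD5, cnt) as "
                        + "select md5, count(1) as cnt from profiles_a "
                        + "where md5 is not null group by md5 "
                        + "having count(1) > 1 order by cnt desc");
                try (ResultSet rs = st.executeQuery("select count(1) from md5_multiples_tmp_a")) {
                    if (rs.next()) {
                        System.out.println("duplicate md5s in A: " + rs.getLong(1));
                    }
                }
            }
        }
    }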
-    <!-- MIMES -->
-    <report reportName="All Mimes In A"
-            reportFilename="mimes/all_mimes_A.xlsx"
-            format="xlsx"
-            includeSql="true">
-
-        <sql>
-            select mime_string, count(1) cnt from
-            profiles_a p
-            join mimes m on m.mime_id = p.mime_id
-            group by mime_string
-            order by cnt desc
-        </sql>
-    </report>
-
-    <report reportName="All Mimes In B"
-            reportFilename="mimes/all_mimes_B.xlsx"
-            format="xlsx"
-            includeSql="true">
-
-        <sql>
-            select mime_string, count(1) cnt from
-            profiles_b p
-            join mimes m on m.mime_id = p.mime_id
-            group by mime_string
-            order by cnt desc
-        </sql>
-    </report>
-    <report reportName="Container Mimes In A"
-            reportFilename="mimes/container_mimes_A.xlsx"
-            format="xlsx"
-            includeSql="true">
-
-        <sql>
-            select mime_string, count(1) cnt from
-            profiles_a p
-            join mimes m on m.mime_id = p.mime_id
-            where is_embedded=false
-            group by mime_string
-            order by cnt desc
-        </sql>
-    </report>
-
-    <report reportName="Container Mimes In B"
-            reportFilename="mimes/container_mimes_B.xlsx"
-            format="xlsx"
-            includeSql="true">
-
-        <sql>
-            select mime_string, count(1) cnt from
-            profiles_b p
-            join mimes m on m.mime_id = p.mime_id
-            where is_embedded=false
-            group by mime_string
-            order by cnt desc
-        </sql>
-    </report>
-    <report reportName="Embedded Mimes In A"
-            reportFilename="mimes/embedded_mimes_A.xlsx"
-            format="xlsx"
-            includeSql="true">
-
-        <sql>
-            select mime_string, count(1) cnt from
-            profiles_a p
-            join mimes m on m.mime_id = p.mime_id
-            where is_embedded=true
-            group by mime_string
-            order by cnt desc
-        </sql>
-    </report>
-
-    <report reportName="Embedded Mimes In B"
-            reportFilename="mimes/embedded_mimes_B.xlsx"
-            format="xlsx"
-            includeSql="true">
-
-        <sql>
-            select mime_string, count(1) cnt from
-            profiles_b p
-            join mimes m on m.mime_id = p.mime_id
-            where is_embedded=true
-            group by mime_string
-            order by cnt desc
-        </sql>
-    </report>
-    <report reportName="Mime Differences A -> B"
-            reportFilename="mimes/mime_diffs_A_to_B.xlsx"
-            format="xlsx"
-            includeSql="true">
-
-        <sql>
-            select concat(ma.mime_string, ' -&gt; ', mb.mime_string) as
-            MIME_A_TO_MIME_B, count(1) as COUNT
-            from profiles_a a
-            join profiles_b b on a.id=b.id
-            join mimes ma on ma.mime_id=a.mime_id
-            join mimes mb on mb.mime_id=b.mime_id
-            where a.mime_id &lt;&gt; b.mime_id
-            group by MIME_A_TO_MIME_B
-            order by COUNT DESC
-        </sql>
-    </report>
-
-    <report reportName="Mime Differences A -> B Details"
-            reportFilename="mimes/mime_diffs_A_to_B_details.xlsx"
-            format="xlsx"
-            includeSql="true">
-
-        <sql>
-            select concat(ma.mime_string, ' -&gt; ', mb.mime_string) as
-            MIME_A_TO_MIME_B,
-            file_path,
-            c.length as CONTAINER_LENGTH,
-            a.file_name
-            from profiles_a a
-            join profiles_b b on a.id=b.id
-            join mimes ma on ma.mime_id=a.mime_id
-            join mimes mb on mb.mime_id=b.mime_id
-            join containers c on a.container_id=c.container_id
-            where a.mime_id &lt;&gt; b.mime_id
-            order by MIME_A_TO_MIME_B
-        </sql>
-    </report>
-
-
-    <!-- Exceptions -->
-    <report reportName="AllExceptionsByMimeA"
-            reportFilename="exceptions/exceptions_by_mime_A.xlsx"
-            format="xlsx"
-            includeSql="true">
-
-        <sql>
-            select mime_string, count(1) cnt from
-            exceptions_a e
-            join profiles_a p on p.id=e.id
-            join mimes m on m.mime_id = p.mime_id
-            group by mime_string
-            order by cnt desc
-        </sql>
-    </report>
-    <report reportName="AllExceptionsByMimeB"
-            reportFilename="exceptions/exceptions_by_mime_B.xlsx"
-            format="xlsx"
-            includeSql="true">
-
-        <sql>
-            select mime_string, count(1) cnt from
-            exceptions_b e
-            join profiles_b p on p.id=e.id
-            join mimes m on m.mime_id = p.mime_id
-            group by mime_string
-            order by cnt desc
-        </sql>
-    </report>
-
-    <report reportName="ContainerExceptionsByMimeA"
-            reportFilename="exceptions/container_exceptions_by_mime_A.xlsx"
-            format="xlsx"
-            includeSql="true">
-
-        <sql>
-            select mime_string, count(1) cnt from
-            exceptions_a e
-            join profiles_a p on p.id=e.id
-            join mimes m on m.mime_id = p.mime_id
-            where is_embedded=false
-            and parse_exception_id=0
-            group by mime_string
-            order by cnt desc
-        </sql>
-    </report>
-
-    <report reportName="ContainerExceptionsByMimeB"
-            reportFilename="exceptions/container_exceptions_by_mime_B.xlsx"
-            format="xlsx"
-            includeSql="true">
-
-        <sql>
-            select mime_string, count(1) cnt from
-            exceptions_b e
-            join profiles_b p on p.id=e.id
-            join mimes m on m.mime_id = p.mime_id
-            where is_embedded=false
-            and parse_exception_id=0
-            group by mime_string
-            order by cnt desc
-        </sql>
-    </report>
-    <report reportName="AllExceptionsByMimeByTypeA"
-            reportFilename="exceptions/exceptions_by_mime_by_type_A.xlsx"
-            format="xlsx"
-            includeSql="true">
-
-        <sql>
-            select mime_string as MIME_TYPE,
-            parse_exception_description as EXCEPTION_TYPE, count(1) as COUNT
-            from exceptions_a e
-            join profiles_a p on p.id=e.id
-            join containers c on p.container_id=c.container_id
-            join mimes m on m.mime_id=p.mime_id
-            join ref_parse_exception_types r on
-            r.parse_exception_id=e.parse_exception_id
-            group by m.mime_string, parse_exception_description
-            order by MIME_TYPE, EXCEPTION_TYPE
-        </sql>
-    </report>
-
-    <report reportName="AllExceptionsByMimeByTypeB"
-            reportFilename="exceptions/exceptions_by_mime_by_type_B.xlsx"
-            format="xlsx"
-            includeSql="true">
-
-        <sql>
-            select mime_string as MIME_TYPE,
-            parse_exception_description as EXCEPTION_TYPE, count(1) as COUNT
-            from exceptions_b e
-            join profiles_b p on p.id=e.id
-            join containers c on p.container_id=c.container_id
-            join mimes m on m.mime_id=p.mime_id
-            join ref_parse_exception_types r on
-            r.parse_exception_id=e.parse_exception_id
-            group by m.mime_string, parse_exception_description
-            order by MIME_TYPE, EXCEPTION_TYPE
-        </sql>
-    </report>
-
-    <report reportName="TextLostFromACausedByNewExceptionsInB"
-            reportFilename="exceptions/text_lost_from_A_caused_by_new_exceptions_in_B.xlsx"
-            format="xlsx"
-            includeSql="true">
-
-        <sql>
-            select file_path as FILE_PATH,
-            c.length as CONTAINER_LENGTH,
-            ca.NUM_TOKENS as NUM_TOKENS_A,
-            cb.NUM_TOKENS as NUM_TOKENS_B,
-            ca.NUM_UNIQUE_TOKENS as NUM_UNIQUE_TOKENS_A, cb.NUM_UNIQUE_TOKENS
-            as NUM_UNIQUE_TOKENS_B,
-            ca.common_tokens_lang as COMMON_TOKENS_LANG_A,
-            ca.num_common_tokens as NUM_COMMON_TOKENS_A,
-            cb.common_tokens_lang as COMMON_TOKENS_LANG_B,
-            cb.num_common_tokens as NUM_COMMON_TOKENS_B,
-            ca.top_n_tokens as TOP_N_TOKENS_A, cb.top_n_tokens as TOP_N_TOKENS_B,
-            eb.ORIG_STACK_TRACE as ORIG_STACK_TRACE_B
-            from contents_a ca
-            join profiles_a pa on ca.id = pa.id
-            join containers c on pa.container_id=c.container_id
-            left join contents_b cb on ca.id=cb.id
-            left join exceptions_b eb on ca.id = eb.id
-            left join exceptions_a ea on ca.id = ea.id
-            where eb.orig_stack_trace is not null
-            and ea.orig_stack_trace is null
-            order by ca.num_common_tokens - coalesce(cb.num_common_tokens,0) desc
-        </sql>
-    </report>
-
-    <report reportName="FixedExceptionsInBByMimeType"
-            reportFilename="exceptions/fixed_exceptions_in_B_by_mime.xlsx"
-            format="xlsx"
-            includeSql="true">
-
-        <sql>
-            select
-            ma.mime_string as MIME_TYPE_A,
-            mb.mime_string as MIME_TYPE_B,
-            count(1) as COUNT
-            from exceptions_a ea
-            left join exceptions_b eb on ea.id = eb.id
-            join profiles_a pa on pa.id=ea.id
-            join profiles_b pb on pa.id=pb.id
-            join containers c on pa.container_id=c.container_id
-            join mimes ma on ma.mime_id=pa.mime_id
-            join mimes mb on mb.mime_id=pb.mime_id
-            where eb.id is null
-            and ea.parse_exception_id=0
-            group by mime_type_a, mime_type_b
-        </sql>
-    </report>
-
-    <report reportName="FixedExceptionsInByDetails"
-            reportFilename="exceptions/fixed_exceptions_in_B_details.xlsx"
-            format="xlsx"
-            includeSql="true">
-        <sql>
-            select
-            file_path,
-            c.length as CONTAINER_LENGTH,
-            ma.mime_string as MIME_TYPE_A,
-            mb.mime_string as MIME_TYPE_B,
-            pa.file_name, pa.is_embedded
-            from exceptions_a ea
-            left join exceptions_b eb on ea.id = eb.id
-            join profiles_a pa on pa.id=ea.id
-            join profiles_b pb on pb.id=pa.id --this ensures that files were actually processed in both runs
-            join containers c on pa.container_id=c.container_id
-            join mimes ma on ma.mime_id=pa.mime_id
-            join mimes mb on mb.mime_id=pb.mime_id
-            where eb.id is null
-            and ea.parse_exception_id=0
-            order by mime_type_a, mime_type_b
-        </sql>
-    </report>
-    <report reportName="ContentsOfFixedExceptionsInB"
-            reportFilename="exceptions/contents_of_fixed_exceptions_in_B.xlsx"
-            format="xlsx"
-            includeSql="true">
-
-        <sql>
-            select file_path,
-            c.length as CONTAINER_LENGTH,
-            ma.mime_string as MIME_TYPE_A,
-            mb.mime_string as MIME_TYPE_B,
-            CONTENT_LENGTH,
-            NUM_TOKENS, NUM_UNIQUE_TOKENS,
-            TOP_N_TOKENS, LANG_ID_1,TOKEN_LENGTH_MEAN, TOKEN_LENGTH_STD_DEV
-            from exceptions_a ea
-            left join exceptions_b eb on ea.id = eb.id
-            join profiles_a pa on pa.id=ea.id
-            join profiles_b pb on pa.id=pb.id
-            join contents_b cb on cb.id=ea.id
-            join containers c on pa.container_id=c.container_id
-            join mimes ma on ma.mime_id=pa.mime_id
-            join mimes mb on mb.mime_id=pb.mime_id
-            where eb.id is null
-            and ea.parse_exception_id=0
-        </sql>
-    </report>
-
-    <report reportName="NewExceptionsByMimeType"
-            reportFilename="exceptions/new_exceptions_in_B_by_mime.xlsx"
-            format="xlsx"
-            includeSql="true">
-
-        <sql>
-            select ma.mime_string as MIME_TYPE_A, mb.mime_string as MIME_TYPE_B, count(1) as COUNT
-            from exceptions_b eb
-            left join exceptions_a ea on ea.id = eb.id
-            join profiles_a pa on pa.id=eb.id
-            join profiles_b pb on pb.id=pa.id
-            join containers c on pa.container_id=c.container_id
-            join mimes ma on ma.mime_id=pa.mime_id
-            join mimes mb on mb.mime_id=pb.mime_id
-            where ea.id is null
-            and eb.parse_exception_id=0
-            group by ma.mime_string, mb.mime_string
-            order by COUNT desc
-        </sql>
-    </report>
-
-    <report reportName="NewExceptionsInBByMimeTypeByStackTrace"
-            reportFilename="exceptions/new_exceptions_in_B_by_mime_by_stack_trace.xlsx"
-            format="xlsx"
-            includeSql="true">
-
-        <sql>
-            select
-            ma.MIME_STRING as MIME_TYPE_A,
-            mb.MIME_STRING as MIME_TYPE_B,
-            eb.sort_stack_trace, count(1) as
-            COUNT
-            from exceptions_b eb
-            left join exceptions_a ea on ea.id = eb.id
-            join profiles_a pa on pa.id=eb.id
-            join profiles_b pb on pb.id=eb.id
-            join mimes ma on ma.mime_id=pa.mime_id
-            join mimes mb on mb.mime_id=pb.mime_id
-            where ea.id is null
-            and eb.parse_exception_id=0
-            group by MIME_TYPE_A, MIME_TYPE_B, eb.sort_stack_trace
-            order by MIME_TYPE_A asc, MIME_TYPE_B asc, COUNT desc
-        </sql>
-    </report>
-
-    <report reportName="NewExceptionsInBDetails"
-            reportFilename="exceptions/new_exceptions_in_B_details.xlsx"
-            format="xlsx"
-            includeSql="true">
-
-        <sql>
-            select file_path,
-            c.length as CONTAINER_LENGTH,
-            ma.mime_string as MIME_TYPE_A,
-            mb.mime_string as MIME_TYPE_B,
-            eb.orig_stack_trace, eb.sort_stack_trace
-            from exceptions_b eb
-            left join exceptions_a ea on ea.id = eb.id
-            join profiles_a pa on pa.id=eb.id
-            join profiles_b pb on pb.id=eb.id
-            join containers c on pa.container_id=c.container_id
-            join mimes ma on ma.mime_id=pa.mime_id
-            join mimes mb on mb.mime_id=pb.mime_id
-            where ea.id is null
-            and eb.parse_exception_id=0
-            order by MIME_TYPE_A asc, MIME_TYPE_B asc, eb.ORIG_STACK_TRACE
-        </sql>
-    </report>
-
-    <report reportName="StackTracesByMimeInA"
-            reportFilename="exceptions/stack_traces_by_mime_A.xlsx"
-            format="xlsx"
-            includeSql="true">
-
-        <sql>
-            select MIME_STRING as MIME_TYPE, e.sort_stack_trace, count(1) as
-            COUNT
-            from exceptions_a e
-            join profiles_a p on p.id=e.id
-            join mimes m on m.mime_id=p.mime_id
-            and e.parse_exception_id=0
-            group by MIME_TYPE, e.sort_stack_trace
-            order by MIME_TYPE asc, COUNT desc
-        </sql>
-    </report>
-
-    <report reportName="AllStackTracesInA"
-            reportFilename="exceptions/stack_traces_A.xlsx"
-            format="xlsx"
-            includeSql="true">
-
-        <sql>
-            select file_path,
-            c.length as CONTAINER_LENGTH,
-            mime_string as MIME_TYPE,
-            orig_stack_trace, sort_stack_trace
-            from exceptions_a e
-            join profiles_a p on p.id=e.id
-            join containers c on p.container_id=c.container_id
-            join mimes m on m.mime_id=p.mime_id
-            and e.parse_exception_id=0
-            order by MIME_TYPE asc, sort_stack_trace, orig_stack_trace,
-            CONTAINER_LENGTH asc
-        </sql>
-    </report>
-    <report reportName="AllStackTracesInB"
-            reportFilename="exceptions/stack_traces_B.xlsx"
-            format="xlsx"
-            includeSql="true">
-
-        <sql>
-            select file_path,
-            c.length as CONTAINER_LENGTH,
-            mime_string as MIME_TYPE,
-            orig_stack_trace, sort_stack_trace
-            from exceptions_b e
-            join profiles_b p on p.id=e.id
-            join containers c on p.container_id=c.container_id
-            join mimes m on m.mime_id=p.mime_id
-            and e.parse_exception_id=0
-            order by MIME_TYPE asc, sort_stack_trace, orig_stack_trace,
-            CONTAINER_LENGTH asc
-        </sql>
-    </report>
-
-    <report reportName="StackTracesByMimeInB"
-            reportFilename="exceptions/stack_traces_by_mime_B.xlsx"
-            format="xlsx"
-            includeSql="true">
-
-        <sql>
-            select MIME_STRING as MIME_TYPE, e.sort_stack_trace, count(1) as
-            COUNT
-            from exceptions_b e
-            join profiles_b p on p.id=e.id
-            join mimes m on m.mime_id=p.mime_id
-            and e.parse_exception_id=0
-            group by MIME_TYPE, e.sort_stack_trace
-            order by MIME_TYPE asc, COUNT desc
-        </sql>
-    </report>
-    <report reportName="extractExceptionsA"
-            reportFilename="exceptions/extract_exceptions_a.xlsx"
-            format="xlsx"
-            includeSql="true">
-        <sql>
-            select file_path, extract_exception_description
-            from extract_exceptions_a e
-            join ref_extract_exception_types t
-            on e.extract_exception_id=t.extract_exception_id
-        </sql>
-    </report>
-    <report reportName="extractExceptionsB"
-            reportFilename="exceptions/extract_exceptions_b.xlsx"
-            format="xlsx"
-            includeSql="true">
-        <sql>
-            select file_path, extract_exception_description
-            from extract_exceptions_b e
-            join ref_extract_exception_types t
-            on e.extract_exception_id=t.extract_exception_id
-        </sql>
-    </report>
-    <report reportName="parseExceptionTypesA"
-            reportFilename="exceptions/overall_exception_types_a.xlsx"
-            format="xlsx"
-            includeSql="true">
-        <sql>
-            select parse_exception_description, count(1)
-            from exceptions_a e
-            join ref_parse_exception_types t on
-            t.parse_exception_id=e.parse_exception_id
-            group by t.parse_exception_description
-        </sql>
-    </report>
-    <report reportName="parseExceptionTypesB"
-            reportFilename="exceptions/overall_exception_types_b.xlsx"
-            format="xlsx"
-            includeSql="true">
-        <sql>
-            select parse_exception_description, count(1)
-            from exceptions_b e
-            join ref_parse_exception_types t on
-            t.parse_exception_id=e.parse_exception_id
-            group by t.parse_exception_description
-        </sql>
-    </report>
-
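-    <!-- content comparison reports -->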
-    <report reportName="contentDiffsWExceptions"
-            reportFilename="content/content_diffs_with_exceptions.xlsx"
-            format="xlsx"
-            includeSql="true">
-
-        <sql>
-            select file_path,
-            c.length as CONTAINER_LENGTH,
-            ma.mime_string as MIME_STRING_A,
-            mb.mime_string as MIME_STRING_B,
-            ca.num_unique_tokens as NUM_UNIQUE_TOKENS_A,
-            cb.num_unique_tokens as NUM_UNIQUE_TOKENS_B,
-            ca.num_tokens as NUM_TOKENS_A,
-            cb.num_tokens as NUM_TOKENS_B,
-            ca.common_tokens_lang as COMMON_TOKENS_LANG_A,
-            ca.num_common_tokens as NUM_COMMON_TOKENS_A,
-            cb.common_tokens_lang as COMMON_TOKENS_LANG_B,
-            cb.num_common_tokens as NUM_COMMON_TOKENS_B,
-            coalesce(cb.num_common_tokens,0)-
-            coalesce(ca.num_common_tokens, 0) as NUM_COMMON_TOKENS_DIFF_IN_B,
-            ca.top_n_tokens as TOP_N_TOKENS_A,
-            cb.top_n_tokens as TOP_N_TOKENS_B,
-            ca.unicode_char_blocks as UNICODE_CHAR_BLOCKS_A,
-            cb.unicode_char_blocks as UNICODE_CHAR_BLOCKS_B,
-            top_10_unique_token_diffs_a,
-            top_10_unique_token_diffs_b,
-            top_10_more_in_a, top_10_more_in_b, dice_coefficient, overlap,
-            ref_ea.parse_exception_description as EXCEPTION_A,
-            ref_eb.parse_exception_description as EXCEPTION_B
-            from content_comparisons cc
-            join contents_a ca on ca.id=cc.id
-            left join contents_b cb on cb.id=cc.id
-            join profiles_a pa on pa.id = cc.id
-            join profiles_b pb on pb.id=cc.id
-            join containers c on c.container_id=pa.container_id
-            join mimes ma on ma.mime_id=pa.mime_id
-            join mimes mb on mb.mime_id=pb.mime_id
-            left join exceptions_a ea on ea.id=cc.id
-            left join exceptions_b eb on eb.id=cc.id
-            left join ref_parse_exception_types ref_ea on ref_ea.parse_exception_id=ea.parse_exception_id
-            left join ref_parse_exception_types ref_eb on ref_eb.parse_exception_id=eb.parse_exception_id
-            where (overlap &lt; 0.95 or abs(ca.NUM_TOKENS-cb.NUM_TOKENS) &gt;30)
-            and (ea.parse_exception_id is null or
-            ea.parse_exception_id &lt;&gt; 2)
-            and (eb.parse_exception_id is null or
-            eb.parse_exception_id &lt;&gt; 2)
-            order by ma.mime_string, overlap asc
-            limit 100000
-        </sql>
-    </report>
-    <report reportName="contentDiffsNoExceptions"
-            reportFilename="content/content_diffs_no_exceptions.xlsx"
-            format="xlsx"
-            includeSql="true">
-
-        <sql>
-            select file_path,
-            c.length as CONTAINER_LENGTH,
-            ma.mime_string as MIME_STRING_A,
-            mb.mime_string as MIME_STRING_B,
-            ca.NUM_UNIQUE_TOKENS as NUM_UNIQUE_TOKENS_A,
-            cb.NUM_UNIQUE_TOKENS as NUM_UNIQUE_TOKENS_B,
-            ca.NUM_TOKENS as NUM_TOKENS_A,
-            cb.NUM_TOKENS as NUM_TOKENS_B,
-            ca.common_tokens_lang as COMMON_TOKENS_LANG_A,
-            ca.num_common_tokens as NUM_COMMON_TOKENS_A,
-            cb.common_tokens_lang as COMMON_TOKENS_LANG_B,
-            cb.num_common_tokens as NUM_COMMON_TOKENS_B,
-            coalesce(cb.num_common_tokens,0)-
-            coalesce(ca.num_common_tokens, 0) as NUM_COMMON_TOKENS_DIFF_IN_B,
-            ca.top_n_tokens as TOP_N_TOKENS_A,
-            cb.top_n_tokens as TOP_N_TOKENS_B,
-            ca.unicode_char_blocks as UNICODE_CHAR_BLOCKS_A,
-            cb.unicode_char_blocks as UNICODE_CHAR_BLOCKS_B,
-            top_10_unique_token_diffs_a,
-            top_10_unique_token_diffs_b,
-            top_10_more_in_a, top_10_more_in_b, dice_coefficient, overlap
-            from content_comparisons cc
-            join contents_a ca on ca.id=cc.id
-            join contents_b cb on cb.id=cc.id
-            join profiles_a pa on pa.id = cc.id
-            join profiles_b pb on pb.id=cc.id
-            join containers c on c.container_id=pa.container_id
-            join mimes ma on ma.mime_id=pa.mime_id
-            join mimes mb on mb.mime_id=pb.mime_id
-            left join exceptions_a ea on ea.id=cc.id
-            left join exceptions_b eb on eb.id=cc.id
-            where (overlap &lt; 0.95 or abs(ca.NUM_TOKENS-cb.NUM_TOKENS) &gt;30)
-            and (ea.parse_exception_id is null)
-            and (eb.parse_exception_id is null)
-            order by ma.mime_string, overlap asc
-            limit 100000
-        </sql>
-    </report>
-
-    <report reportName="CommonTokenComparisonsByMimeType"
-            reportFilename="content/common_token_comparisons_by_mime.xlsx"
-            format="xlsx"
-            includeSql="true">
-
-        <sql>
-            select ma.mime_string as MIME_STRING_A, mb.mime_string as MIME_STRING_B,
-            num_tokens_a, num_tokens_b,
-            num_alphabetic_tokens_a, num_alphabetic_tokens_b,
-            num_common_tokens_a, num_common_tokens_b,
-            coalesce(num_common_tokens_b, 0)-coalesce(num_common_tokens_a, 0) as change_in_common_tokens_b
-            from token_counts_compared tcc
-            join mimes ma on tcc.mime_id_a = ma.mime_id
-            join mimes mb on tcc.mime_id_b = mb.mime_id
-            order by change_in_common_tokens_b desc
-        </sql>
-    </report>
-    <report reportName="PageCountDiffs"
-            reportFilename="content/page_count_diffs.xlsx"
-            format="xlsx"
-            includeSql="true">
-
-        <sql>
-            select file_path,
-            c.length as CONTAINER_LENGTH,
-            ma.mime_string as MIME_STRING_A,
-            mb.mime_string as MIME_STRING_B,
-            pa.num_pages as NUM_PAGES_A,
-            pb.num_pages as NUM_PAGES_B,
-            (pb.num_pages-pa.num_pages) as DIFF_NUM_PAGES_IN_B
-            from profiles_a pa
-            join profiles_b pb on pa.id = pb.id
-            join containers c on pa.container_id=c.container_id
-            join mimes ma on ma.mime_id=pa.mime_id
-            join mimes mb on mb.mime_id=pb.mime_id
-            where pa.num_pages is not null
-            and pb.num_pages is not null
-            and pa.num_pages &lt;&gt; pb.num_pages
-            order by DIFF_NUM_PAGES_IN_B asc
-            limit 10000;
-        </sql>
-    </report>
-
-
-    <report reportName="ExceptionComparisonsByMimeType"
-            reportFilename="exceptions/exceptions_compared_by_mime_type.xlsx"
-            format="xlsx"
-            includeSql="true">
-
-        <sql>
-            select ma.mime_string as mime_string_a, mb.mime_string as mime_string_b,
-            total, exc_cnt_a,
-            exc_cnt_b,
-            exc_prcnt_a,
-            exc_prcnt_b, notes
-
-            from exceptions_compared e
-            join mimes ma on ma.mime_id=e.mime_id_a
-            join mimes mb on mb.mime_id=e.mime_id_b
-            order by (exc_prcnt_b-exc_prcnt_a) desc, total desc;
-        </sql>
-    </report>
-    <!--    <report reportName="MD5 Duplicate Counts A"
-                reportFilename="md5/md5_duplicate_counts_A.xlsx"
-                format="xlsx"
-                            includeSql="true">
-            <sql>
-                select md5, count(1) cnt
-                from profiles_a
-                group by md5
-                having cnt > 2
-                order by cnt desc
-            </sql>
-        </report>
-
-        <report reportName="MD5 Duplicate Counts B"
-                reportFilename="md5/md5_duplicate_counts_B.xlsx"
-                format="xlsx"
-                            includeSql="true">
-
-            <sql>
-                select md5, count(1) cnt
-                from profiles_b
-                group by md5
-                having cnt > 2
-                order by cnt desc
-            </sql>
-        </report>
-
-        <report reportName="MD5 Duplicates A"
-                reportFilename="md5/md5_duplicates_A.xlsx"
-                format="xlsx"
-                            includeSql="true">
-
-            <sql>
-                select file_path, file_name, is_embedded, content_length, NUM_TOKENS, p.md5
-                from md5_multiples_tmp_a t
-                join profiles_a p on p.md5 = t.md5
-                join containers c on p.container_id = c.container_id
-                join contents_a cb on p.id=cb.id
-                order by t.cnt desc
-            </sql>
-        </report>
-
-        <report reportName="MD5 Duplicates B"
-                reportFilename="md5/md5_duplicates_B.xlsx"
-                format="xlsx"
-                            includeSql="true">
-
-            <sql>
-                select file_path, file_name, is_embedded, content_length, NUM_TOKENS, p.md5
-                from md5_multiples_tmp_b t
-                join profiles_b p on p.md5 = t.md5
-                join containers c on p.container_id = c.container_id
-                join contents_b cb on p.id=cb.id
-                order by t.cnt desc
-            </sql>
-        </report>
-    -->
-
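-    <!-- attachment count comparison reports -->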
-    <report reportName="Attachment Diffs no Exceptions"
-            reportFilename="attachments/attachment_diffs_no_exceptions.xlsx"
-            format="xlsx"
-            includeSql="true">
-
-        <sql>
-            select file_path,
-            c.length as CONTAINER_LENGTH,
-            ma.mime_string as MIME_STRING_A,
-            mb.mime_string as MIME_STRING_B,
-            pa.num_attachments as NUM_ATTACHMENTS_A,
-            pb.num_attachments as NUM_ATTACHMENTS_B,
-            pb.num_attachments-pa.num_attachments as NUM_ATTACHMENTS_DIFF_IN_B
-            from profiles_a pa
-            join profiles_b pb on pa.id= pb.id
-            join containers c on pa.container_id=c.container_id
-            join mimes ma on pa.mime_id=ma.mime_id
-            join mimes mb on pb.mime_id=mb.mime_id
-            left join exceptions_a ea on ea.id=pa.id
-            left join exceptions_b eb on eb.id=pb.id
-            where pa.is_embedded=false and
-            ea.parse_exception_id is null and
-            eb.parse_exception_id is null
-            and pa.num_attachments &lt;&gt; pb.num_attachments
-            order by ma.mime_string, pb.num_attachments-pa.num_attachments
-            limit 100000;
-        </sql>
-    </report>
-
-    <report reportName="Attachment Diffs with exceptions"
-            reportFilename="attachments/attachment_diffs_with_exceptions.xlsx"
-            format="xlsx"
-            includeSql="true">
-
-        <sql>
-            select file_path,
-            c.length as CONTAINER_LENGTH,
-            ma.mime_string as MIME_STRING_A,
-            mb.mime_string as MIME_STRING_B,
-            pa.num_attachments as NUM_ATTACHMENTS_A,
-            pb.num_attachments as NUM_ATTACHMENTS_B,
-            pb.num_attachments-pa.num_attachments as NUM_ATTACHMENTS_DIFF_IN_B,
-            refea.parse_exception_description as PARSE_EXCEPTION_A,
-            refeb.parse_exception_description as PARSE_EXCEPTION_B
-            from profiles_a pa
-            join profiles_b pb on pa.id= pb.id
-            join containers c on pa.container_id=c.container_id
-            join mimes ma on pa.mime_id=ma.mime_id
-            join mimes mb on pb.mime_id=mb.mime_id
-            left join exceptions_a ea on ea.id=pa.id
-            left join exceptions_b eb on eb.id=pb.id
-            left join ref_parse_exception_types refea on ea.parse_exception_id=refea.parse_exception_id
-            left join ref_parse_exception_types refeb on eb.parse_exception_id=refeb.parse_exception_id
-            where pa.is_embedded=false
-            and pa.num_attachments &lt;&gt; pb.num_attachments
-            order by ma.mime_string, pb.num_attachments-pa.num_attachments
-            limit 100000;
-        </sql>
-    </report>
-
-    <report reportName="Files missing in B by Mime"
-            reportFilename="attachments/all_files_missing_in_B_by_mime.xlsx"
-            format="xlsx"
-            includeSql="true">
-
-        <sql>
-            select mime_string, count(1) as cnt
-            from profiles_a pa
-            left join profiles_b pb on pa.id=pb.id
-            join mimes m on pa.mime_id=m.mime_id
-            where pb.id is null
-            group by mime_string
-            order by cnt desc
-        </sql>
-    </report>
-    <report reportName="Container files missing in B by Mime"
-            reportFilename="attachments/container_files_missing_in_B_by_mime.xlsx"
-            format="xlsx"
-            includeSql="true">
-
-        <sql>
-            select mime_string, count(1) as cnt
-            from profiles_a pa
-            left join profiles_b pb on pa.id=pb.id
-            join mimes m on pa.mime_id=m.mime_id
-            where pb.id is null and pa.is_embedded=false
-            group by mime_string
-            order by cnt desc
-        </sql>
-    </report>
-    <report reportName="Embedded files missing in B by Mime"
-            reportFilename="attachments/embedded_files_missing_in_B_by_mime.xlsx"
-            format="xlsx"
-            includeSql="true">
-
-        <sql>
-            select mime_string, count(1) as cnt
-            from profiles_a pa
-            left join profiles_b pb on pa.id=pb.id
-            join mimes m on pa.mime_id=m.mime_id
-            where pb.id is null and pa.is_embedded=true
-            group by mime_string
-            order by cnt desc
-        </sql>
-    </report>
-    <report reportName="All files missing in A by Mime"
-            reportFilename="attachments/all_files_missing_in_A_by_mime.xlsx"
-            format="xlsx"
-            includeSql="true">
-
-        <sql>
-            select mime_string, count(1) as cnt
-            from profiles_b pb
-            left join profiles_a pa on pb.id=pa.id
-            join mimes m on pb.mime_id=m.mime_id
-            where pa.id is null
-            group by mime_string
-            order by cnt desc
-        </sql>
-    </report>
-    <report reportName="Container files missing in A by Mime"
-            reportFilename="attachments/container_files_missing_in_A_by_mime.xlsx"
-            format="xlsx"
-            includeSql="true">
-
-        <sql>
-            select mime_string, count(1) as cnt
-            from profiles_b pb
-            left join profiles_a pa on pb.id=pa.id
-            join mimes m on pb.mime_id=m.mime_id
-            where pa.id is null and pb.is_embedded=false
-            group by mime_string
-            order by cnt desc
-        </sql>
-    </report>
-    <report reportName="Embedded files missing in A by Mime"
-            reportFilename="attachments/embedded_files_missing_in_A_by_mime.xlsx"
-            format="xlsx"
-            includeSql="true">
-
-        <sql>
-            select mime_string, count(1) as cnt
-            from profiles_b pb
-            left join profiles_a pa on pb.id=pa.id
-            join mimes m on pb.mime_id=m.mime_id
-            where pa.id is null and pb.is_embedded=true
-            group by mime_string
-            order by cnt desc
-        </sql>
-    </report>
-    <!-- metadata values -->
-    <report reportName="Metadata Value Diffs"
-            reportFilename="metadata/metadata_value_count_diffs.xlsx"
-            format="xlsx"
-            includeSql="true">
-
-        <sql>
-            select file_path,
-            ma.mime_string as mime_string_a,
-            mb.mime_string as mime_string_b,
-            pa.num_metadata_values as num_metadata_values_a,
-            pb.num_metadata_values as num_metadata_values_b,
-            ea.parse_exception_id as parse_ex_id_a,
-            eb.parse_exception_id as parse_ex_id_b
-            from profiles_a pa
-            join profiles_b pb on pa.id= pb.id
-            join containers c on pa.container_id=c.container_id
-            join mimes ma on pa.mime_id=ma.mime_id
-            join mimes mb on pb.mime_id=mb.mime_id
-            left join exceptions_a ea on ea.id=pa.id
-            left join exceptions_b eb on eb.id=pb.id
-            where
-            ea.parse_exception_id is null and
-            eb.parse_exception_id is null
-            and pa.num_metadata_values &lt;&gt; pb.num_metadata_values
-            order by ma.mime_string,
-            pb.num_metadata_values-pa.num_metadata_values
-            limit 100000
-        </sql>
-    </report>
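-    <!-- tag counts and tag exceptions -->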
-    <report reportName="Tag Count Diffs By Mime"
-            reportFilename="tags/tag_count_diffs_by_mime.xlsx"
-            format="xlsx"
-            includeSql="true">
-        <sql>
-            select ma.mime_string as mime_string_a,
-            mb.mime_string as mime_string_b,
-            tags_a_a,
-            tags_a_b,
-            tags_b_a,
-            tags_b_b,
-            tags_div_a,
-            tags_div_b,
-            tags_i_a,
-            tags_i_b,
-            tags_li_a,
-            tags_li_b,
-            tags_ol_a,
-            tags_ol_b,
-            tags_p_a,
-            tags_p_b,
-            tags_table_a,
-            tags_table_b,
-            tags_td_a,
-            tags_td_b,
-            tags_title_a,
-            tags_title_b,
-            tags_tr_a,
-            tags_tr_b,
-            tags_u_a,
-            tags_u_b,
-            tags_ul_a,
-            tags_ul_b
-            from
-            tags_by_mime tbm
-            join mimes ma on tbm.mime_id_a=ma.mime_id
-            join mimes mb on tbm.mime_id_b=mb.mime_id
-            limit 100000
-        </sql>
-
-    </report>
-    <report reportName="Tag Exceptions By Mime"
-            reportFilename="tags/tag_exceptions_by_mime.xlsx"
-            format="xlsx"
-            includeSql="true">
-        <sql>
-            select ma.mime_string as mime_string_a,
-            mb.mime_string as mime_string_b,
-            tag_exceptions_a,
-            tag_exceptions_b,
-            (tag_exceptions_b-tag_exceptions_a) as diff_tag_exceptions_in_b
-            from tag_exceptions_by_mime tebm
-            join mimes ma on tebm.mime_id_a=ma.mime_id
-            join mimes mb on tebm.mime_id_b=mb.mime_id
-            order by diff_tag_exceptions_in_b desc
-        </sql>
-    </report>
-    <report reportName="Tag Exceptions Details A"
-                         reportFilename="tags/tag_exceptions_details_a.xlsx"
-                         format="xlsx"
-                         includeSql="true">
-        <sql>
-            select c.file_path,pa.file_name,mime_string,is_embedded from
-            tags_a ta
-            join profiles_a pa on ta.id=pa.id
-            join containers c on pa.container_id=c.container_id
-            join mimes m on pa.mime_id=m.mime_id
-            where ta.tags_parse_exception=true
-            order by m.mime_string
-            limit 20000
-        </sql>
-    </report>
-    <report reportName="Tag Exceptions Details B"
-            reportFilename="tags/tag_exceptions_details_b.xlsx"
-            format="xlsx"
-            includeSql="true">
-        <sql>
-            select c.file_path,pb.file_name,mime_string,is_embedded from
-            tags_b tb
-            join profiles_b pb on tb.id=pb.id
-            join containers c on pb.container_id=c.container_id
-            join mimes m on pb.mime_id=m.mime_id
-            where tb.tags_parse_exception=true
-            order by m.mime_string
-            limit 20000
-        </sql>
-    </report>
-
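-    <!-- parse times -->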
-    <report reportName="Parse Time (Millis) Compared"
-            reportFilename="parse_times/parse_time_millis_by_mime_compared.xlsx"
-            format="xlsx"
-            includeSql="true">
-        <sql>
-            select ma.mime_string as MIME_STRING_A,
-            mb.mime_string as MIME_STRING_B,
-            total_a as TOTAL_MILLIS_A, total_b as TOTAL_MILLIS_B,
-            prcnt_increase as PERCENT_INCREASE
-            from parse_time_compared ptc
-            join mimes ma on ptc.mime_id_a=ma.mime_id
-            join mimes mb on ptc.mime_id_b=mb.mime_id
-            where TOTAL_A &gt; 1000 AND TOTAL_B &gt; 1000 -- only show comparisons if &gt; a second
-            order by prcnt_increase desc
-        </sql>
-    </report>
-    <report reportName="Parse Time (Millis) Details"
-            reportFilename="parse_times/parse_time_millis_details.xlsx"
-            format="xlsx"
-            includeSql="true">
-        <sql>
-            select file_path, c.length as CONTAINER_LENGTH,
-            ma.mime_string as MIME_STRING_A,
-            mb.mime_string as MIME_STRING_B,
-            pa.elapsed_time_millis as TOTAL_MILLIS_A,
-            pb.elapsed_time_millis as TOTAL_MILLIS_B,
-            (pb.elapsed_time_millis-pa.elapsed_time_millis) as DIFF_MILLIS
-            from profiles_a pa
-            join profiles_b pb on pa.id=pb.id
-            join mimes ma on ma.mime_id=pa.mime_id
-            join mimes mb on mb.mime_id=pb.mime_id
-            join containers c on pa.container_id=c.container_id
-            order by DIFF_MILLIS desc
-            limit 20000;
-        </sql>
-    </report>
-    <after>
-        <sql>drop table if exists md5_multiples_tmp_a</sql>
-        <sql>drop table if exists md5_multiples_tmp_b</sql>
-    </after>
-</reports>
\ No newline at end of file
diff --git a/tika-eval/src/main/resources/comparison-reports.xml b/tika-eval/src/main/resources/comparison-reports.xml
deleted file mode 100644
index e23ec5e..0000000
--- a/tika-eval/src/main/resources/comparison-reports.xml
+++ /dev/null
@@ -1,1753 +0,0 @@
-<?xml version="1.0" encoding="UTF-8" standalone="no" ?>
-
-<!--
-  Licensed to the Apache Software Foundation (ASF) under one
-  or more contributor license agreements.  See the NOTICE file
-  distributed with this work for additional information
-  regarding copyright ownership.  The ASF licenses this file
-  to you under the Apache License, Version 2.0 (the
-  "License"); you may not use this file except in compliance
-  with the License.  You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-  Unless required by applicable law or agreed to in writing,
-  software distributed under the License is distributed on an
-  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-  KIND, either express or implied.  See the License for the
-  specific language governing permissions and limitations
-  under the License.
--->
-
-<reports>
-
-
-    <before>
-
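-        <!-- find md5 digests that occur more than once within each run -->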
-        <sql>drop table if exists md5_multiples_tmp_a</sql>
-        <sql>create table md5_multiples_tmp_a (MD5 char(32), cnt int)
-            as
-            select md5, count(1) cnt
-            from profiles_a
-            where md5 is not null
-            group by md5
-            having cnt &gt; 1
-            order by cnt desc
-        </sql>
-
-        <sql>drop table if exists md5_multiples_tmp_b</sql>
-        <sql>create table md5_multiples_tmp_b (MD5 char(32), cnt int)
-            as
-            select md5, count(1) cnt
-            from profiles_b
-            where md5 is not null
-            group by md5
-            having cnt &gt; 1
-            order by cnt desc
-        </sql>
-        <!-- build mime indexes -->
-
-        <sql>create index if not exists pa_m_idx
-            on profiles_a (mime_id);
-        </sql>
-
-        <sql>
-            create index if not exists pb_m_idx
-            on profiles_b (mime_id);
-        </sql>
-
-        <!-- build exceptions comparison table -->
-        <sql>drop table if exists exceptions_compared</sql>
-        <sql>
-            create table exceptions_compared (
-            mime_id_a integer,
-            mime_id_b integer,
-            total integer,
-            exc_cnt_a integer,
-            exc_cnt_b integer,
-            exc_prcnt_a float,
-            exc_prcnt_b float,
-            notes varchar(12)
-            );
-        </sql>
-        <sql>
-            insert into exceptions_compared (
-            select ma.mime_id, mb.mime_id, count(1) as total, 0, 0, 0.0, 0.0, ''
-            from profiles_a pa
-            join profiles_b pb on pa.id=pb.id
-            join mimes ma on pa.mime_id = ma.mime_id
-            join mimes mb on pb.mime_id = mb.mime_id
-            group by ma.mime_id, mb.mime_id
-            order by total desc );
-        </sql>
-
-        <sql>
-            update exceptions_compared ec set
-            exc_cnt_a = (
-            select count(1) as cnt
-            from exceptions_a ea
-            join profiles_a pa on ea.id=pa.id
-            join profiles_b pb on pb.id=pa.id
-            join mimes ma on pa.mime_id=ma.mime_id
-            join mimes mb on pb.mime_id=mb.mime_id
-            where ma.mime_id= ec.mime_id_a and mb.mime_id=ec.mime_id_b
-            group by ma.mime_id, mb.mime_id);
-        </sql>
-        <sql>
-            update exceptions_compared ec set
-            exc_cnt_b = (
-            select count(1) as cnt
-            from exceptions_b eb
-            join profiles_b pb on eb.id=pb.id
-            join profiles_a pa on pa.id=pb.id
-            join mimes ma on pa.mime_id=ma.mime_id
-            join mimes mb on pb.mime_id=mb.mime_id
-            where ma.mime_id= ec.mime_id_a and mb.mime_id=ec.mime_id_b
-            group by mb.mime_id, ma.mime_id);
-        </sql>
-        <sql>
-            update exceptions_compared
-            set exc_prcnt_a = cast(exc_cnt_a as decimal)/cast(total as decimal)
-            where total > 0;
-        </sql>
-        <sql>
-            update exceptions_compared
-            set exc_prcnt_b = cast(exc_cnt_b as decimal)/cast(total as decimal)
-            where total > 0;
-        </sql>
-
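-        <!-- flag mime pairs with more than 100 files whose exception rate shifted by more than 10 percentage points -->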
-        <sql>
-            update exceptions_compared
-            set notes = 'YAY!'
-            where total > 100 and (exc_prcnt_a-exc_prcnt_b) > 0.10;
-        </sql>
-        <sql>
-            update exceptions_compared
-            set notes = 'YIKES!'
-            where total > 100 and (exc_prcnt_b-exc_prcnt_a) > 0.10;
-        </sql>
-
-        <!-- build tmp token count comparison table -->
-        <sql>drop table if exists token_counts_compared</sql>
-        <sql>
-            create table token_counts_compared
-            (mime_id_a integer,
-            mime_id_b integer,
-            num_tokens_a long default 0,
-            num_tokens_b long default 0,
-            num_alphabetic_tokens_a long default 0,
-            num_alphabetic_tokens_b long default 0,
-            num_common_tokens_a long default 0,
-            num_common_tokens_b long default 0
-            );
-        </sql>
-        <sql>
-            insert into token_counts_compared (mime_id_a, mime_id_b)
-            select ma.mime_id, mb.mime_id
-            from profiles_a a
-            join profiles_b b on a.id=b.id
-            join mimes ma on ma.mime_id=a.mime_id
-            join mimes mb on mb.mime_id=b.mime_id
-            group by ma.mime_id, mb.mime_id
-
-        </sql>
-
-        <sql>
-            update token_counts_compared tcc set num_tokens_a=(
-            select sum(num_tokens) as cnt from profiles_a pa
-            join profiles_b pb on pa.id=pb.id
-            join contents_a c on c.id = pa.id
-            where pb.mime_id= tcc.mime_id_b
-            and pa.mime_id=tcc.mime_id_a
-            group by mime_id_a, mime_id_b
-            );
-        </sql>
-
-        <sql>
-            update token_counts_compared tcc set num_tokens_b=(
-            select sum(num_tokens) as cnt from profiles_b pb
-            join profiles_a pa on pa.id=pb.id
-            join contents_b c on c.id = pb.id
-            where pb.mime_id= tcc.mime_id_b
-            and pa.mime_id=tcc.mime_id_a
-            group by mime_id_a, mime_id_b
-            );
-        </sql>
-
-        <sql>
-            update token_counts_compared tcc set num_alphabetic_tokens_a=(
-            select sum(num_alphabetic_tokens) as cnt from profiles_a pa
-            join profiles_b pb on pa.id=pb.id
-            join contents_a c on c.id = pa.id
-            where pb.mime_id= tcc.mime_id_b
-            and pa.mime_id=tcc.mime_id_a
-            group by mime_id_a, mime_id_b
-            );
-        </sql>
-
-        <sql>
-            update token_counts_compared tcc set num_alphabetic_tokens_b=(
-            select sum(num_alphabetic_tokens) as cnt from profiles_b pb
-            join profiles_a pa on pb.id=pa.id
-            join contents_b c on c.id = pb.id
-            where pb.mime_id= tcc.mime_id_b
-            and pa.mime_id=tcc.mime_id_a
-            group by mime_id_a, mime_id_b
-            );
-        </sql>
-
-        <sql>
-            update token_counts_compared tcc set num_common_tokens_a=(
-            select sum(num_common_tokens) as cnt from profiles_a pa
-            join profiles_b pb on pa.id=pb.id
-            join contents_a c on c.id = pa.id
-            where pb.mime_id= tcc.mime_id_b
-            and pa.mime_id=tcc.mime_id_a
-            group by mime_id_a, mime_id_b
-            );
-        </sql>
-
-        <sql>
-            update token_counts_compared tcc set num_common_tokens_b=(
-            select sum(num_common_tokens) as cnt from profiles_b pb
-            join profiles_a pa on pa.id=pb.id
-            join contents_b c on c.id = pb.id
-            where pb.mime_id= tcc.mime_id_b
-            and pa.mime_id=tcc.mime_id_a
-            group by mime_id_a, mime_id_b
-            );
-        </sql>
-
-        <sql>drop table if exists tags_by_mime</sql>
-        <sql>create table tags_by_mime (
-                mime_id_a integer,
-                mime_id_b integer,
-                tags_a_a integer,
-                tags_b_a integer,
-                tags_div_a integer,
-                tags_i_a integer,
-                tags_img_a integer,
-                tags_li_a integer,
-                tags_ol_a integer,
-                tags_p_a integer,
-                tags_table_a integer,
-                tags_td_a integer,
-                tags_title_a integer,
-                tags_tr_a integer,
-                tags_u_a integer,
-                tags_ul_a integer,
-                tags_a_b integer,
-                tags_b_b integer,
-                tags_div_b integer,
-                tags_i_b integer,
-                tags_img_b integer,
-                tags_li_b integer,
-                tags_ol_b integer,
-                tags_p_b integer,
-                tags_table_b integer,
-                tags_td_b integer,
-                tags_title_b integer,
-                tags_tr_b integer,
-                tags_u_b integer,
-                tags_ul_b integer
-            );
-        </sql>
-        <sql>
-            insert into tags_by_mime (mime_id_a, mime_id_b)
-            select ma.mime_id, mb.mime_id
-            from profiles_a a
-            join profiles_b b on a.id=b.id
-            join mimes ma on ma.mime_id=a.mime_id
-            join mimes mb on mb.mime_id=b.mime_id
-            group by ma.mime_id, mb.mime_id
-        </sql>
-        <sql>
-            update tags_by_mime tbm set tags_a_a=(
-            select sum(ta.tags_a) as cnt from tags_a ta
-            join tags_b tb on ta.id=tb.id
-            join profiles_a pa on pa.id=ta.id
-            join profiles_b pb on pa.id=pb.id
-            where pb.mime_id= tbm.mime_id_b
-            and pa.mime_id=tbm.mime_id_a
-            and ta.tags_parse_exception=false
-            and tb.tags_parse_exception=false
-            group by mime_id_a, mime_id_b
-            );
-        </sql>
-        <sql>
-            update tags_by_mime tbm set tags_b_a=(
-            select sum(ta.tags_b) as cnt from tags_a ta
-            join tags_b tb on ta.id=tb.id
-            join profiles_a pa on pa.id=ta.id
-            join profiles_b pb on pa.id=pb.id
-            where pb.mime_id= tbm.mime_id_b
-            and pa.mime_id=tbm.mime_id_a
-            and ta.tags_parse_exception=false
-            and tb.tags_parse_exception=false
-            group by mime_id_a, mime_id_b
-            );
-        </sql>
-        <sql>
-            update tags_by_mime tbm set tags_div_a=(
-            select sum(ta.tags_div) as cnt from tags_a ta
-            join tags_b tb on ta.id=tb.id
-            join profiles_a pa on pa.id=ta.id
-            join profiles_b pb on pa.id=pb.id
-            where pb.mime_id= tbm.mime_id_b
-            and pa.mime_id=tbm.mime_id_a
-            and ta.tags_parse_exception=false
-            and tb.tags_parse_exception=false
-            group by mime_id_a, mime_id_b
-            );
-        </sql>
-        <sql>
-            update tags_by_mime tbm set tags_i_a=(
-            select sum(ta.tags_i) as cnt from tags_a ta
-            join tags_b tb on ta.id=tb.id
-            join profiles_a pa on pa.id=ta.id
-            join profiles_b pb on pa.id=pb.id
-            where pb.mime_id= tbm.mime_id_b
-            and pa.mime_id=tbm.mime_id_a
-            and ta.tags_parse_exception=false
-            and tb.tags_parse_exception=false
-            group by mime_id_a, mime_id_b
-            );
-        </sql>
-        <sql>
-            update tags_by_mime tbm set tags_img_a=(
-            select sum(ta.tags_img) as cnt from tags_a ta
-            join tags_b tb on ta.id=tb.id
-            join profiles_a pa on pa.id=ta.id
-            join profiles_b pb on pa.id=pb.id
-            where pb.mime_id= tbm.mime_id_b
-            and pa.mime_id=tbm.mime_id_a
-            and ta.tags_parse_exception=false
-            and tb.tags_parse_exception=false
-            group by mime_id_a, mime_id_b
-            );
-        </sql>
-        <sql>
-            update tags_by_mime tbm set tags_li_a=(
-            select sum(ta.tags_li) as cnt from tags_a ta
-            join tags_b tb on ta.id=tb.id
-            join profiles_a pa on pa.id=ta.id
-            join profiles_b pb on pa.id=pb.id
-            where pb.mime_id= tbm.mime_id_b
-            and pa.mime_id=tbm.mime_id_a
-            and ta.tags_parse_exception=false
-            and tb.tags_parse_exception=false
-            group by mime_id_a, mime_id_b
-            );
-        </sql>
-        <sql>
-            update tags_by_mime tbm set tags_ol_a=(
-            select sum(ta.tags_ol) as cnt from tags_a ta
-            join tags_b tb on ta.id=tb.id
-            join profiles_a pa on pa.id=ta.id
-            join profiles_b pb on pa.id=pb.id
-            where pb.mime_id= tbm.mime_id_b
-            and pa.mime_id=tbm.mime_id_a
-            and ta.tags_parse_exception=false
-            and tb.tags_parse_exception=false
-            group by mime_id_a, mime_id_b
-            );
-        </sql>
-        <sql>
-            update tags_by_mime tbm set tags_p_a=(
-            select sum(ta.tags_p) as cnt from tags_a ta
-            join tags_b tb on ta.id=tb.id
-            join profiles_a pa on pa.id=ta.id
-            join profiles_b pb on pa.id=pb.id
-            where pb.mime_id= tbm.mime_id_b
-            and pa.mime_id=tbm.mime_id_a
-            and ta.tags_parse_exception=false
-            and tb.tags_parse_exception=false
-            group by mime_id_a, mime_id_b
-            );
-        </sql>
-        <sql>
-            update tags_by_mime tbm set tags_table_a=(
-            select sum(ta.tags_table) as cnt from tags_a ta
-            join tags_b tb on ta.id=tb.id
-            join profiles_a pa on pa.id=ta.id
-            join profiles_b pb on pa.id=pb.id
-            where pb.mime_id= tbm.mime_id_b
-            and pa.mime_id=tbm.mime_id_a
-            and ta.tags_parse_exception=false
-            and tb.tags_parse_exception=false
-            group by mime_id_a, mime_id_b
-            );
-        </sql>
-        <sql>
-            update tags_by_mime tbm set tags_td_a=(
-            select sum(ta.tags_td) as cnt from tags_a ta
-            join tags_b tb on ta.id=tb.id
-            join profiles_a pa on pa.id=ta.id
-            join profiles_b pb on pa.id=pb.id
-            where pb.mime_id= tbm.mime_id_b
-            and pa.mime_id=tbm.mime_id_a
-            and ta.tags_parse_exception=false
-            and tb.tags_parse_exception=false
-            group by mime_id_a, mime_id_b
-            );
-        </sql>
-        <sql>
-            update tags_by_mime tbm set tags_title_a=(
-            select sum(ta.tags_title) as cnt from tags_a ta
-            join tags_b tb on ta.id=tb.id
-            join profiles_a pa on pa.id=ta.id
-            join profiles_b pb on pa.id=pb.id
-            where pb.mime_id= tbm.mime_id_b
-            and pa.mime_id=tbm.mime_id_a
-            and ta.tags_parse_exception=false
-            and tb.tags_parse_exception=false
-            group by mime_id_a, mime_id_b
-            );
-        </sql>
-        <sql>
-            update tags_by_mime tbm set tags_tr_a=(
-            select sum(ta.tags_tr) as cnt from tags_a ta
-            join tags_b tb on ta.id=tb.id
-            join profiles_a pa on pa.id=ta.id
-            join profiles_b pb on pa.id=pb.id
-            where pb.mime_id= tbm.mime_id_b
-            and pa.mime_id=tbm.mime_id_a
-            and ta.tags_parse_exception=false
-            and tb.tags_parse_exception=false
-            group by mime_id_a, mime_id_b
-            );
-        </sql>
-        <sql>
-            update tags_by_mime tbm set tags_u_a=(
-            select sum(ta.tags_u) as cnt from tags_a ta
-            join tags_b tb on ta.id=tb.id
-            join profiles_a pa on pa.id=ta.id
-            join profiles_b pb on pa.id=pb.id
-            where pb.mime_id= tbm.mime_id_b
-            and pa.mime_id=tbm.mime_id_a
-            and ta.tags_parse_exception=false
-            and tb.tags_parse_exception=false
-            group by mime_id_a, mime_id_b
-            );
-        </sql>
-        <sql>
-            update tags_by_mime tbm set tags_ul_a=(
-            select sum(ta.tags_ul) as cnt from tags_a ta
-            join tags_b tb on ta.id=tb.id
-            join profiles_a pa on pa.id=ta.id
-            join profiles_b pb on pa.id=pb.id
-            where pb.mime_id= tbm.mime_id_b
-            and pa.mime_id=tbm.mime_id_a
-            and ta.tags_parse_exception=false
-            and tb.tags_parse_exception=false
-            group by mime_id_a, mime_id_b
-            );
-        </sql>
-        <!-- now update tags_b counts -->
-        <sql>
-            update tags_by_mime tbm set tags_a_b=(
-            select sum(tb.tags_a) as cnt from tags_b tb
-            join tags_a ta on ta.id=tb.id
-            join profiles_a pa on pa.id=ta.id
-            join profiles_b pb on pa.id=pb.id
-            where pb.mime_id= tbm.mime_id_b
-            and pa.mime_id=tbm.mime_id_a
-            and ta.tags_parse_exception=false
-            and tb.tags_parse_exception=false
-            group by mime_id_a, mime_id_b
-            );
-        </sql>
-        <sql>
-            update tags_by_mime tbm set tags_b_b=(
-            select sum(tb.tags_b) as cnt from tags_b tb
-            join tags_a ta on ta.id=tb.id
-            join profiles_a pa on pa.id=ta.id
-            join profiles_b pb on pa.id=pb.id
-            where pb.mime_id= tbm.mime_id_b
-            and pa.mime_id=tbm.mime_id_a
-            and ta.tags_parse_exception=false
-            and tb.tags_parse_exception=false
-            group by mime_id_a, mime_id_b
-            );
-        </sql>
-        <sql>
-            update tags_by_mime tbm set tags_div_b=(
-            select sum(tb.tags_div) as cnt from tags_b tb
-            join tags_a ta on tb.id=ta.id
-            join profiles_a pa on pa.id=ta.id
-            join profiles_b pb on pa.id=pb.id
-            where pb.mime_id= tbm.mime_id_b
-            and pa.mime_id=tbm.mime_id_a
-            and ta.tags_parse_exception=false
-            and tb.tags_parse_exception=false
-            group by mime_id_a, mime_id_b
-            );
-        </sql>
-        <sql>
-            update tags_by_mime tbm set tags_i_b=(
-            select sum(tb.tags_i) as cnt from tags_b tb
-            join tags_a ta on tb.id=ta.id
-            join profiles_a pa on pa.id=ta.id
-            join profiles_b pb on pa.id=pb.id
-            where pb.mime_id= tbm.mime_id_b
-            and pa.mime_id=tbm.mime_id_a
-            and ta.tags_parse_exception=false
-            and tb.tags_parse_exception=false
-            group by mime_id_a, mime_id_b
-            );
-        </sql>
-        <sql>
-            update tags_by_mime tbm set tags_img_b=(
-            select sum(tb.tags_img) as cnt from tags_b tb
-            join tags_a ta on tb.id=ta.id
-            join profiles_a pa on pa.id=ta.id
-            join profiles_b pb on pa.id=pb.id
-            where pb.mime_id= tbm.mime_id_b
-            and pa.mime_id=tbm.mime_id_a
-            and ta.tags_parse_exception=false
-            and tb.tags_parse_exception=false
-            group by mime_id_a, mime_id_b
-            );
-        </sql>
-        <sql>
-            update tags_by_mime tbm set tags_li_b=(
-            select sum(tb.tags_li) as cnt from tags_b tb
-            join tags_a ta on ta.id=tb.id
-            join profiles_a pa on pa.id=ta.id
-            join profiles_b pb on pa.id=pb.id
-            where pb.mime_id= tbm.mime_id_b
-            and pa.mime_id=tbm.mime_id_a
-            and ta.tags_parse_exception=false
-            and tb.tags_parse_exception=false
-            group by mime_id_a, mime_id_b
-            );
-        </sql>
-        <sql>
-            update tags_by_mime tbm set tags_ol_b=(
-            select sum(tb.tags_ol) as cnt from tags_b tb
-            join tags_a ta on tb.id=ta.id
-            join profiles_a pa on pa.id=ta.id
-            join profiles_b pb on pa.id=pb.id
-            where pb.mime_id= tbm.mime_id_b
-            and pa.mime_id=tbm.mime_id_a
-            and ta.tags_parse_exception=false
-            and tb.tags_parse_exception=false
-            group by mime_id_a, mime_id_b
-            );
-        </sql>
-        <sql>
-            update tags_by_mime tbm set tags_p_b=(
-            select sum(tb.tags_p) as cnt from tags_b tb
-            join tags_a ta on tb.id=ta.id
-            join profiles_a pa on pa.id=ta.id
-            join profiles_b pb on pa.id=pb.id
-            where pb.mime_id= tbm.mime_id_b
-            and pa.mime_id=tbm.mime_id_a
-            and ta.tags_parse_exception=false
-            and tb.tags_parse_exception=false
-            group by mime_id_a, mime_id_b
-            );
-        </sql>
-        <sql>
-            update tags_by_mime tbm set tags_table_b=(
-            select sum(tb.tags_table) as cnt from tags_b tb
-            join tags_a ta on tb.id=ta.id
-            join profiles_a pa on pa.id=ta.id
-            join profiles_b pb on pa.id=pb.id
-            where pb.mime_id= tbm.mime_id_b
-            and pa.mime_id=tbm.mime_id_a
-            and ta.tags_parse_exception=false
-            and tb.tags_parse_exception=false
-            group by mime_id_a, mime_id_b
-            );
-        </sql>
-        <sql>
-            update tags_by_mime tbm set tags_td_b=(
-            select sum(tb.tags_td) as cnt from tags_b tb
-            join tags_a ta on tb.id=ta.id
-            join profiles_a pa on pa.id=ta.id
-            join profiles_b pb on pa.id=pb.id
-            where pb.mime_id= tbm.mime_id_b
-            and pa.mime_id=tbm.mime_id_a
-            and ta.tags_parse_exception=false
-            and tb.tags_parse_exception=false
-            group by mime_id_a, mime_id_b
-            );
-        </sql>
-        <sql>
-            update tags_by_mime tbm set tags_title_b=(
-            select sum(tb.tags_title) as cnt from tags_b tb
-            join tags_a ta on tb.id=ta.id
-            join profiles_a pa on pa.id=ta.id
-            join profiles_b pb on pa.id=pb.id
-            where pb.mime_id= tbm.mime_id_b
-            and pa.mime_id=tbm.mime_id_a
-            and ta.tags_parse_exception=false
-            and tb.tags_parse_exception=false
-            group by mime_id_a, mime_id_b
-            );
-        </sql>
-        <sql>
-            update tags_by_mime tbm set tags_tr_b=(
-            select sum(tb.tags_tr) as cnt from tags_b tb
-            join tags_a ta on tb.id=ta.id
-            join profiles_a pa on pa.id=ta.id
-            join profiles_b pb on pa.id=pb.id
-            where pb.mime_id= tbm.mime_id_b
-            and pa.mime_id=tbm.mime_id_a
-            and ta.tags_parse_exception=false
-            and tb.tags_parse_exception=false
-            group by mime_id_a, mime_id_b
-            );
-        </sql>
-        <sql>
-            update tags_by_mime tbm set tags_u_b=(
-            select sum(tb.tags_u) as cnt from tags_b tb
-            join tags_a ta on tb.id=ta.id
-            join profiles_a pa on pa.id=ta.id
-            join profiles_b pb on pa.id=pb.id
-            where pb.mime_id= tbm.mime_id_b
-            and pa.mime_id=tbm.mime_id_a
-            and ta.tags_parse_exception=false
-            and tb.tags_parse_exception=false
-            group by mime_id_a, mime_id_b
-            );
-        </sql>
-        <sql>
-            update tags_by_mime tbm set tags_ul_b=(
-            select sum(tb.tags_ul) as cnt from tags_b tb
-            join tags_a ta on ta.id=tb.id
-            join profiles_a pa on pa.id=ta.id
-            join profiles_b pb on pa.id=pb.id
-            where pb.mime_id= tbm.mime_id_b
-            and pa.mime_id=tbm.mime_id_a
-            and ta.tags_parse_exception=false
-            and tb.tags_parse_exception=false
-            group by mime_id_a, mime_id_b
-            );
-        </sql>
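-        <!-- count tag parse exceptions by mime pair -->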
-        <sql>drop table if exists tag_exceptions_by_mime</sql>
-        <sql>create table tag_exceptions_by_mime (
-            mime_id_a integer,
-            mime_id_b integer,
-            tag_exceptions_a integer,
-            tag_exceptions_b integer)
-        </sql>
-        <sql>
-            insert into tag_exceptions_by_mime (mime_id_a, mime_id_b,
-                tag_exceptions_a, tag_exceptions_b)
-            select ma.mime_id, mb.mime_id,0,0
-            from profiles_a a
-            join profiles_b b on a.id=b.id
-            join mimes ma on ma.mime_id=a.mime_id
-            join mimes mb on mb.mime_id=b.mime_id
-            group by ma.mime_id, mb.mime_id
-        </sql>
-        <sql>
-            update tag_exceptions_by_mime tebm set tag_exceptions_a=(
-            select count(1) as cnt from tags_a ta
-            join profiles_a pa on pa.id=ta.id
-            join profiles_b pb on pa.id=pb.id
-            where pb.mime_id= tebm.mime_id_b
-            and pa.mime_id=tebm.mime_id_a
-            and ta.tags_parse_exception=true
-            group by mime_id_a, mime_id_b
-            );
-        </sql>
-        <sql>
-            update tag_exceptions_by_mime tebm set tag_exceptions_b=(
-            select count(1) as cnt from tags_b tb
-            join profiles_a pa on pa.id=tb.id
-            join profiles_b pb on pa.id=pb.id
-            where pb.mime_id= tebm.mime_id_b
-            and pa.mime_id=tebm.mime_id_a
-            and tb.tags_parse_exception=true
-            group by mime_id_a, mime_id_b
-            );
-        </sql>
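-        <!-- build parse time comparison table by mime pair -->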
-        <sql>
-            drop table if exists parse_time_compared;
-        </sql>
-        <sql>
-            create table parse_time_compared (
-            mime_id_a integer,
-            mime_id_b integer,
-            total_a bigint,
-            total_b bigint,
-            prcnt_increase double
-            );
-        </sql>
-            <sql>
-                insert into parse_time_compared (mime_id_a, mime_id_b,
-                total_a, total_b, prcnt_increase)
-                select ma.mime_id, mb.mime_id,0,0,0.0
-                from profiles_a a
-                join profiles_b b on a.id=b.id
-                join mimes ma on ma.mime_id=a.mime_id
-                join mimes mb on mb.mime_id=b.mime_id
-                group by ma.mime_id, mb.mime_id
-            </sql>
-        <sql>
-            update parse_time_compared ptc set total_a=(
-            select sum(pa.elapsed_time_millis) as total_a from profiles_a pa
-            join profiles_b pb on pa.id=pb.id
-            where pa.mime_id= ptc.mime_id_a
-            and pb.mime_id=ptc.mime_id_b
-            group by mime_id_a, mime_id_b)
-        </sql>
-        <sql>
-            update parse_time_compared ptc set total_b=(
-            select sum(pb.elapsed_time_millis) as total_b from profiles_b pb
-            join profiles_a pa on pa.id=pb.id
-            where pa.mime_id= ptc.mime_id_a
... 6754 lines suppressed ...