Posted to commits@tika.apache.org by ta...@apache.org on 2020/12/02 17:04:59 UTC
[tika] branch main updated: TIKA-3241 -- fix more git add problems
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 1219866 TIKA-3241 -- fix more git add problems
1219866 is described below
commit 12198666bf0cf6811613c9ae27c4a384e42875b1
Author: tallison <ta...@apache.org>
AuthorDate: Wed Dec 2 12:04:33 2020 -0500
TIKA-3241 -- fix more git add problems
---
.../org/apache/tika/eval/app/AbstractProfiler.java | 821 +++++++++
.../org/apache/tika/eval/app/EvalFilePaths.java | 118 ++
.../org/apache/tika/eval/app/ExtractComparer.java | 538 ++++++
.../org/apache/tika/eval/app/ExtractProfiler.java | 281 ++++
.../org/apache/tika/eval/app/FileProfiler.java | 195 +++
.../java/org/apache/tika/eval/app/TikaEvalCLI.java | 326 ++++
.../apache/tika/eval/app/XMLErrorLogUpdater.java | 223 +++
.../tika/eval/app/batch/DBConsumersManager.java | 103 ++
.../tika/eval/app/batch/EvalConsumerBuilder.java | 240 +++
.../tika/eval/app/batch/EvalConsumersBuilder.java | 130 ++
.../eval/app/batch/ExtractComparerBuilder.java | 165 ++
.../eval/app/batch/ExtractProfilerBuilder.java | 122 ++
.../tika/eval/app/batch/FileProfilerBuilder.java | 98 ++
.../apache/tika/eval/app/db/AbstractDBBuffer.java | 77 +
.../java/org/apache/tika/eval/app/db/ColInfo.java | 127 ++
.../java/org/apache/tika/eval/app/db/Cols.java | 115 ++
.../java/org/apache/tika/eval/app/db/DBBuffer.java | 54 +
.../java/org/apache/tika/eval/app/db/H2Util.java | 89 +
.../java/org/apache/tika/eval/app/db/JDBCUtil.java | 316 ++++
.../org/apache/tika/eval/app/db/MimeBuffer.java | 147 ++
.../org/apache/tika/eval/app/db/TableInfo.java | 73 +
.../java/org/apache/tika/eval/app/io/DBWriter.java | 162 ++
.../org/apache/tika/eval/app/io/ExtractReader.java | 243 +++
.../tika/eval/app/io/ExtractReaderException.java | 48 +
.../org/apache/tika/eval/app/io/IDBWriter.java | 31 +
.../apache/tika/eval/app/io/XMLLogMsgHandler.java | 26 +
.../org/apache/tika/eval/app/io/XMLLogReader.java | 115 ++
.../org/apache/tika/eval/app/reports/Report.java | 198 +++
.../tika/eval/app/reports/ResultsReporter.java | 313 ++++
.../tika/eval/app/reports/XLSXHREFFormatter.java | 78 +
.../tika/eval/app/reports/XLSXNumFormatter.java | 54 +
.../tika/eval/app/reports/XSLXCellFormatter.java | 30 +
.../eval/app/tools/BatchTopCommonTokenCounter.java | 54 +
.../eval/app/tools/CommonTokenOverlapCounter.java | 69 +
.../apache/tika/eval/app/tools/LeipzigHelper.java | 57 +
.../apache/tika/eval/app/tools/LeipzigSampler.java | 76 +
.../eval/app/tools/SlowCompositeReaderWrapper.java | 327 ++++
.../tika/eval/app/tools/TopCommonTokenCounter.java | 401 +++++
.../apache/tika/eval/app/tools/TrainTestSplit.java | 114 ++
.../src/main/resources/comparison-reports-pg.xml | 1753 ++++++++++++++++++++
.../src/main/resources/comparison-reports.xml | 1753 ++++++++++++++++++++
.../tika-eval-app/src/main/resources/db.properties | 23 +
.../src/main/resources/profile-reports.xml | 329 ++++
.../main/resources/tika-eval-comparison-config.xml | 102 ++
.../resources/tika-eval-file-profiler-config.xml | 74 +
.../main/resources/tika-eval-profiler-config.xml | 96 ++
.../apache/tika/eval/app/AnalyzerManagerTest.java | 100 ++
.../apache/tika/eval/app/ComparerBatchTest.java | 399 +++++
.../org/apache/tika/eval/app/MockDBWriter.java | 83 +
.../apache/tika/eval/app/ProfilerBatchTest.java | 234 +++
.../apache/tika/eval/app/SimpleComparerTest.java | 439 +++++
.../org/apache/tika/eval/app/TikaEvalCLITest.java | 293 ++++
.../tika/eval/app/db/AbstractBufferTest.java | 160 ++
.../apache/tika/eval/app/io/ExtractReaderTest.java | 86 +
.../tika/eval/app/io/FatalExceptionReaderTest.java | 33 +
.../tika/eval/app/reports/ResultsReporterTest.java | 60 +
.../eval/app/tools/TopCommonTokenCounterTest.java | 94 ++
.../src/test/resources/common_tokens/en | 28 +
.../src/test/resources/common_tokens/es | 30 +
.../src/test/resources/common_tokens/zh-cn | 28 +
.../src/test/resources/common_tokens/zh-tw | 28 +
.../single-file-profiler-crawl-extract-config.xml | 76 +
.../single-file-profiler-crawl-input-config.xml | 77 +
.../test-dirs/batch-logs/batch-process-fatal.xml | 59 +
.../resources/test-dirs/extractsA/file1.pdf.json | 5 +
.../test-dirs/extractsA/file10_permahang.txt.json | 0
.../test-dirs/extractsA/file11_oom.txt.json | 0
.../test-dirs/extractsA/file12_es.txt.json | 4 +
.../extractsA/file13_attachANotB.doc.json | 11 +
.../extractsA/file14_diffAttachOrder.json | 19 +
.../resources/test-dirs/extractsA/file15_tags.json | 41 +
.../test-dirs/extractsA/file16_badTags.json | 41 +
.../test-dirs/extractsA/file17_tagsOutOfOrder.json | 41 +
.../test-dirs/extractsA/file2_attachANotB.doc.json | 10 +
.../test-dirs/extractsA/file3_attachBNotA.doc.json | 4 +
.../test-dirs/extractsA/file4_emptyB.pdf.json | 4 +
.../test-dirs/extractsA/file5_emptyA.pdf.json | 0
.../test-dirs/extractsA/file6_accessEx.pdf.json | 1 +
.../test-dirs/extractsA/file7_badJson.pdf.json | 4 +
.../test-dirs/extractsA/file8_IOEx.pdf.json | 1 +
.../resources/test-dirs/extractsB/file1.pdf.json | 2 +
.../test-dirs/extractsB/file11_oom.txt.json | 0
.../test-dirs/extractsB/file12_es.txt.json | 4 +
.../test-dirs/extractsB/file13_attachANotB.doc.txt | 1 +
.../extractsB/file14_diffAttachOrder.json | 19 +
.../resources/test-dirs/extractsB/file15_tags.html | 31 +
.../test-dirs/extractsB/file16_badTags.html | 31 +
.../test-dirs/extractsB/file2_attachANotB.doc.json | 4 +
.../test-dirs/extractsB/file3_attachBNotA.doc.json | 10 +
.../test-dirs/extractsB/file4_emptyB.pdf.json | 0
.../test-dirs/extractsB/file5_emptyA.pdf.json | 4 +
.../test-dirs/extractsB/file6_accessEx.pdf.json | 1 +
.../test-dirs/extractsB/file7_badJson.pdf.json | 0
.../test-dirs/extractsB/file8_IOEx.pdf.json | 1 +
.../test/resources/test-dirs/raw_input/file1.pdf | 13 +
.../resources/test-dirs/raw_input/file11_oom.txt | 2 +
.../test-dirs/raw_input/file2_attachANotB.doc | 13 +
.../test-dirs/raw_input/file3_attachBNotA.doc | 13 +
.../resources/test-dirs/raw_input/file4_emptyB.pdf | 13 +
.../resources/test-dirs/raw_input/file5_emptyA.pdf | 13 +
.../test-dirs/raw_input/file6_accessEx.pdf | 13 +
.../test-dirs/raw_input/file7_badJson.pdf | 13 +
.../resources/test-dirs/raw_input/file8_IOEx.pdf | 13 +
.../test-dirs/raw_input/file9_noextract.txt | 1 +
.../parser/iwork/iwana/IWork13PackageParser.java | 159 ++
.../apache/tika/parser/microsoft/EMFParser.java | 168 ++
.../tika/parser/microsoft/ExcelExtractor.java | 729 ++++++++
.../tika/parser/microsoft/FormattingUtils.java | 104 ++
.../tika/parser/microsoft/HSLFExtractor.java | 559 +++++++
.../tika/parser/microsoft/JackcessParser.java | 130 ++
.../apache/tika/parser/microsoft/LinkedCell.java | 41 +
.../tika/parser/microsoft/MSOwnerFileParser.java | 90 +
.../apache/tika/parser/microsoft/OfficeParser.java | 346 ++++
.../tika/parser/microsoft/OldExcelParser.java | 94 ++
.../tika/parser/microsoft/WordExtractor.java | 714 ++++++++
.../tika/parser/microsoft/chm/ChmAssert.java | 165 ++
.../tika/parser/microsoft/chm/ChmCommons.java | 365 ++++
.../tika/parser/microsoft/chm/ChmConstants.java | 102 ++
.../microsoft/chm/ChmDirectoryListingSet.java | 409 +++++
.../tika/parser/microsoft/chm/ChmItsfHeader.java | 484 ++++++
.../tika/parser/microsoft/chm/ChmPmgiHeader.java | 168 ++
.../tika/parser/microsoft/chm/ChmPmglHeader.java | 196 +++
.../tika/parser/microsoft/chm/ChmSection.java | 221 +++
.../tika/parser/microsoft/chm/ChmWrapper.java | 140 ++
.../tika/parser/microsoft/onenote/CompactID.java | 60 +
.../parser/microsoft/onenote/ExtendedGUID.java | 87 +
.../microsoft/onenote/FileChunkReference.java | 99 ++
.../tika/parser/microsoft/onenote/FileNode.java | 278 ++++
.../tika/parser/microsoft/onenote/FileNodePtr.java | 65 +
.../microsoft/onenote/FndStructureConstants.java | 166 ++
.../microsoft/onenote/GlobalIdTableEntry2FNDX.java | 40 +
.../onenote/ObjectDeclarationWithRefCount.java | 75 +
.../onenote/ObjectDeclarationWithRefCountBody.java | 73 +
.../onenote/ObjectInfoDependencyOverrides.java | 30 +
.../onenote/ObjectRevisionWithRefCountFNDX.java | 70 +
...ctSpaceObjectStreamOfOIDsOSIDsOrContextIDs.java | 63 +
.../onenote/OneNoteDirectFileResource.java | 87 +
.../onenote/OneNoteLegacyDumpStrings.java | 163 ++
.../parser/microsoft/onenote/OneNoteParser.java | 176 ++
.../microsoft/onenote/OneNotePropertyId.java | 86 +
.../parser/microsoft/onenote/PropertyValue.java | 137 ++
.../tika/parser/microsoft/onenote/Revision.java | 72 +
.../onenote/RevisionManifestListStart.java | 30 +
.../microsoft/onenote/RevisionRoleDeclaration.java | 30 +
.../microsoft/onenote/RootObjectReference.java | 40 +
.../microsoft/ooxml/OOXMLExtractorFactory.java | 304 ++++
.../tika/parser/microsoft/ooxml/OOXMLParser.java | 115 ++
.../ooxml/POIXMLTextExtractorDecorator.java | 49 +
.../microsoft/ooxml/ParagraphProperties.java | 56 +
.../tika/parser/microsoft/ooxml/RunProperties.java | 73 +
.../ooxml/XSSFBExcelExtractorDecorator.java | 174 ++
.../ooxml/XSSFExcelExtractorDecorator.java | 604 +++++++
.../microsoft/ooxml/xps/XPSTextExtractor.java | 67 +
.../ooxml/xwpf/XWPFEventBasedWordExtractor.java | 367 ++++
.../ooxml/xwpf/ml2006/BinaryDataHandler.java | 120 ++
.../microsoft/ooxml/xwpf/ml2006/PartHandler.java | 34 +
.../microsoft/ooxml/xwpf/ml2006/Relationship.java | 52 +
.../ooxml/xwpf/ml2006/Word2006MLDocHandler.java | 171 ++
.../tika/parser/microsoft/rtf/ListDescriptor.java | 35 +
.../parser/microsoft/rtf/RTFEmbObjHandler.java | 251 +++
.../parser/microsoft/rtf/RTFObjDataParser.java | 342 ++++
.../microsoft/xml/AbstractXML2003Parser.java | 136 ++
.../parser/microsoft/xml/HyperlinkHandler.java | 96 ++
.../tika/parser/microsoft/xml/WordMLParser.java | 309 ++++
.../services/org.apache.tika.parser.Parser | 29 +
165 files changed, 24287 insertions(+)
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/AbstractProfiler.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/AbstractProfiler.java
new file mode 100644
index 0000000..910fe09
--- /dev/null
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/AbstractProfiler.java
@@ -0,0 +1,821 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.eval.app;
+
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.sql.Types;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+import java.util.concurrent.ArrayBlockingQueue;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.commons.io.FilenameUtils;
+import org.apache.commons.lang3.mutable.MutableInt;
+import org.apache.commons.lang3.tuple.Pair;
+import org.apache.commons.math3.stat.descriptive.SummaryStatistics;
+import org.apache.tika.batch.FileResource;
+import org.apache.tika.batch.FileResourceConsumer;
+import org.apache.tika.batch.fs.FSProperties;
+import org.apache.tika.eval.app.db.ColInfo;
+import org.apache.tika.eval.app.db.Cols;
+import org.apache.tika.eval.app.db.TableInfo;
+import org.apache.tika.eval.app.io.ExtractReaderException;
+import org.apache.tika.eval.app.io.IDBWriter;
+import org.apache.tika.eval.core.langid.LanguageIDWrapper;
+import org.apache.tika.eval.core.textstats.BasicTokenCountStatsCalculator;
+import org.apache.tika.eval.core.textstats.CommonTokens;
+import org.apache.tika.eval.core.textstats.CompositeTextStatsCalculator;
+import org.apache.tika.eval.core.textstats.ContentLengthCalculator;
+import org.apache.tika.eval.core.textstats.TextStatsCalculator;
+import org.apache.tika.eval.core.textstats.TokenEntropy;
+import org.apache.tika.eval.core.textstats.TokenLengths;
+import org.apache.tika.eval.core.textstats.TopNTokens;
+import org.apache.tika.eval.core.textstats.UnicodeBlockCounter;
+import org.apache.tika.eval.core.tokens.AnalyzerManager;
+import org.apache.tika.eval.core.tokens.CommonTokenCountManager;
+import org.apache.tika.eval.core.tokens.CommonTokenResult;
+import org.apache.tika.eval.core.tokens.TokenCounts;
+import org.apache.tika.eval.core.tokens.TokenIntPair;
+import org.apache.tika.eval.core.util.ContentTagParser;
+import org.apache.tika.eval.core.util.ContentTags;
+import org.apache.tika.eval.core.util.EvalExceptionUtils;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.language.detect.LanguageResult;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.PagedText;
+import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler;
+import org.apache.tika.sax.RecursiveParserWrapperHandler;
+import org.apache.tika.sax.ToXMLContentHandler;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.xml.sax.SAXException;
+
+public abstract class AbstractProfiler extends FileResourceConsumer {
+
+ private static final Logger LOG = LoggerFactory.getLogger(AbstractProfiler.class);
+
+ private static final String[] EXTRACT_EXTENSIONS = {
+ ".json",
+ ".txt",
+ ""
+ };
+
+ private static final String[] COMPRESSION_EXTENSIONS = {
+ "",
+ ".bz2",
+ ".gzip",
+ ".zip",
+ };
+    static final long NON_EXISTENT_FILE_LENGTH = -1L;
+
+ public static TableInfo REF_EXTRACT_EXCEPTION_TYPES = new TableInfo("ref_extract_exception_types",
+ new ColInfo(Cols.EXTRACT_EXCEPTION_ID, Types.INTEGER),
+ new ColInfo(Cols.EXTRACT_EXCEPTION_DESCRIPTION, Types.VARCHAR, 128)
+ );
+
+
+ public static TableInfo REF_PARSE_ERROR_TYPES = new TableInfo("ref_parse_error_types",
+ new ColInfo(Cols.PARSE_ERROR_ID, Types.INTEGER),
+ new ColInfo(Cols.PARSE_ERROR_DESCRIPTION, Types.VARCHAR, 128)
+ );
+
+ public static TableInfo REF_PARSE_EXCEPTION_TYPES = new TableInfo("ref_parse_exception_types",
+ new ColInfo(Cols.PARSE_EXCEPTION_ID, Types.INTEGER),
+ new ColInfo(Cols.PARSE_EXCEPTION_DESCRIPTION, Types.VARCHAR, 128)
+ );
+
+ public static final String TRUE = Boolean.toString(true);
+ public static final String FALSE = Boolean.toString(false);
+ private static final String ZERO = "0";
+
+
+ protected static final AtomicInteger ID = new AtomicInteger();
+
+ private static final String UNKNOWN_EXTENSION = "unk";
+ //make this configurable
+ private static final String DIGEST_KEY = "X-TIKA:digest:MD5";
+
+ private static final Map<String, Cols> UC_TAGS_OF_INTEREST = initTags();
+
+ private static Map<String, Cols> initTags() {
+ //simplify this mess
+ Map<String, Cols> tmp = new HashMap<>();
+ tmp.put("A", Cols.TAGS_A);
+ tmp.put("B", Cols.TAGS_B);
+ tmp.put("DIV", Cols.TAGS_DIV);
+ tmp.put("I", Cols.TAGS_I);
+ tmp.put("IMG", Cols.TAGS_IMG);
+ tmp.put("LI", Cols.TAGS_LI);
+ tmp.put("OL", Cols.TAGS_OL);
+ tmp.put("P", Cols.TAGS_P);
+ tmp.put("TABLE", Cols.TAGS_TABLE);
+ tmp.put("TD", Cols.TAGS_TD);
+ tmp.put("TITLE", Cols.TAGS_TITLE);
+ tmp.put("TR", Cols.TAGS_TR);
+ tmp.put("U", Cols.TAGS_U);
+ tmp.put("UL", Cols.TAGS_UL);
+ return Collections.unmodifiableMap(tmp);
+ }
+
+ private static CommonTokenCountManager COMMON_TOKEN_COUNT_MANAGER;
+
+ private String lastExtractExtension = null;
+
+ AnalyzerManager analyzerManager;
+
+
+ public enum EXCEPTION_TYPE {
+ RUNTIME,
+ ENCRYPTION,
+ ACCESS_PERMISSION,
+ UNSUPPORTED_VERSION,
+ }
+
+ /**
+     * Parse error types recorded when information about a parse error
+     * was gathered from the log file
+ */
+ public enum PARSE_ERROR_TYPE {
+ OOM,
+ TIMEOUT
+ }
+
+ public static TableInfo MIME_TABLE = new TableInfo("mimes",
+ new ColInfo(Cols.MIME_ID, Types.INTEGER, "PRIMARY KEY"),
+ new ColInfo(Cols.MIME_STRING, Types.VARCHAR, 256),
+ new ColInfo(Cols.FILE_EXTENSION, Types.VARCHAR, 12)
+ );
+
+ private static Pattern FILE_NAME_CLEANER = Pattern.compile("\\.(json|txt)(\\.(bz2|gz|zip))?$");
+
+
+ final static int FILE_PATH_MAX_LEN = 1024;//max len for varchar for file_path
+ int maxContentLength = 10000000;
+ int maxContentLengthForLangId = 50000;
+ int maxTokens = 200000;
+
+
+
+ private final static Pattern ACCESS_PERMISSION_EXCEPTION =
+ Pattern.compile("org\\.apache\\.tika\\.exception\\.AccessPermissionException");
+ private final static Pattern ENCRYPTION_EXCEPTION =
+ Pattern.compile("org\\.apache\\.tika.exception\\.EncryptedDocumentException");
+
+ private static LanguageIDWrapper LANG_ID = new LanguageIDWrapper();
+
+ //TODO: allow configuration
+ //private TikaConfig config = TikaConfig.getDefaultConfig();
+ CompositeTextStatsCalculator compositeTextStatsCalculator;
+ protected IDBWriter writer;
+
+ /**
+ * @param p path to the common_tokens directory. If this is null, try to load from classPath
+     * @param defaultLangCode the language code to use if a common_words list doesn't exist for the
+     *                        detected language; can be <code>null</code>
+ * @throws IOException
+ */
+ public static void loadCommonTokens(Path p, String defaultLangCode) throws IOException {
+ COMMON_TOKEN_COUNT_MANAGER = new CommonTokenCountManager(p, defaultLangCode);
+ }
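+    // A minimal usage sketch (the directory name and lang code are hypothetical):
+    //   AbstractProfiler.loadCommonTokens(Paths.get("common_tokens"), "en");
+    // This should run before a profiler is constructed so that COMMON_TOKEN_COUNT_MANAGER
+    // is available to the CommonTokens calculator.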
+
+ public AbstractProfiler(ArrayBlockingQueue<FileResource> fileQueue,
+ IDBWriter writer) {
+ super(fileQueue);
+ this.writer = writer;
+ LanguageIDWrapper.setMaxTextLength(maxContentLengthForLangId);
+ this.compositeTextStatsCalculator = initAnalyzersAndTokenCounter(maxTokens, LANG_ID);
+ }
+
+ private CompositeTextStatsCalculator initAnalyzersAndTokenCounter(int maxTokens, LanguageIDWrapper langIder) {
+ analyzerManager = AnalyzerManager.newInstance(maxTokens);
+ List<TextStatsCalculator> calculators = new ArrayList<>();
+ calculators.add(new CommonTokens(COMMON_TOKEN_COUNT_MANAGER));
+ calculators.add(new TokenEntropy());
+ calculators.add(new TokenLengths());
+ calculators.add(new TopNTokens(10));
+ calculators.add(new BasicTokenCountStatsCalculator());
+ calculators.add(new ContentLengthCalculator());
+ calculators.add(new UnicodeBlockCounter(maxContentLengthForLangId));
+
+ return new CompositeTextStatsCalculator(calculators, analyzerManager.getGeneralAnalyzer(), langIder);
+ }
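+    // The composite calculator returns its results keyed by calculator class;
+    // e.g. textStats.get(ContentLengthCalculator.class) in writeContentData() below
+    // yields the content length, and textStats.get(CommonTokens.class) a CommonTokenResult.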
+
+ /**
+     * Truncate the content string to this length if it is longer
+ *
+ * @param maxContentLength
+ */
+ public void setMaxContentLength(int maxContentLength) {
+ this.maxContentLength = maxContentLength;
+ }
+
+ /**
+     * Truncate the content string to this length for language id if it is longer
+ *
+ * @param maxContentLengthForLangId
+ */
+ public void setMaxContentLengthForLangId(int maxContentLengthForLangId) {
+ this.maxContentLengthForLangId = maxContentLengthForLangId;
+ LanguageIDWrapper.setMaxTextLength(maxContentLengthForLangId);
+ }
+
+ /**
+     * Add a LimitTokenCountFilterFactory if maxTokens > -1
+ *
+ * @param maxTokens
+ */
+ public void setMaxTokens(int maxTokens) {
+ this.maxTokens = maxTokens;
+        this.compositeTextStatsCalculator = initAnalyzersAndTokenCounter(maxTokens, new LanguageIDWrapper());
+ }
+
+
+ protected void writeExtractException(TableInfo extractExceptionTable, String containerId,
+ String filePath, ExtractReaderException.TYPE type) throws IOException {
+ Map<Cols, String> data = new HashMap<>();
+ data.put(Cols.CONTAINER_ID, containerId);
+ data.put(Cols.FILE_PATH, filePath);
+ data.put(Cols.EXTRACT_EXCEPTION_ID, Integer.toString(type.ordinal()));
+ writer.writeRow(extractExceptionTable, data);
+
+ }
+
+ protected void writeProfileData(EvalFilePaths fps, int i,
+ ContentTags contentTags, Metadata m,
+ String fileId, String containerId,
+ List<Integer> numAttachments, TableInfo profileTable) {
+
+ Map<Cols, String> data = new HashMap<>();
+ data.put(Cols.ID, fileId);
+ data.put(Cols.CONTAINER_ID, containerId);
+ data.put(Cols.MD5, m.get(DIGEST_KEY));
+
+ if (i < numAttachments.size()) {
+ data.put(Cols.NUM_ATTACHMENTS, Integer.toString(numAttachments.get(i)));
+ }
+ data.put(Cols.ELAPSED_TIME_MILLIS, getTime(m));
+ data.put(Cols.NUM_METADATA_VALUES,
+ Integer.toString(countMetadataValues(m)));
+
+ Integer nPages = m.getInt(PagedText.N_PAGES);
+ if (nPages != null) {
+ data.put(Cols.NUM_PAGES, Integer.toString(nPages));
+ }
+
+ //if the outer wrapper document
+ if (i == 0) {
+ data.put(Cols.IS_EMBEDDED, FALSE);
+ data.put(Cols.FILE_NAME, fps.getRelativeSourceFilePath().getFileName().toString());
+ } else {
+ data.put(Cols.IS_EMBEDDED, TRUE);
+ data.put(Cols.FILE_NAME, getFileName(m.get(AbstractRecursiveParserWrapperHandler.EMBEDDED_RESOURCE_PATH)));
+ }
+ String ext = FilenameUtils.getExtension(data.get(Cols.FILE_NAME));
+ ext = (ext == null) ? "" : ext.toLowerCase(Locale.US);
+ data.put(Cols.FILE_EXTENSION, ext);
+ long srcFileLen = getSourceFileLength(m);
+ if (srcFileLen > NON_EXISTENT_FILE_LENGTH) {
+ data.put(Cols.LENGTH, Long.toString(srcFileLen));
+ } else {
+ data.put(Cols.LENGTH, "");
+ }
+
+ String content = contentTags.getContent();
+ if (content == null || content.trim().length() == 0) {
+ data.put(Cols.HAS_CONTENT, FALSE);
+ } else {
+ data.put(Cols.HAS_CONTENT, TRUE);
+ }
+ getFileTypes(m, data);
+ try {
+ writer.writeRow(profileTable, data);
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+ private static String getFileName(String path) {
+ if (path == null) {
+ return "";
+ }
+ //filenameUtils checks for a null byte in the path.
+ //it will throw an IllegalArgumentException if there is a null byte.
+ //given that we're recording names and not using them on a file path
+ //we should ignore this.
+ try {
+ return FilenameUtils.getName(path);
+ } catch (IllegalArgumentException e) {
+ LOG.warn("{} in {}", e.getMessage(), path);
+ }
+ path = path.replaceAll("\u0000", " ");
+ try {
+ return FilenameUtils.getName(path);
+ } catch (IllegalArgumentException e) {
+ LOG.warn("Again: {} in {}", e.getMessage(), path);
+ }
+ //give up
+ return "";
+ }
+
+ protected void writeExceptionData(String fileId, Metadata m, TableInfo exceptionTable) {
+ Map<Cols, String> data = new HashMap<>();
+ getExceptionStrings(m, data);
+ if (data.keySet().size() > 0) {
+ try {
+ data.put(Cols.ID, fileId);
+ writer.writeRow(exceptionTable, data);
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+ }
+ }
+
+ protected Map<Class, Object> calcTextStats(ContentTags contentTags) {
+/* if (contentTags == ContentTags.EMPTY_CONTENT_TAGS) {
+ return Collections.EMPTY_MAP;
+ }*/
+ Map<Cols, String> data = new HashMap<>();
+ String content = truncateContent(contentTags, maxContentLength, data);
+ if (content == null || content.trim().length() == 0) {
+ content = "";
+ }
+ return compositeTextStatsCalculator.calculate(content);
+ }
+
+ /**
+     * Checks whether the metadata is null or the content is empty (null or only whitespace).
+     * If so, this does no processing, and the fileId is not
+     * entered into the content table.
+ *
+ * @param fileId
+ * @param textStats
+ * @param contentsTable
+ */
+ protected void writeContentData(String fileId, Map<Class, Object> textStats, TableInfo contentsTable) throws IOException {
+ Map<Cols, String> data = new HashMap<>();
+ data.put(Cols.ID, fileId);
+ if (textStats.containsKey(ContentLengthCalculator.class)) {
+ int length = (int)textStats.get(ContentLengthCalculator.class);
+ if (length == 0) {
+ return;
+ }
+ data.put(Cols.CONTENT_LENGTH, Integer.toString(length));
+ }
+ langid(textStats, data);
+
+ writeTokenCounts(textStats, data);
+ CommonTokenResult commonTokenResult = (CommonTokenResult) textStats.get(CommonTokens.class);
+ if (commonTokenResult != null) {
+ data.put(Cols.COMMON_TOKENS_LANG, commonTokenResult.getLangCode());
+ data.put(Cols.NUM_UNIQUE_COMMON_TOKENS, Integer.toString(commonTokenResult.getUniqueCommonTokens()));
+ data.put(Cols.NUM_COMMON_TOKENS, Integer.toString(commonTokenResult.getCommonTokens()));
+ data.put(Cols.NUM_UNIQUE_ALPHABETIC_TOKENS,
+ Integer.toString(commonTokenResult.getUniqueAlphabeticTokens()));
+ data.put(Cols.NUM_ALPHABETIC_TOKENS,
+ Integer.toString(commonTokenResult.getAlphabeticTokens()));
+ }
+ TokenCounts tokenCounts = (TokenCounts) textStats.get(BasicTokenCountStatsCalculator.class);
+ if (tokenCounts != null) {
+
+ data.put(Cols.NUM_UNIQUE_TOKENS,
+ Integer.toString(tokenCounts.getTotalUniqueTokens()));
+ data.put(Cols.NUM_TOKENS,
+ Integer.toString(tokenCounts.getTotalTokens()));
+ }
+ if (textStats.get(TokenEntropy.class) != null) {
+ data.put(Cols.TOKEN_ENTROPY_RATE,
+ Double.toString((Double) textStats.get(TokenEntropy.class)));
+ }
+
+ SummaryStatistics summStats = (SummaryStatistics) textStats.get(TokenLengths.class);
+ if (summStats != null) {
+ data.put(Cols.TOKEN_LENGTH_SUM,
+ Integer.toString((int) summStats.getSum()));
+
+ data.put(Cols.TOKEN_LENGTH_MEAN,
+ Double.toString(summStats.getMean()));
+
+ data.put(Cols.TOKEN_LENGTH_STD_DEV,
+ Double.toString(summStats.getStandardDeviation()));
+ }
+ unicodeBlocks(textStats, data);
+ try {
+ writer.writeRow(contentsTable, data);
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+ void writeTagData(String fileId, ContentTags contentTags, TableInfo tagsTable) {
+ Map<String, Integer> tags = contentTags.getTags();
+        if (tags.size() == 0 && !contentTags.getParseException()) {
+ return;
+ }
+ Map<Cols, String> data = new HashMap<>();
+ data.put(Cols.ID, fileId);
+
+ for (Map.Entry<String, Cols> e : UC_TAGS_OF_INTEREST.entrySet()) {
+ Integer count = tags.get(e.getKey());
+ if (count == null) {
+ data.put(e.getValue(), ZERO);
+ } else {
+ data.put(e.getValue(), Integer.toString(count));
+ }
+ }
+
+ if (contentTags.getParseException()) {
+ data.put(Cols.TAGS_PARSE_EXCEPTION, TRUE);
+ } else {
+ data.put(Cols.TAGS_PARSE_EXCEPTION, FALSE);
+ }
+ try {
+ writer.writeRow(tagsTable, data);
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+
+ String getTime(Metadata m) {
+ String elapsed = "-1";
+
+ String v = m.get(AbstractRecursiveParserWrapperHandler.PARSE_TIME_MILLIS);
+ if (v != null) {
+ return v;
+ }
+ return elapsed;
+ }
+
+ int countMetadataValues(Metadata m) {
+ if (m == null) {
+ return 0;
+ }
+ int i = 0;
+ for (String n : m.names()) {
+ i += m.getValues(n).length;
+ }
+ return i;
+ }
+
+ void getExceptionStrings(Metadata metadata, Map<Cols, String> data) {
+
+ String fullTrace = metadata.get(RecursiveParserWrapperHandler.CONTAINER_EXCEPTION);
+
+ if (fullTrace == null) {
+ fullTrace = metadata.get(AbstractRecursiveParserWrapperHandler.EMBEDDED_EXCEPTION);
+ }
+
+ if (fullTrace != null) {
+ //check for "expected" exceptions...exceptions
+ //that can't be fixed.
+ //Do not store trace for "expected" exceptions
+
+ Matcher matcher = ACCESS_PERMISSION_EXCEPTION.matcher(fullTrace);
+ if (matcher.find()) {
+ data.put(Cols.PARSE_EXCEPTION_ID,
+ Integer.toString(EXCEPTION_TYPE.ACCESS_PERMISSION.ordinal()));
+ return;
+ }
+ matcher = ENCRYPTION_EXCEPTION.matcher(fullTrace);
+ if (matcher.find()) {
+ data.put(Cols.PARSE_EXCEPTION_ID,
+ Integer.toString(EXCEPTION_TYPE.ENCRYPTION.ordinal()));
+ return;
+ }
+
+ data.put(Cols.PARSE_EXCEPTION_ID,
+ Integer.toString(EXCEPTION_TYPE.RUNTIME.ordinal()));
+
+ data.put(Cols.ORIG_STACK_TRACE, fullTrace);
+ //TikaExceptions can have object ids, as in the "@2b1ea6ee" in:
+ //org.apache.tika.exception.TikaException: TIKA-198: Illegal
+ //IOException from org.apache.tika.parser.microsoft.OfficeParser@2b1ea6ee
+ //For reporting purposes, let's snip off the object id so that we can more
+ //easily count exceptions.
+ String sortTrace = EvalExceptionUtils.normalize(fullTrace);
+ data.put(Cols.SORT_STACK_TRACE, sortTrace);
+ }
+ }
+
+ /**
+     * Get the content, truncated to maxLength if necessary, and record in the data
+     * under {@link Cols#CONTENT_TRUNCATED_AT_MAX_LEN} whether the string was truncated.
+     *
+     * @param contentTags
+     * @param maxLength
+     * @param data
+     * @return the content, possibly truncated
+ */
+ protected static String truncateContent(ContentTags contentTags, int maxLength, Map<Cols, String> data) {
+ data.put(Cols.CONTENT_TRUNCATED_AT_MAX_LEN, "FALSE");
+ if (contentTags == null) {
+ return "";
+ }
+ String c = contentTags.getContent();
+ if (maxLength > -1 && c.length() > maxLength) {
+ c = c.substring(0, maxLength);
+ data.put(Cols.CONTENT_TRUNCATED_AT_MAX_LEN, "TRUE");
+ }
+ return c;
+
+ }
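+    // Worked example: with maxLength=5, content "abcdefgh" is truncated to "abcde"
+    // and CONTENT_TRUNCATED_AT_MAX_LEN is recorded as "TRUE"; with maxLength=-1,
+    // content is never truncated.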
+
+ protected static ContentTags getContent(EvalFilePaths evalFilePaths, Metadata metadata) {
+ if (metadata == null) {
+ return ContentTags.EMPTY_CONTENT_TAGS;
+ }
+ return parseContentAndTags(evalFilePaths, metadata);
+ }
+
+ void unicodeBlocks(Map<Class, Object> tokenStats, Map<Cols, String> data) {
+
+ Map<String, MutableInt> blocks = (Map<String, MutableInt>) tokenStats.get(UnicodeBlockCounter.class);
+ List<Pair<String, Integer>> pairs = new ArrayList<>();
+ for (Map.Entry<String, MutableInt> e : blocks.entrySet()) {
+ pairs.add(Pair.of(e.getKey(), e.getValue().intValue()));
+ }
+ pairs.sort((o1, o2) -> o2.getValue().compareTo(o1.getValue()));
+ StringBuilder sb = new StringBuilder();
+
+ for (int i = 0; i < 20 && i < pairs.size(); i++) {
+ if (i > 0) {
+ sb.append(" | ");
+ }
+            sb.append(pairs.get(i).getKey()).append(": ").append(pairs.get(i).getValue());
+ }
+ data.put(Cols.UNICODE_CHAR_BLOCKS, sb.toString());
+ }
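+    // The UNICODE_CHAR_BLOCKS value is a pipe-delimited list of the top 20 blocks
+    // by descending count, e.g. (hypothetical counts): "BASIC_LATIN: 1042 | CYRILLIC: 13".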
+
+ void langid(Map<Class, Object> stats, Map<Cols, String> data) {
+ List<LanguageResult> probabilities = (List<LanguageResult>) stats.get(LanguageIDWrapper.class);
+
+ if (probabilities.size() > 0) {
+ data.put(Cols.LANG_ID_1, probabilities.get(0).getLanguage());
+ data.put(Cols.LANG_ID_PROB_1,
+ Double.toString(probabilities.get(0).getRawScore()));
+ }
+ if (probabilities.size() > 1) {
+ data.put(Cols.LANG_ID_2, probabilities.get(1).getLanguage());
+ data.put(Cols.LANG_ID_PROB_2,
+ Double.toString(probabilities.get(1).getRawScore()));
+ }
+ }
+
+ void getFileTypes(Metadata metadata, Map<Cols, String> output) {
+ if (metadata == null) {
+ return;
+ }
+ String type = metadata.get(Metadata.CONTENT_TYPE);
+ if (type == null) {
+ return;
+ }
+ int mimeId = writer.getMimeId(type);
+ output.put(Cols.MIME_ID, Integer.toString(mimeId));
+ }
+
+ void writeTokenCounts(Map<Class, Object> textStats, Map<Cols, String> data) {
+ TokenIntPair[] tokenIntPairs = (TokenIntPair[]) textStats.get(TopNTokens.class);
+ int i = 0;
+ StringBuilder sb = new StringBuilder();
+ for (TokenIntPair t : tokenIntPairs) {
+ if (i++ > 0) {
+ sb.append(" | ");
+ }
+            sb.append(t.getToken()).append(": ").append(t.getValue());
+ }
+
+ data.put(Cols.TOP_N_TOKENS, sb.toString());
+ }
+
+
+ public void closeWriter() throws IOException {
+ writer.close();
+ }
+
+
+ /**
+ * @param metadata
+ * @param extracts
+     * @return the {@link EvalFilePaths} for the file if crawling an extract directory
+ */
+ protected EvalFilePaths getPathsFromExtractCrawl(Metadata metadata,
+ Path extracts) {
+ String relExtractFilePath = metadata.get(FSProperties.FS_REL_PATH);
+ Matcher m = FILE_NAME_CLEANER.matcher(relExtractFilePath);
+ Path relativeSourceFilePath = Paths.get(m.replaceAll(""));
+ //just try slapping the relextractfilepath on the extractdir
+ Path extractFile = extracts.resolve(relExtractFilePath);
+ if (!Files.isRegularFile(extractFile)) {
+ //if that doesn't work, try to find the right extract file.
+ //This is necessary if crawling extractsA and trying to find a file in
+ //extractsB that is not in the same format: json vs txt or compressed
+ extractFile = findFile(extracts, relativeSourceFilePath);
+ }
+ return new EvalFilePaths(relativeSourceFilePath, extractFile);
+ }
+
+ //call this if the crawler is crawling through the src directory
+ protected EvalFilePaths getPathsFromSrcCrawl(Metadata metadata, Path srcDir,
+ Path extracts) {
+ Path relativeSourceFilePath = Paths.get(metadata.get(FSProperties.FS_REL_PATH));
+ Path extractFile = findFile(extracts, relativeSourceFilePath);
+ Path inputFile = srcDir.resolve(relativeSourceFilePath);
+        long srcLen = -1L;
+ //try to get the length of the source file in case there was an error
+ //in both extracts
+ try {
+ srcLen = Files.size(inputFile);
+ } catch (IOException e) {
+ LOG.warn("Couldn't get length for: {}", inputFile.toAbsolutePath());
+ }
+ return new EvalFilePaths(relativeSourceFilePath, extractFile, srcLen);
+ }
+
+ /**
+ * @param extractRootDir
+ * @param relativeSourceFilePath
+     * @return extractFile, or null if one couldn't be found.
+ */
+ private Path findFile(Path extractRootDir, Path relativeSourceFilePath) {
+ String relSrcFilePathString = relativeSourceFilePath.toString();
+ if (lastExtractExtension != null) {
+ Path candidate = extractRootDir.resolve(relSrcFilePathString + lastExtractExtension);
+ if (Files.isRegularFile(candidate)) {
+ return candidate;
+ }
+ }
+ for (String ext : EXTRACT_EXTENSIONS) {
+ for (String compress : COMPRESSION_EXTENSIONS) {
+ Path candidate = extractRootDir.resolve(relSrcFilePathString + ext + compress);
+ if (Files.isRegularFile(candidate)) {
+ lastExtractExtension = ext + compress;
+ return candidate;
+ }
+ }
+ }
+ return null;
+ }
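+    // For a relative source path "subdir/doc1.pdf", the candidates tried are the cross
+    // product of EXTRACT_EXTENSIONS and COMPRESSION_EXTENSIONS: "subdir/doc1.pdf.json",
+    // "subdir/doc1.pdf.json.bz2", ... "subdir/doc1.pdf.zip"; the most recently
+    // successful extension combination is tried first as a shortcut.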
+
+ protected long getSourceFileLength(EvalFilePaths fps, List<Metadata> metadataList) {
+ if (fps.getSourceFileLength() > NON_EXISTENT_FILE_LENGTH) {
+ return fps.getSourceFileLength();
+ }
+ return getSourceFileLength(metadataList);
+ }
+
+ long getSourceFileLength(List<Metadata> metadataList) {
+ if (metadataList == null || metadataList.size() < 1) {
+ return NON_EXISTENT_FILE_LENGTH;
+ }
+ return getSourceFileLength(metadataList.get(0));
+ }
+
+ long getSourceFileLength(Metadata m) {
+ String lenString = m.get(Metadata.CONTENT_LENGTH);
+ if (lenString == null) {
+ return NON_EXISTENT_FILE_LENGTH;
+ }
+ try {
+ return Long.parseLong(lenString);
+ } catch (NumberFormatException e) {
+ //swallow
+ }
+ return NON_EXISTENT_FILE_LENGTH;
+ }
+
+ protected long getFileLength(Path p) {
+ if (p != null && Files.isRegularFile(p)) {
+ try {
+ return Files.size(p);
+ } catch (IOException e) {
+ //swallow
+ }
+ }
+ return NON_EXISTENT_FILE_LENGTH;
+ }
+
+ /**
+ * @param list
+     * @return attachment counts per metadata item; an empty list if the input list is empty or null
+ */
+ static List<Integer> countAttachments(List<Metadata> list) {
+ List<Integer> ret = new ArrayList<>();
+ if (list == null || list.size() == 0) {
+ return ret;
+ }
+ //container document attachment count = list.size()-1
+ ret.add(list.size() - 1);
+
+ Map<String, Integer> counts = new HashMap<>();
+ for (int i = 1; i < list.size(); i++) {
+ String path = list.get(i).get(AbstractRecursiveParserWrapperHandler.EMBEDDED_RESOURCE_PATH);
+ if (path == null) {
+ //shouldn't ever happen
+ continue;
+ }
+ String[] parts = path.split("/");
+ StringBuilder parent = new StringBuilder();
+ for (int end = 1; end < parts.length - 1; end++) {
+ parent.setLength(0);
+ join("/", parent, parts, 1, end);
+ String parentPath = parent.toString();
+ Integer count = counts.get(parentPath);
+ if (count == null) {
+ count = 1;
+ } else {
+ count++;
+ }
+ counts.put(parentPath, count);
+ }
+ }
+
+ for (int i = 1; i < list.size(); i++) {
+ Integer count = counts.get(list.get(i).get(AbstractRecursiveParserWrapperHandler.EMBEDDED_RESOURCE_PATH));
+ if (count == null) {
+ count = 0;
+ }
+ ret.add(i, count);
+ }
+ return ret;
+
+
+ }
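+    // Worked example: for a container with embedded paths "/a", "/a/b" and "/a/c",
+    // this returns [3, 2, 0, 0]: the container holds three attachments in total,
+    // "/a" contains two nested attachments, and "/a/b" and "/a/c" contain none.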
+
+ private static void join(String delimiter, StringBuilder sb, String[] parts, int start, int end) {
+ for (int i = start; i <= end; i++) {
+ sb.append(delimiter);
+ sb.append(parts[i]);
+ }
+ }
+
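+    //Parses the content with the parser implied by the extract's format:
+    //.html extracts go through the html parser; .xhtml extracts (or extracts produced
+    //by a ToXMLContentHandler) go through the xml parser, falling back to the html
+    //parser; on failure the string is treated as plain text with parseException=true.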
+ private static ContentTags parseContentAndTags(EvalFilePaths evalFilePaths, Metadata metadata) {
+ String s = metadata.get(RecursiveParserWrapperHandler.TIKA_CONTENT);
+ if (s == null || s.length() == 0) {
+ return ContentTags.EMPTY_CONTENT_TAGS;
+ }
+
+ String handlerClass = metadata.get(RecursiveParserWrapperHandler.TIKA_CONTENT_HANDLER);
+ if (evalFilePaths.getExtractFile().getFileName().toString().toLowerCase(Locale.ENGLISH).endsWith(".html")) {
+ try {
+ return ContentTagParser.parseHTML(s, UC_TAGS_OF_INTEREST.keySet());
+ } catch (IOException | SAXException e) {
+ LOG.warn("Problem parsing html in {}; backing off to treat string as text",
+ evalFilePaths.getExtractFile().toAbsolutePath().toString(), e);
+
+ return new ContentTags(s, true);
+ }
+ } else if (
+ evalFilePaths.getExtractFile().getFileName().toString().toLowerCase(Locale.ENGLISH).endsWith(".xhtml") ||
+ (handlerClass != null && handlerClass.equals(ToXMLContentHandler.class.getSimpleName()))) {
+ try {
+ return ContentTagParser.parseXML(s, UC_TAGS_OF_INTEREST.keySet());
+ } catch (TikaException | IOException | SAXException e) {
+ LOG.warn("Problem parsing xhtml in {}; backing off to html parser",
+ evalFilePaths.getExtractFile().toAbsolutePath().toString(), e);
+ try {
+ ContentTags contentTags = ContentTagParser.parseHTML(s, UC_TAGS_OF_INTEREST.keySet());
+ contentTags.setParseException(true);
+ return contentTags;
+ } catch (IOException | SAXException e2) {
+ LOG.warn("Problem parsing html in {}; backing off to treat string as text",
+ evalFilePaths.getExtractFile().toAbsolutePath().toString(), e2);
+ }
+ return new ContentTags(s, true);
+ }
+ }
+ return new ContentTags(s);
+ }
+
+
+}
+
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/EvalFilePaths.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/EvalFilePaths.java
new file mode 100644
index 0000000..e05de44
--- /dev/null
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/EvalFilePaths.java
@@ -0,0 +1,118 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.eval.app;
+
+import static org.apache.tika.eval.app.AbstractProfiler.NON_EXISTENT_FILE_LENGTH;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.Objects;
+
+/**
+ * Simple struct to keep track of the relative path of the source file
+ * (the original binary file, e.g. /subdir/document1.doc)
+ * and the extract file (e.g. /subdir/document1.doc.json).
+ */
+class EvalFilePaths {
+
+ private final Path relativeSourceFilePath;
+ private final Path extractFile;
+
+ private long sourceFileLength = NON_EXISTENT_FILE_LENGTH;
+ private long extractFileLength = NON_EXISTENT_FILE_LENGTH;
+
+
+ public EvalFilePaths(Path relativeSourceFilePath, Path extractFile, long srcFileLen) {
+ this(relativeSourceFilePath, extractFile);
+ this.sourceFileLength = srcFileLen;
+ }
+
+ public EvalFilePaths(Path relativeSourceFilePath, Path extractFile) {
+ if (extractFile != null && Files.isRegularFile(extractFile)) {
+ try {
+ extractFileLength = Files.size(extractFile);
+ } catch (IOException e) {
+ //swallow ?
+ }
+ }
+ this.relativeSourceFilePath = relativeSourceFilePath;
+ this.extractFile = extractFile;
+ }
+
+ public Path getRelativeSourceFilePath() {
+ return relativeSourceFilePath;
+ }
+
+ //this path may or may not exist and it could be null!
+ public Path getExtractFile() {
+ return extractFile;
+ }
+
+    //if it doesn't exist, it'll be -1L.
+ public long getSourceFileLength() {
+ return sourceFileLength;
+ }
+
+ public long getExtractFileLength() {
+ return extractFileLength;
+ }
+
+ @Override
+ public boolean equals(Object o) {
+ if (this == o) {
+ return true;
+ }
+ if (o == null || getClass() != o.getClass()) {
+ return false;
+ }
+
+ EvalFilePaths that = (EvalFilePaths) o;
+
+ if (sourceFileLength != that.sourceFileLength) {
+ return false;
+ }
+ if (extractFileLength != that.extractFileLength) {
+ return false;
+ }
+ if (!Objects.equals(relativeSourceFilePath, that.relativeSourceFilePath)) {
+ return false;
+ }
+ return Objects.equals(extractFile, that.extractFile);
+
+ }
+
+ @Override
+ public int hashCode() {
+ int result = relativeSourceFilePath != null ? relativeSourceFilePath.hashCode() : 0;
+ result = 31 * result + (extractFile != null ? extractFile.hashCode() : 0);
+ result = 31 * result + (int) (sourceFileLength ^ (sourceFileLength >>> 32));
+ result = 31 * result + (int) (extractFileLength ^ (extractFileLength >>> 32));
+ return result;
+ }
+
+ @Override
+ public String toString() {
+ return "EvalFilePaths{" +
+ "relativeSourceFilePath=" + relativeSourceFilePath +
+ ", extractFile=" + extractFile +
+ ", sourceFileLength=" + sourceFileLength +
+ ", extractFileLength=" + extractFileLength +
+ '}';
+ }
+}
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractComparer.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractComparer.java
new file mode 100644
index 0000000..4b827f7
--- /dev/null
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractComparer.java
@@ -0,0 +1,538 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval.app;
+
+import java.io.IOException;
+import java.nio.file.Path;
+import java.sql.Types;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.concurrent.ArrayBlockingQueue;
+
+import org.apache.commons.cli.HelpFormatter;
+import org.apache.commons.cli.Option;
+import org.apache.commons.cli.Options;
+import org.apache.commons.io.FilenameUtils;
+import org.apache.tika.batch.FileResource;
+import org.apache.tika.batch.fs.FSProperties;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.eval.app.db.ColInfo;
+import org.apache.tika.eval.app.db.Cols;
+import org.apache.tika.eval.app.db.TableInfo;
+import org.apache.tika.eval.app.io.ExtractReader;
+import org.apache.tika.eval.app.io.ExtractReaderException;
+import org.apache.tika.eval.app.io.IDBWriter;
+import org.apache.tika.eval.core.textstats.BasicTokenCountStatsCalculator;
+import org.apache.tika.eval.core.tokens.ContrastStatistics;
+import org.apache.tika.eval.core.tokens.TokenContraster;
+import org.apache.tika.eval.core.tokens.TokenCounts;
+import org.apache.tika.eval.core.tokens.TokenIntPair;
+import org.apache.tika.eval.core.util.ContentTags;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler;
+
+public class ExtractComparer extends AbstractProfiler {
+
+ static Options OPTIONS;
+ static {
+ Option extractsA = new Option("extractsA", true, "directory for extractsA files");
+ extractsA.setRequired(true);
+
+ Option extractsB = new Option("extractsB", true, "directory for extractsB files");
+ extractsB.setRequired(true);
+
+ Option inputDir = new Option("inputDir", true,
+ "optional: directory of original binary input files if it exists " +
+ "or can be the same as -extractsA or -extractsB. If not specified, -inputDir=-extractsA");
+
+
+ OPTIONS = new Options()
+ .addOption(extractsA)
+ .addOption(extractsB)
+ .addOption(inputDir)
+ .addOption("bc", "optional: tika-batch config file")
+ .addOption("numConsumers", true, "optional: number of consumer threads")
+ .addOption(new Option("alterExtract", true,
+ "for json-formatted extract files, " +
+ "process full metadata list ('as_is'=default), " +
+ "take just the first/container document ('first_only'), " +
+ "concatenate all content into the first metadata item ('concatenate_content')"))
+ .addOption("minExtractLength", true, "minimum extract length to process (in bytes)")
+ .addOption("maxExtractLength", true, "maximum extract length to process (in bytes)")
+ .addOption("db", true, "db file to which to write results")
+ .addOption("jdbc", true, "EXPERT: full jdbc connection string. Must specify this or -db <h2db>")
+ .addOption("jdbcDriver", true, "EXPERT: jdbc driver, or specify via -Djdbc.driver")
+ .addOption("tablePrefixA", true, "EXPERT: optional prefix for table names for A")
+ .addOption("tablePrefixB", true, "EXPERT: optional prefix for table names for B")
+ .addOption("drop", false, "drop tables if they exist")
+ .addOption("maxFilesToAdd", true, "maximum number of files to add to the crawler")
+ .addOption("maxTokens", true, "maximum tokens to process, default=200000")
+ .addOption("maxContentLength", true, "truncate content beyond this length for calculating 'contents' stats, default=1000000")
+ .addOption("maxContentLengthForLangId", true, "truncate content beyond this length for language id, default=50000")
+ .addOption("defaultLangCode", true, "which language to use for common words if no 'common words' file exists for the langid result")
+ ;
+ }
+
+ private static final String DIGEST_KEY_PREFIX = TikaCoreProperties.TIKA_META_PREFIX+
+ "digest"+TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER;
+
+ public static void USAGE() {
+ HelpFormatter helpFormatter = new HelpFormatter();
+ helpFormatter.printHelp(
+ 80,
+ "java -jar tika-eval-x.y.jar Compare -extractsA extractsA -extractsB extractsB -db mydb",
+ "Tool: Compare",
+ ExtractComparer.OPTIONS,
+ "Note: for the default h2 db, do not include the .mv.db at the end of the db name.");
+ }
+
+ private final static String FIELD_A = "fa";
+ private final static String FIELD_B = "fb";
+
+ public static TableInfo REF_PAIR_NAMES = new TableInfo("pair_names",
+ new ColInfo(Cols.DIR_NAME_A, Types.VARCHAR, 128),
+ new ColInfo(Cols.DIR_NAME_B, Types.VARCHAR, 128)
+ );
+
+ public static TableInfo COMPARISON_CONTAINERS = new TableInfo("containers",
+ new ColInfo(Cols.CONTAINER_ID, Types.INTEGER, "PRIMARY KEY"),
+ new ColInfo(Cols.FILE_PATH, Types.VARCHAR, FILE_PATH_MAX_LEN),
+ new ColInfo(Cols.FILE_EXTENSION, Types.VARCHAR, 12),
+ new ColInfo(Cols.LENGTH, Types.BIGINT),
+ new ColInfo(Cols.EXTRACT_FILE_LENGTH_A, Types.BIGINT),
+ new ColInfo(Cols.EXTRACT_FILE_LENGTH_B, Types.BIGINT)
+ );
+
+ public static TableInfo CONTENT_COMPARISONS = new TableInfo("content_comparisons",
+ new ColInfo(Cols.ID, Types.INTEGER, "PRIMARY KEY"),
+ new ColInfo(Cols.TOP_10_UNIQUE_TOKEN_DIFFS_A, Types.VARCHAR, 1024),
+ new ColInfo(Cols.TOP_10_UNIQUE_TOKEN_DIFFS_B, Types.VARCHAR, 1024),
+ new ColInfo(Cols.TOP_10_MORE_IN_A, Types.VARCHAR, 1024),
+ new ColInfo(Cols.TOP_10_MORE_IN_B, Types.VARCHAR, 1024),
+ new ColInfo(Cols.DICE_COEFFICIENT, Types.FLOAT),
+ new ColInfo(Cols.OVERLAP, Types.FLOAT)
+ );
+
+ public static TableInfo PROFILES_A = new TableInfo( "profiles_a",
+ ExtractProfiler.PROFILE_TABLE.getColInfos());
+
+ public static TableInfo PROFILES_B = new TableInfo( "profiles_b",
+ ExtractProfiler.PROFILE_TABLE.getColInfos());
+
+ public static TableInfo EMBEDDED_FILE_PATH_TABLE_A = new TableInfo( "emb_path_a",
+ ExtractProfiler.EMBEDDED_FILE_PATH_TABLE.getColInfos());
+
+ public static TableInfo EMBEDDED_FILE_PATH_TABLE_B = new TableInfo( "emb_path_b",
+ ExtractProfiler.EMBEDDED_FILE_PATH_TABLE.getColInfos());
+
+
+ public static TableInfo CONTENTS_TABLE_A = new TableInfo( "contents_a",
+ ExtractProfiler.CONTENTS_TABLE.getColInfos());
+
+ public static TableInfo CONTENTS_TABLE_B = new TableInfo( "contents_b",
+ ExtractProfiler.CONTENTS_TABLE.getColInfos());
+
+ public static TableInfo TAGS_TABLE_A = new TableInfo( "tags_a",
+ ExtractProfiler.TAGS_TABLE.getColInfos());
+
+ public static TableInfo TAGS_TABLE_B = new TableInfo( "tags_b",
+ ExtractProfiler.TAGS_TABLE.getColInfos());
+
+
+ public static TableInfo EXCEPTION_TABLE_A = new TableInfo ("exceptions_a",
+ ExtractProfiler.EXCEPTION_TABLE.getColInfos());
+
+ public static TableInfo EXCEPTION_TABLE_B = new TableInfo ("exceptions_b",
+ ExtractProfiler.EXCEPTION_TABLE.getColInfos());
+
+ public static TableInfo EXTRACT_EXCEPTION_TABLE_A = new TableInfo("extract_exceptions_a",
+ ExtractProfiler.EXTRACT_EXCEPTION_TABLE.getColInfos());
+ public static TableInfo EXTRACT_EXCEPTION_TABLE_B = new TableInfo("extract_exceptions_b",
+ ExtractProfiler.EXTRACT_EXCEPTION_TABLE.getColInfos());
+
+
+ //need to parameterize?
+ private final TikaConfig config = TikaConfig.getDefaultConfig();
+
+ private final Path inputDir;
+ private final Path extractsA;
+ private final Path extractsB;
+
+ private final TokenContraster tokenContraster = new TokenContraster();
+ private final ExtractReader extractReader;
+
+ public ExtractComparer(ArrayBlockingQueue<FileResource> queue,
+ Path inputDir, Path extractsA, Path extractsB, ExtractReader extractReader,
+ IDBWriter writer) {
+ super(queue, writer);
+ this.inputDir = inputDir;
+ this.extractsA = extractsA;
+ this.extractsB = extractsB;
+ this.extractReader = extractReader;
+ }
+
+ @Override
+ public boolean processFileResource(FileResource fileResource) {
+ Metadata metadata = fileResource.getMetadata();
+ EvalFilePaths fpsA = null;
+ EvalFilePaths fpsB = null;
+
+ if (inputDir != null && (inputDir.equals(extractsA) ||
+ inputDir.equals(extractsB))) {
+ //crawling an extract dir
+ fpsA = getPathsFromExtractCrawl(metadata, extractsA);
+ fpsB = getPathsFromExtractCrawl(metadata, extractsB);
+
+ } else {
+ fpsA = getPathsFromSrcCrawl(metadata, inputDir, extractsA);
+ fpsB = getPathsFromSrcCrawl(metadata, inputDir, extractsB);
+ }
+
+ try {
+ compareFiles(fpsA, fpsB);
+ } catch (Throwable e) {
+ //this should be cataclysmic...
+ throw new RuntimeException("Exception while working on: " +
+ metadata.get(FSProperties.FS_REL_PATH), e);
+ }
+ return true;
+ }
+
+ //protected for testing, should find better way so that this can be private!
+ protected void compareFiles(EvalFilePaths fpsA,
+ EvalFilePaths fpsB) throws IOException {
+
+ ExtractReaderException.TYPE extractExceptionA = null;
+ ExtractReaderException.TYPE extractExceptionB = null;
+
+        List<Metadata> metadataListA = null;
+        try {
+            metadataListA = extractReader.loadExtract(fpsA.getExtractFile());
+        } catch (ExtractReaderException e) {
+            extractExceptionA = e.getType();
+        }
+
+ List<Metadata> metadataListB = null;
+ try {
+ metadataListB = extractReader.loadExtract(fpsB.getExtractFile());
+ } catch (ExtractReaderException e) {
+ extractExceptionB = e.getType();
+ }
+
+ //array indices for those metadata items handled in B
+ Set<Integer> handledB = new HashSet<>();
+ String containerID = Integer.toString(ID.getAndIncrement());
+ //container table
+ Map<Cols, String> contData = new HashMap<>();
+ contData.put(Cols.CONTAINER_ID, containerID);
+ contData.put(Cols.FILE_PATH, fpsA.getRelativeSourceFilePath().toString());
+ long srcFileLength = getSourceFileLength(metadataListA, metadataListB);
+ contData.put(Cols.LENGTH,
+ srcFileLength > NON_EXISTENT_FILE_LENGTH ?
+ Long.toString(srcFileLength) : "");
+ contData.put(Cols.FILE_EXTENSION,
+ FilenameUtils.getExtension(fpsA.getRelativeSourceFilePath().getFileName().toString()));
+
+ long extractFileLengthA = getFileLength(fpsA.getExtractFile());
+ contData.put(Cols.EXTRACT_FILE_LENGTH_A, extractFileLengthA > NON_EXISTENT_FILE_LENGTH ?
+ Long.toString(extractFileLengthA) : "");
+
+ long extractFileLengthB = getFileLength(fpsB.getExtractFile());
+ contData.put(Cols.EXTRACT_FILE_LENGTH_B, extractFileLengthB > NON_EXISTENT_FILE_LENGTH ?
+ Long.toString(extractFileLengthB) : "");
+
+ writer.writeRow(COMPARISON_CONTAINERS, contData);
+
+ if (extractExceptionA != null) {
+ writeExtractException(EXTRACT_EXCEPTION_TABLE_A, containerID, fpsA.getRelativeSourceFilePath().toString(),
+ extractExceptionA);
+ }
+ if (extractExceptionB != null) {
+ writeExtractException(EXTRACT_EXCEPTION_TABLE_B, containerID, fpsB.getRelativeSourceFilePath().toString(),
+ extractExceptionB);
+ }
+
+ if (metadataListA == null && metadataListB == null) {
+ return;
+ }
+ List<Integer> numAttachmentsA = countAttachments(metadataListA);
+ List<Integer> numAttachmentsB = countAttachments(metadataListB);
+
+ String sharedDigestKey = findSharedDigestKey(metadataListA, metadataListB);
+ Map<Class, Object> tokenStatsA = null;
+ Map<Class, Object> tokenStatsB = null;
+ //now get that metadata
+ if (metadataListA != null) {
+ for (int i = 0; i < metadataListA.size(); i++) {
+ //the first file should have the same id as the container id
+ String fileId = (i == 0) ? containerID : Integer.toString(ID.getAndIncrement());
+ Metadata metadataA = metadataListA.get(i);
+ ContentTags contentTagsA = getContent(fpsA, metadataA);
+ ContentTags contentTagsB = ContentTags.EMPTY_CONTENT_TAGS;
+ Metadata metadataB = null;
+
+ //TODO: shouldn't be fileA!!!!
+ writeTagData(fileId, contentTagsA, TAGS_TABLE_A);
+
+ writeProfileData(fpsA, i, contentTagsA, metadataA, fileId, containerID, numAttachmentsA, PROFILES_A);
+ writeExceptionData(fileId, metadataA, EXCEPTION_TABLE_A);
+ int matchIndex = getMatch(i, sharedDigestKey,
+ handledB, metadataListA, metadataListB);
+
+ if (matchIndex > -1 && ! handledB.contains(matchIndex)) {
+ metadataB = metadataListB.get(matchIndex);
+ handledB.add(matchIndex);
+ }
+ if (metadataB != null) {
+ contentTagsB = getContent(fpsB, metadataB);
+ writeTagData(fileId, contentTagsB, TAGS_TABLE_B);
+ writeProfileData(fpsB, i, contentTagsB, metadataB, fileId, containerID, numAttachmentsB, PROFILES_B);
+ writeExceptionData(fileId, metadataB, EXCEPTION_TABLE_B);
+ }
+ writeEmbeddedFilePathData(i, fileId, metadataA, metadataB);
+ //write content
+ try {
+ tokenStatsA = calcTextStats(contentTagsA);
+ writeContentData(fileId, tokenStatsA, CONTENTS_TABLE_A);
+ tokenStatsB = calcTextStats(contentTagsB);
+ if (metadataB != null) {
+ writeContentData(fileId, tokenStatsB, CONTENTS_TABLE_B);
+ }
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+ if (metadataB != null) {
+ TokenCounts tokenCountsA = (TokenCounts) tokenStatsA.get(BasicTokenCountStatsCalculator.class);
+ TokenCounts tokenCountsB = (TokenCounts) tokenStatsB.get(BasicTokenCountStatsCalculator.class);
+ //arbitrary decision...only run the comparisons if there are > 10 tokens total
+ //We may want to bump that value a bit higher?
+ //now run comparisons
+ if (tokenCountsA.getTotalTokens()
+ + tokenCountsB.getTotalTokens() > 10) {
+ Map<Cols, String> data = new HashMap<>();
+ data.put(Cols.ID, fileId);
+
+ ContrastStatistics contrastStatistics =
+ tokenContraster.calculateContrastStatistics(
+ tokenCountsA,
+ tokenCountsB);
+
+ writeContrasts(data, contrastStatistics);
+ writer.writeRow(CONTENT_COMPARISONS, data);
+ }
+ }
+ }
+ }
+ //now try to get any Metadata objects in B
+ //that haven't yet been handled.
+ if (metadataListB != null) {
+ for (int i = 0; i < metadataListB.size(); i++) {
+ if (handledB.contains(i)) {
+ continue;
+ }
+ Metadata metadataB = metadataListB.get(i);
+ ContentTags contentTagsB = getContent(fpsB, metadataB);
+ //the first file should have the same id as the container id
+ String fileId = (i == 0) ? containerID : Integer.toString(ID.getAndIncrement());
+ writeTagData(fileId, contentTagsB, TAGS_TABLE_B);
+ writeProfileData(fpsB, i, contentTagsB, metadataB, fileId, containerID, numAttachmentsB, PROFILES_B);
+ writeEmbeddedFilePathData(i, fileId, null, metadataB);
+ writeExceptionData(fileId, metadataB, EXCEPTION_TABLE_B);
+
+ //write content
+ try {
+ tokenStatsB = calcTextStats(contentTagsB);
+ writeContentData(fileId, tokenStatsB, CONTENTS_TABLE_B);
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+ }
+ }
+ }
+
+ /**
+ * Checks only the first (container) item in each list and returns the
+ * first metadata name beginning with DIGEST_KEY_PREFIX that appears
+ * in both; returns null if there is none.
+ * @param metadataListA metadata list from extract A; may be null
+ * @param metadataListB metadata list from extract B; may be null
+ * @return the shared digest key, or null if none exists
+ */
+ private String findSharedDigestKey(List<Metadata> metadataListA, List<Metadata> metadataListB) {
+ if (metadataListB == null || metadataListB.size() == 0) {
+ return null;
+ }
+ Set<String> digestA = new HashSet<>();
+ if (metadataListA != null) {
+ for (String n : metadataListA.get(0).names()) {
+ if (n.startsWith(DIGEST_KEY_PREFIX)) {
+ digestA.add(n);
+ }
+ }
+ }
+ Metadata bMain = metadataListB.get(0);
+ for (String n : bMain.names()) {
+ if (digestA.contains(n)) {
+ return n;
+ }
+ }
+ return null;
+ }
+
+ private void writeEmbeddedFilePathData(int i, String fileId, Metadata mA, Metadata mB) {
+ //container file, don't write anything
+ if (i == 0) {
+ return;
+ }
+ String pathA = null;
+ String pathB = null;
+ if (mA != null) {
+ pathA = mA.get(AbstractRecursiveParserWrapperHandler.EMBEDDED_RESOURCE_PATH);
+ }
+ if (mB != null) {
+ pathB = mB.get(AbstractRecursiveParserWrapperHandler.EMBEDDED_RESOURCE_PATH);
+ }
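+ //write A's path if it exists; write B's only when it differs from A's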
+ if (pathA != null) {
+ Map<Cols, String> d = new HashMap<>();
+ d.put(Cols.ID, fileId);
+ d.put(Cols.EMBEDDED_FILE_PATH, pathA);
+ try {
+ writer.writeRow(EMBEDDED_FILE_PATH_TABLE_A, d);
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+ }
+ if (pathB != null &&
+ (pathA == null || ! pathA.equals(pathB))) {
+ Map<Cols, String> d = new HashMap<>();
+ d.put(Cols.ID, fileId);
+ d.put(Cols.EMBEDDED_FILE_PATH, pathB);
+ try {
+ writer.writeRow(EMBEDDED_FILE_PATH_TABLE_B, d);
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+ }
+ }
+
+ private long getSourceFileLength(List<Metadata> metadataListA, List<Metadata> metadataListB) {
+ long len = getSourceFileLength(metadataListA);
+ if (len > NON_EXISTENT_FILE_LENGTH) {
+ return len;
+ }
+ return getSourceFileLength(metadataListB);
+ }
+
+
+ /**
+ * Tries to find the item in metadataListB that matches
+ * metadataListA.get(aIndex): first by a shared digest (when a shared
+ * digest key exists), then by the
+ * AbstractRecursiveParserWrapperHandler.EMBEDDED_RESOURCE_PATH, and
+ * finally, if the two lists are the same size, by assuming the same index.
+ *
+ * @param aIndex index of the item in metadataListA to match
+ * @param sharedDigestKey digest key shared by both lists, or null if none
+ * @param handledB indexes in metadataListB that have already been matched
+ * @param metadataListA metadata list from extract A
+ * @param metadataListB metadata list from extract B; may be null
+ * @return the matching index in metadataListB, or -1 if no match is found
+ */
+ private int getMatch(int aIndex, String sharedDigestKey, Set<Integer> handledB,
+ List<Metadata> metadataListA,
+ List<Metadata> metadataListB) {
+ //TODO: could make this more robust
+ if (metadataListB == null || metadataListB.size() == 0) {
+ return -1;
+ }
+ //assume first is always the container file
+ if (aIndex == 0) {
+ return 0;
+ }
+
+ if (sharedDigestKey != null) {
+ //first try to find matching digests
+ //this does not elegantly handle multiple matching digests
+ return findMatchingDigests(sharedDigestKey, handledB,
+ metadataListA.get(aIndex), metadataListB);
+ }
+
+ //assume same embedded resource path. Not always true!
+ Metadata thisMetadata = metadataListA.get(aIndex);
+ String embeddedPath = thisMetadata.get(AbstractRecursiveParserWrapperHandler.EMBEDDED_RESOURCE_PATH);
+ if (embeddedPath != null) {
+ for (int j = 0; j < metadataListB.size(); j++) {
+ String thatEmbeddedPath = metadataListB.get(j).get(
+ AbstractRecursiveParserWrapperHandler.EMBEDDED_RESOURCE_PATH);
+ if (embeddedPath.equals(thatEmbeddedPath)) {
+ return j;
+ }
+ }
+ }
+
+ //last resort: if the lists are the same size, assume no
+ //rearrangements and guess the same index
+ if (metadataListA.size() == metadataListB.size()) {
+ return aIndex;
+ }
+ return -1;
+ }
+
+ private int findMatchingDigests(String sharedDigestKey,
+ Set<Integer> handledB,
+ Metadata metadata, List<Metadata> metadataListB) {
+ String digestA = metadata.get(sharedDigestKey);
+ if (digestA == null) {
+ return -1;
+ }
+
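+ //linear scan: the first not-yet-matched item in B with a
+ //case-insensitively equal digest wins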
+ for (int i = 0; i < metadataListB.size(); i++) {
+ if (handledB.contains(i)) {
+ continue;
+ }
+ Metadata mB = metadataListB.get(i);
+ String digestB = mB.get(sharedDigestKey);
+ if (digestA.equalsIgnoreCase(digestB)) {
+ return i;
+ }
+ }
+ return -1;
+ }
+
+ private void writeContrasts(Map<Cols, String> data, ContrastStatistics contrastStatistics) {
+ writeContrastString(data, Cols.TOP_10_MORE_IN_A, contrastStatistics.getTopNMoreA());
+ writeContrastString(data, Cols.TOP_10_MORE_IN_B, contrastStatistics.getTopNMoreB());
+ writeContrastString(data, Cols.TOP_10_UNIQUE_TOKEN_DIFFS_A, contrastStatistics.getTopNUniqueA());
+ writeContrastString(data, Cols.TOP_10_UNIQUE_TOKEN_DIFFS_B, contrastStatistics.getTopNUniqueB());
+ data.put(Cols.OVERLAP, Double.toString(contrastStatistics.getOverlap()));
+ data.put(Cols.DICE_COEFFICIENT, Double.toString(contrastStatistics.getDiceCoefficient()));
+
+ }
+
+ private void writeContrastString(Map<Cols, String> data, Cols col, TokenIntPair[] tokenIntPairs) {
+
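+ //renders the pairs as, e.g., "foo: 12 | bar: 7"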
+ int i = 0;
+ StringBuilder sb = new StringBuilder();
+ for (TokenIntPair p : tokenIntPairs) {
+ if (i++ > 0) {
+ sb.append(" | ");
+ }
+ sb.append(p.getToken()).append(": ").append(p.getValue());
+ }
+ data.put(col, sb.toString());
+ }
+}
\ No newline at end of file
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractProfiler.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractProfiler.java
new file mode 100644
index 0000000..029162e
--- /dev/null
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractProfiler.java
@@ -0,0 +1,281 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval.app;
+
+import java.io.IOException;
+import java.nio.file.Path;
+import java.sql.Types;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.concurrent.ArrayBlockingQueue;
+
+import org.apache.commons.cli.HelpFormatter;
+import org.apache.commons.cli.Option;
+import org.apache.commons.cli.Options;
+import org.apache.tika.batch.FileResource;
+import org.apache.tika.eval.app.db.ColInfo;
+import org.apache.tika.eval.app.db.Cols;
+import org.apache.tika.eval.app.db.TableInfo;
+import org.apache.tika.eval.app.io.ExtractReader;
+import org.apache.tika.eval.app.io.ExtractReaderException;
+import org.apache.tika.eval.app.io.IDBWriter;
+import org.apache.tika.eval.core.util.ContentTags;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler;
+
+public class ExtractProfiler extends AbstractProfiler {
+
+ static Options OPTIONS;
+ static {
+ //By the time this commandline is parsed, there should be both an extracts and an inputDir
+ Option extracts = new Option("extracts", true, "directory for extract files");
+ extracts.setRequired(true);
+
+ Option inputDir = new Option("inputDir", true,
+ "optional: directory for original binary input documents."+
+ " If not specified, -extracts is crawled as is.");
+
+ OPTIONS = new Options()
+ .addOption(extracts)
+ .addOption(inputDir)
+ .addOption("bc", "optional: tika-batch config file")
+ .addOption("numConsumers", true, "optional: number of consumer threads")
+ .addOption(new Option("alterExtract", true,
+ "for json-formatted extract files, " +
+ "process full metadata list ('as_is'=default), " +
+ "take just the first/container document ('first_only'), " +
+ "concatenate all content into the first metadata item ('concatenate_content')"))
+ .addOption("minExtractLength", true, "minimum extract length to process (in bytes)")
+ .addOption("maxExtractLength", true, "maximum extract length to process (in bytes)")
+ .addOption("db", true, "db file to which to write results")
+ .addOption("jdbc", true, "EXPERT: full jdbc connection string. Must specify this or -db <h2db>")
+ .addOption("jdbcDriver", true, "EXPERT: jdbc driver, or specify via -Djdbc.driver")
+ .addOption("tablePrefix", true, "EXPERT: optional prefix for table names")
+ .addOption("drop", false, "drop tables if they exist")
+ .addOption("maxFilesToAdd", true, "maximum number of files to add to the crawler")
+ .addOption("maxTokens", true, "maximum tokens to process, default=200000")
+ .addOption("maxContentLength", true, "truncate content beyond this length for calculating 'contents' stats, default=1000000")
+ .addOption("maxContentLengthForLangId", true, "truncate content beyond this length for language id, default=50000")
+ .addOption("defaultLangCode", true, "which language to use for common words if no 'common words' file exists for the langid result")
+
+ ;
+
+ }
+
+ public static void USAGE() {
+ HelpFormatter helpFormatter = new HelpFormatter();
+ helpFormatter.printHelp(
+ 80,
+ "java -jar tika-eval-x.y.jar Profile -extracts extracts -db mydb [-inputDir input]",
+ "Tool: Profile",
+ ExtractProfiler.OPTIONS,
+ "Note: for the default h2 db, do not include the .mv.db at the end of the db name.");
+ }
+
+ private final static String FIELD = "f";
+
+ public static TableInfo EXTRACT_EXCEPTION_TABLE = new TableInfo("extract_exceptions",
+ new ColInfo(Cols.CONTAINER_ID, Types.INTEGER),
+ new ColInfo(Cols.FILE_PATH, Types.VARCHAR, FILE_PATH_MAX_LEN),
+ new ColInfo(Cols.EXTRACT_EXCEPTION_ID, Types.INTEGER),
+ new ColInfo(Cols.PARSE_ERROR_ID, Types.INTEGER)
+ );
+
+ public static TableInfo EXCEPTION_TABLE = new TableInfo("parse_exceptions",
+ new ColInfo(Cols.ID, Types.INTEGER, "PRIMARY KEY"),
+ new ColInfo(Cols.ORIG_STACK_TRACE, Types.VARCHAR, 8192),
+ new ColInfo(Cols.SORT_STACK_TRACE, Types.VARCHAR, 8192),
+ new ColInfo(Cols.PARSE_EXCEPTION_ID, Types.INTEGER)
+ );
+
+
+ public static TableInfo CONTAINER_TABLE = new TableInfo("containers",
+ new ColInfo(Cols.CONTAINER_ID, Types.INTEGER, "PRIMARY KEY"),
+ new ColInfo(Cols.FILE_PATH, Types.VARCHAR, FILE_PATH_MAX_LEN),
+ new ColInfo(Cols.LENGTH, Types.BIGINT),
+ new ColInfo(Cols.EXTRACT_FILE_LENGTH, Types.BIGINT)
+ );
+
+ public static TableInfo PROFILE_TABLE = new TableInfo("profiles",
+ new ColInfo(Cols.ID, Types.INTEGER, "PRIMARY KEY"),
+ new ColInfo(Cols.CONTAINER_ID, Types.INTEGER),
+ new ColInfo(Cols.FILE_NAME, Types.VARCHAR, 256),
+ new ColInfo(Cols.MD5, Types.CHAR, 32),
+ new ColInfo(Cols.LENGTH, Types.BIGINT),
+ new ColInfo(Cols.IS_EMBEDDED, Types.BOOLEAN),
+ new ColInfo(Cols.FILE_EXTENSION, Types.VARCHAR, 12),
+ new ColInfo(Cols.MIME_ID, Types.INTEGER),
+ new ColInfo(Cols.ELAPSED_TIME_MILLIS, Types.INTEGER),
+ new ColInfo(Cols.NUM_ATTACHMENTS, Types.INTEGER),
+ new ColInfo(Cols.NUM_METADATA_VALUES, Types.INTEGER),
+ new ColInfo(Cols.NUM_PAGES, Types.INTEGER),
+ new ColInfo(Cols.HAS_CONTENT, Types.BOOLEAN)
+ );
+
+ public static TableInfo EMBEDDED_FILE_PATH_TABLE = new TableInfo("emb_file_names",
+ new ColInfo(Cols.ID, Types.INTEGER, "PRIMARY KEY"),
+ new ColInfo(Cols.EMBEDDED_FILE_PATH, Types.VARCHAR, 1024)
+ );
+
+ public static TableInfo CONTENTS_TABLE = new TableInfo("contents",
+ new ColInfo(Cols.ID, Types.INTEGER, "PRIMARY KEY"),
+ new ColInfo(Cols.CONTENT_LENGTH, Types.INTEGER),
+ new ColInfo(Cols.NUM_UNIQUE_TOKENS, Types.INTEGER),
+ new ColInfo(Cols.NUM_TOKENS, Types.INTEGER),
+ new ColInfo(Cols.COMMON_TOKENS_LANG, Types.VARCHAR, 12),
+ new ColInfo(Cols.NUM_UNIQUE_COMMON_TOKENS, Types.INTEGER),
+ new ColInfo(Cols.NUM_COMMON_TOKENS, Types.INTEGER),
+ new ColInfo(Cols.NUM_UNIQUE_ALPHABETIC_TOKENS, Types.INTEGER),
+ new ColInfo(Cols.NUM_ALPHABETIC_TOKENS, Types.INTEGER),
+ new ColInfo(Cols.TOP_N_TOKENS, Types.VARCHAR, 1024),
+ new ColInfo(Cols.LANG_ID_1, Types.VARCHAR, 12),
+ new ColInfo(Cols.LANG_ID_PROB_1, Types.FLOAT),
+ new ColInfo(Cols.LANG_ID_2, Types.VARCHAR, 12),
+ new ColInfo(Cols.LANG_ID_PROB_2, Types.FLOAT),
+ new ColInfo(Cols.UNICODE_CHAR_BLOCKS, Types.VARCHAR, 1024),
+ new ColInfo(Cols.TOKEN_ENTROPY_RATE, Types.FLOAT),
+ new ColInfo(Cols.TOKEN_LENGTH_SUM, Types.INTEGER),
+ new ColInfo(Cols.TOKEN_LENGTH_MEAN, Types.FLOAT),
+ new ColInfo(Cols.TOKEN_LENGTH_STD_DEV, Types.FLOAT),
+ new ColInfo(Cols.CONTENT_TRUNCATED_AT_MAX_LEN, Types.BOOLEAN)
+ );
+
+ public static TableInfo TAGS_TABLE = new TableInfo("tags",
+ new ColInfo(Cols.ID, Types.INTEGER, "PRIMARY KEY"),
+ new ColInfo(Cols.TAGS_A, Types.INTEGER),
+ new ColInfo(Cols.TAGS_B, Types.INTEGER),
+ new ColInfo(Cols.TAGS_DIV, Types.INTEGER),
+ new ColInfo(Cols.TAGS_I, Types.INTEGER),
+ new ColInfo(Cols.TAGS_IMG, Types.INTEGER),
+ new ColInfo(Cols.TAGS_LI, Types.INTEGER),
+ new ColInfo(Cols.TAGS_OL, Types.INTEGER),
+ new ColInfo(Cols.TAGS_P, Types.INTEGER),
+ new ColInfo(Cols.TAGS_TABLE, Types.INTEGER),
+ new ColInfo(Cols.TAGS_TD, Types.INTEGER),
+ new ColInfo(Cols.TAGS_TITLE, Types.INTEGER),
+ new ColInfo(Cols.TAGS_TR, Types.INTEGER),
+ new ColInfo(Cols.TAGS_U, Types.INTEGER),
+ new ColInfo(Cols.TAGS_UL, Types.INTEGER),
+ new ColInfo(Cols.TAGS_PARSE_EXCEPTION, Types.BOOLEAN)
+ );
+
+ private final Path inputDir;
+ private final Path extracts;
+ private final ExtractReader extractReader;
+
+ public ExtractProfiler(ArrayBlockingQueue<FileResource> queue,
+ Path inputDir, Path extracts,
+ ExtractReader extractReader, IDBWriter dbWriter) {
+ super(queue, dbWriter);
+ this.inputDir = inputDir;
+ this.extracts = extracts;
+ this.extractReader = extractReader;
+ }
+
+ @Override
+ public boolean processFileResource(FileResource fileResource) {
+ Metadata metadata = fileResource.getMetadata();
+ EvalFilePaths fps = null;
+
+ if (inputDir != null && inputDir.equals(extracts)) {
+ //crawling an extract dir
+ fps = getPathsFromExtractCrawl(metadata, extracts);
+ } else {
+ fps = getPathsFromSrcCrawl(metadata, inputDir, extracts);
+ }
+ int containerId = ID.incrementAndGet();
+ String containerIdString = Integer.toString(containerId);
+
+ ExtractReaderException.TYPE extractExceptionType = null;
+
+ List<Metadata> metadataList = null;
+ try {
+ metadataList = extractReader.loadExtract(fps.getExtractFile());
+ } catch (ExtractReaderException e) {
+ extractExceptionType = e.getType();
+ }
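+ //if the extract could not be read, the type of failure is recorded
+ //below in the extract_exceptions table and this container is skipped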
+
+ Map<Cols, String> contOutput = new HashMap<>();
+ Long srcFileLen = getSourceFileLength(fps, metadataList);
+ contOutput.put(Cols.LENGTH,
+ srcFileLen > NON_EXISTENT_FILE_LENGTH ?
+ Long.toString(srcFileLen): "");
+ contOutput.put(Cols.CONTAINER_ID, containerIdString);
+ contOutput.put(Cols.FILE_PATH, fps.getRelativeSourceFilePath().toString());
+
+ if (fps.getExtractFileLength() > 0) {
+ contOutput.put(Cols.EXTRACT_FILE_LENGTH,
+ (fps.getExtractFile() == null) ?
+ "" :
+ Long.toString(fps.getExtractFileLength()));
+ }
+ try {
+ writer.writeRow(CONTAINER_TABLE, contOutput);
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+
+
+ if (extractExceptionType != null) {
+ try {
+ writeExtractException(EXTRACT_EXCEPTION_TABLE, containerIdString,
+ fps.getRelativeSourceFilePath().toString(), extractExceptionType);
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+ return true;
+ }
+
+ List<Integer> numAttachments = countAttachments(metadataList);
+ int i = 0;
+ for (Metadata m : metadataList) {
+ ContentTags contentTags = getContent(fps, m);
+ //the first file should have the same id as the container id
+ String fileId = (i == 0) ? containerIdString : Integer.toString(ID.incrementAndGet());
+ writeTagData(fileId, contentTags, TAGS_TABLE);
+ writeProfileData(fps, i, contentTags, m, fileId, containerIdString, numAttachments, PROFILE_TABLE);
+ writeEmbeddedPathData(i, fileId, m, EMBEDDED_FILE_PATH_TABLE);
+ writeExceptionData(fileId, m, EXCEPTION_TABLE);
+ try {
+ Map<Class, Object> textStats = calcTextStats(contentTags);
+ writeContentData(fileId, textStats, CONTENTS_TABLE);
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+ i++;
+ }
+ return true;
+ }
+
+
+ private void writeEmbeddedPathData(int i, String fileId, Metadata m,
+ TableInfo embeddedFilePathTable) {
+ if (i == 0) {
+ return;
+ }
+ Map<Cols, String> data = new HashMap<>();
+ data.put(Cols.ID, fileId);
+ data.put(Cols.EMBEDDED_FILE_PATH,
+ m.get(AbstractRecursiveParserWrapperHandler.EMBEDDED_RESOURCE_PATH));
+ try {
+ writer.writeRow(embeddedFilePathTable, data);
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+ }
+}
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/FileProfiler.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/FileProfiler.java
new file mode 100644
index 0000000..c9e367e
--- /dev/null
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/FileProfiler.java
@@ -0,0 +1,195 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval.app;
+
+import org.apache.commons.cli.HelpFormatter;
+import org.apache.commons.cli.Option;
+import org.apache.commons.cli.Options;
+import org.apache.commons.codec.digest.DigestUtils;
+import org.apache.commons.io.FilenameUtils;
+import org.apache.tika.Tika;
+import org.apache.tika.batch.FileResource;
+import org.apache.tika.batch.fs.FSProperties;
+import org.apache.tika.detect.FileCommandDetector;
+import org.apache.tika.eval.app.db.ColInfo;
+import org.apache.tika.eval.app.db.Cols;
+import org.apache.tika.eval.app.db.TableInfo;
+import org.apache.tika.eval.app.io.IDBWriter;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.sql.Types;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.concurrent.ArrayBlockingQueue;
+
+/**
+ * This class profiles actual files, as opposed to extracts (cf. {@link ExtractProfiler}).
+ * This does _not_ parse files, but does run file type identification and digests the
+ * raw bytes.
+ *
+ * If the 'file' command is available on the command line, this will also run the
+ * FileCommandDetector.
+ */
+
+public class FileProfiler extends AbstractProfiler {
+//TODO: we should allow users to select digest type/encoding and file detector(s).
+
+ private static final boolean HAS_FILE = FileCommandDetector.checkHasFile();
+ private static final Logger LOG = LoggerFactory.getLogger(FileProfiler.class);
+
+ static Options OPTIONS;
+ static {
+
+ Option inputDir = new Option("inputDir", true,
+ "directory for original binary input documents");
+
+ OPTIONS = new Options()
+ .addOption(inputDir)
+ .addOption("bc", "optional: tika-batch config file")
+ .addOption("numConsumers", true, "optional: number of consumer threads")
+ .addOption("db", true, "db file to which to write results")
+ .addOption("jdbc", true, "EXPERT: full jdbc connection string. Must specify this or -db <h2db>")
+ .addOption("jdbcDriver", true, "EXPERT: jdbc driver, or specify via -Djdbc.driver")
+ .addOption("tablePrefix", true, "EXPERT: optional prefix for table names")
+ .addOption("drop", false, "drop tables if they exist")
+ .addOption("maxFilesToAdd", true, "maximum number of files to add to the crawler")
+
+ ;
+
+ }
+
+ public static void USAGE() {
+ HelpFormatter helpFormatter = new HelpFormatter();
+ helpFormatter.printHelp(
+ 80,
+ "java -jar tika-eval-x.y.jar FileProfiler -inputDir docs -db mydb [-inputDir input]",
+ "Tool: Profile",
+ FileProfiler.OPTIONS,
+ "Note: for the default h2 db, do not include the .mv.db at the end of the db name.");
+ }
+
+
+
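+ //the schema depends on whether the 'file' command was found at class
+ //load time: if so, an extra FILE_MIME_ID column records file(1)'s
+ //detection so it can be compared with Tika's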
+ public static TableInfo FILE_PROFILES = HAS_FILE ?
+ new TableInfo("file_profiles",
+ new ColInfo(Cols.FILE_PATH, Types.VARCHAR, 2048, "PRIMARY KEY"),
+ new ColInfo(Cols.FILE_NAME, Types.VARCHAR, 2048),
+ new ColInfo(Cols.FILE_EXTENSION, Types.VARCHAR, 24),
+ new ColInfo(Cols.LENGTH, Types.BIGINT),
+ new ColInfo(Cols.SHA256, Types.VARCHAR, 64),
+ new ColInfo(Cols.TIKA_MIME_ID, Types.INTEGER),
+ new ColInfo(Cols.FILE_MIME_ID, Types.INTEGER))
+ :
+ new TableInfo("file_profiles",
+ new ColInfo(Cols.FILE_PATH, Types.VARCHAR, 2048, "PRIMARY KEY"),
+ new ColInfo(Cols.FILE_NAME, Types.VARCHAR, 2048),
+ new ColInfo(Cols.FILE_EXTENSION, Types.VARCHAR, 24),
+ new ColInfo(Cols.LENGTH, Types.BIGINT),
+ new ColInfo(Cols.SHA256, Types.VARCHAR, 64),
+ new ColInfo(Cols.TIKA_MIME_ID, Types.INTEGER));
+
+
+ public static TableInfo FILE_MIME_TABLE = new TableInfo("file_mimes",
+ new ColInfo(Cols.MIME_ID, Types.INTEGER, "PRIMARY KEY"),
+ new ColInfo(Cols.MIME_STRING, Types.VARCHAR, 256),
+ new ColInfo(Cols.FILE_EXTENSION, Types.VARCHAR, 12)
+ );
+
+ public static final String DETECT_EXCEPTION = "detect-exception";
+ private static final Tika TIKA = new Tika();
+
+ private static final FileCommandDetector FILE_COMMAND_DETECTOR = new FileCommandDetector();
+ private final Path inputDir;
+
+ public FileProfiler(ArrayBlockingQueue<FileResource> fileQueue, Path inputDir, IDBWriter dbWriter) {
+ super(fileQueue, dbWriter);
+ this.inputDir = inputDir;
+ }
+
+
+ @Override
+ public boolean processFileResource(FileResource fileResource) {
+ String relPath = fileResource.getMetadata().get(FSProperties.FS_REL_PATH);
+ try (InputStream is = fileResource.openInputStream()) {
+ try (TikaInputStream tis = TikaInputStream.get(is)) {
+ Path path = tis.getPath();
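+ //getPath() spools the stream to a file if it isn't file-backed already,
+ //so Files.size() and the external 'file' command can read it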
+ Map<Cols, String> data = new HashMap<>();
+ int tikaMimeId = writer.getMimeId(detectTika(tis));
+ String fileName = "";
+ String extension = "";
+ long length = -1;
+ try {
+ fileName = FilenameUtils.getName(relPath);
+ } catch (IllegalArgumentException e) {
+ LOG.warn("bad file name: "+relPath, e);
+ }
+
+ try {
+ extension = FilenameUtils.getExtension(relPath);
+ } catch (IllegalArgumentException e) {
+ LOG.warn("bad extension: "+relPath, e);
+ }
+
+ try {
+ length = Files.size(path);
+ } catch (IOException e) {
+ LOG.warn("problem getting size: "+relPath, e);
+ }
+
+ data.put(Cols.FILE_PATH, relPath);
+ data.put(Cols.FILE_NAME, fileName);
+ data.put(Cols.FILE_EXTENSION, extension);
+ data.put(Cols.LENGTH, Long.toString(length));
+ data.put(Cols.TIKA_MIME_ID, Integer.toString(tikaMimeId));
+ data.put(Cols.SHA256, DigestUtils.sha256Hex(tis));
+ if (HAS_FILE) {
+ int fileMimeId = writer.getMimeId(detectFile(tis));
+ data.put(Cols.FILE_MIME_ID, Integer.toString(fileMimeId));
+ }
+ writer.writeRow(FILE_PROFILES, data);
+ }
+ } catch (IOException e) {
+ LOG.warn("problem processing: " + relPath, e);
+ return false;
+ }
+ return true;
+ }
+
+ private String detectFile(TikaInputStream tis) {
+ try {
+ return FILE_COMMAND_DETECTOR.detect(tis, new Metadata()).toString();
+ } catch (IOException e) {
+ return DETECT_EXCEPTION;
+ }
+ }
+
+ private String detectTika(TikaInputStream tis) {
+ try {
+ return TIKA.detect(tis);
+ } catch (IOException e) {
+ return DETECT_EXCEPTION;
+ }
+ }
+}
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/TikaEvalCLI.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/TikaEvalCLI.java
new file mode 100644
index 0000000..7e70d44
--- /dev/null
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/TikaEvalCLI.java
@@ -0,0 +1,326 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval.app;
+
+import java.io.InputStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.StandardCopyOption;
+import java.sql.SQLException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.DefaultParser;
+import org.apache.commons.cli.ParseException;
+import org.apache.tika.batch.fs.FSBatchProcessCLI;
+import org.apache.tika.eval.app.reports.ResultsReporter;
+import org.h2.tools.Console;
+
+public class TikaEvalCLI {
+ static final String[] tools = {"Profile", "FileProfile",
+ "Compare", "Report", "StartDB"};
+
+ private static String specifyTools() {
+ StringBuilder sb = new StringBuilder();
+ sb.append("Must specify one of the following tools in the first parameter:\n");
+ for (String s : tools) {
+ sb.append(s).append("\n");
+ }
+ return sb.toString();
+
+ }
+
+ private void execute(String[] args) throws Exception {
+ String tool = args[0];
+ String[] subsetArgs = new String[args.length-1];
+ System.arraycopy(args, 1, subsetArgs, 0, args.length - 1);
+ if (tool.equals("Report")) {
+ handleReport(subsetArgs);
+ } else if (tool.equals("Compare")) {
+ handleCompare(subsetArgs);
+ } else if (tool.equals("Profile")) {
+ handleProfile(subsetArgs);
+ } else if (tool.equals("StartDB")) {
+ handleStartDB(subsetArgs);
+ } else if (tool.equals("FileProfile")){
+ handleProfileFiles(subsetArgs);
+ } else {
+ System.out.println(specifyTools());
+ }
+ }
+
+ private void handleProfileFiles(String[] subsetArgs) throws Exception {
+ List<String> argList = new ArrayList<>(Arrays.asList(subsetArgs));
+
+ boolean containsBC = false;
+ //confirm there's a batch-config file
+ for (String arg : argList) {
+ if (arg.equals("-bc")) {
+ containsBC = true;
+ }
+ }
+
+ Path tmpBCConfig = null;
+ try {
+ tmpBCConfig = Files.createTempFile("tika-eval-profiler", ".xml");
+ if (! containsBC) {
+ try (InputStream is = this.getClass().getResourceAsStream("/tika-eval-file-profiler-config.xml")) {
+ Files.copy(is, tmpBCConfig, StandardCopyOption.REPLACE_EXISTING);
+ }
+ argList.add("-bc");
+ argList.add(tmpBCConfig.toAbsolutePath().toString());
+ }
+
+ String[] updatedArgs = argList.toArray(new String[0]);
+ DefaultParser defaultCLIParser = new DefaultParser();
+ try {
+ CommandLine commandLine = defaultCLIParser.parse(FileProfiler.OPTIONS, updatedArgs);
+ if (commandLine.hasOption("db") && commandLine.hasOption("jdbc")) {
+ System.out.println("Please specify either the default -db or the full -jdbc, not both");
+ ExtractProfiler.USAGE();
+ return;
+ }
+ } catch (ParseException e) {
+ System.out.println(e.getMessage()+"\n");
+ FileProfiler.USAGE();
+ return;
+ }
+
+ FSBatchProcessCLI.main(updatedArgs);
+ } finally {
+ if (tmpBCConfig != null && Files.isRegularFile(tmpBCConfig)) {
+ Files.delete(tmpBCConfig);
+ }
+ }
+ }
+
+ private void handleStartDB(String[] args) throws SQLException {
+ List<String> argList = new ArrayList<>();
+ argList.add("-web");
+ Console.main(argList.toArray(new String[0]));
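+ //block so the JVM stays up and the H2 web console remains
+ //available until the user interrupts (e.g. ctrl-c)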
+ while (true) {
+ try {
+ Thread.sleep(1000);
+ } catch (InterruptedException e){
+ break;
+ }
+ }
+ }
+
+ private void handleProfile(String[] subsetArgs) throws Exception {
+ List<String> argList = new ArrayList<>(Arrays.asList(subsetArgs));
+
+ boolean containsBC = false;
+ String inputDir = null;
+ String extracts = null;
+ String alterExtract = null;
+ //confirm there's a batch-config file
+ for (int i = 0; i < argList.size(); i++) {
+ String arg = argList.get(i);
+ if (arg.equals("-bc")) {
+ containsBC = true;
+ } else if (arg.equals("-inputDir")) {
+ if (i+1 >= argList.size()) {
+ System.err.println("Must specify directory after -inputDir");
+ ExtractProfiler.USAGE();
+ return;
+ }
+ inputDir = argList.get(i+1);
+ i++;
+ } else if (arg.equals("-extracts")) {
+ if (i+1 >= argList.size()) {
+ System.err.println("Must specify directory after -extracts");
+ ExtractProfiler.USAGE();
+ return;
+ }
+ extracts = argList.get(i+1);
+ i++;
+ } else if (arg.equals("-alterExtract")) {
+ if (i+1 >= argList.size()) {
+ System.err.println("Must specify type 'as_is', 'first_only' or " +
+ "'concatenate_content' after -alterExtract");
+ ExtractComparer.USAGE();
+ return;
+ }
+ alterExtract = argList.get(i+1);
+ i++;
+ }
+ }
+
+ if (alterExtract != null && !alterExtract.equals("as_is") &&
+ !alterExtract.equals("concatenate_content") &&
+ !alterExtract.equals("first_only")) {
+ System.out.println("Sorry, I don't understand:"+alterExtract+
+ ". The values must be one of: as_is, first_only, concatenate_content");
+ ExtractProfiler.USAGE();
+ return;
+ }
+
+ //need to specify each in this commandline
+ //if only extracts is passed to tika-batch,
+ //the crawler will see no inputDir and start crawling "input".
+ //this allows the user to specify either extracts or inputDir
+ if (extracts == null && inputDir != null) {
+ argList.add("-extracts");
+ argList.add(inputDir);
+ } else if (inputDir == null && extracts != null) {
+ argList.add("-inputDir");
+ argList.add(extracts);
+ }
+
+ Path tmpBCConfig = null;
+ try {
+ tmpBCConfig = Files.createTempFile("tika-eval-profiler", ".xml");
+ if (! containsBC) {
+ try (InputStream is = this.getClass().getResourceAsStream("/tika-eval-profiler-config.xml")) {
+ Files.copy(is, tmpBCConfig, StandardCopyOption.REPLACE_EXISTING);
+ }
+ argList.add("-bc");
+ argList.add(tmpBCConfig.toAbsolutePath().toString());
+ }
+
+ String[] updatedArgs = argList.toArray(new String[0]);
+ DefaultParser defaultCLIParser = new DefaultParser();
+ try {
+ CommandLine commandLine = defaultCLIParser.parse(ExtractProfiler.OPTIONS, updatedArgs);
+ if (commandLine.hasOption("db") && commandLine.hasOption("jdbc")) {
+ System.out.println("Please specify either the default -db or the full -jdbc, not both");
+ ExtractProfiler.USAGE();
+ return;
+ }
+ } catch (ParseException e) {
+ System.out.println(e.getMessage()+"\n");
+ ExtractProfiler.USAGE();
+ return;
+ }
+
+ FSBatchProcessCLI.main(updatedArgs);
+ } finally {
+ if (tmpBCConfig != null && Files.isRegularFile(tmpBCConfig)) {
+ Files.delete(tmpBCConfig);
+ }
+ }
+ }
+
+ private void handleCompare(String[] subsetArgs) throws Exception{
+ List<String> argList = new ArrayList<>(Arrays.asList(subsetArgs));
+
+ boolean containsBC = false;
+ String inputDir = null;
+ String extractsA = null;
+ String alterExtract = null;
+ //confirm there's a batch-config file
+ for (int i = 0; i < argList.size(); i++) {
+ String arg = argList.get(i);
+ if (arg.equals("-bc")) {
+ containsBC = true;
+ } else if (arg.equals("-inputDir")) {
+ if (i+1 >= argList.size()) {
+ System.err.println("Must specify directory after -inputDir");
+ ExtractComparer.USAGE();
+ return;
+ }
+ inputDir = argList.get(i+1);
+ i++;
+ } else if (arg.equals("-extractsA")) {
+ if (i+1 >= argList.size()) {
+ System.err.println("Must specify directory after -extractsA");
+ ExtractComparer.USAGE();
+ return;
+ }
+ extractsA = argList.get(i+1);
+ i++;
+ } else if (arg.equals("-alterExtract")) {
+ if (i+1 >= argList.size()) {
+ System.err.println("Must specify type 'as_is', 'first_only' or " +
+ "'concatenate_content' after -alterExtract");
+ ExtractComparer.USAGE();
+ return;
+ }
+ alterExtract = argList.get(i+1);
+ i++;
+ }
+ }
+ if (alterExtract != null && !alterExtract.equals("as_is") &&
+ !alterExtract.equals("concatenate_content") &&
+ !alterExtract.equals("first_only")) {
+ System.out.println("Sorry, I don't understand:"+alterExtract+
+ ". The values must be one of: as_is, first_only, concatenate_content");
+ ExtractComparer.USAGE();
+ return;
+ }
+
+ //need to specify each in the commandline that goes into tika-batch
+ //if only extracts is passed to tika-batch,
+ //the crawler will see no inputDir and start crawling "input".
+ //if the user doesn't specify inputDir, crawl extractsA
+ if (inputDir == null && extractsA != null) {
+ argList.add("-inputDir");
+ argList.add(extractsA);
+ }
+
+ Path tmpBCConfig = null;
+ try {
+ tmpBCConfig = Files.createTempFile("tika-eval", ".xml");
+ if (! containsBC) {
+ try (InputStream is = this.getClass().getResourceAsStream("/tika-eval-comparison-config.xml")) {
+ Files.copy(is, tmpBCConfig, StandardCopyOption.REPLACE_EXISTING);
+ }
+ argList.add("-bc");
+ argList.add(tmpBCConfig.toAbsolutePath().toString());
+
+ }
+ String[] updatedArgs = argList.toArray(new String[0]);
+ DefaultParser defaultCLIParser = new DefaultParser();
+ try {
+ CommandLine commandLine = defaultCLIParser.parse(ExtractComparer.OPTIONS, updatedArgs);
+ if (commandLine.hasOption("db") && commandLine.hasOption("jdbc")) {
+ System.out.println("Please specify either the default -db or the full -jdbc, not both");
+ ExtractComparer.USAGE();
+ return;
+ }
+ } catch (ParseException e) {
+ System.out.println(e.getMessage()+"\n");
+ ExtractComparer.USAGE();
+ return;
+ }
+
+ FSBatchProcessCLI.main(updatedArgs);
+ } finally {
+ if (tmpBCConfig != null && Files.isRegularFile(tmpBCConfig)) {
+ Files.delete(tmpBCConfig);
+ }
+ }
+ }
+
+ private void handleReport(String[] subsetArgs) throws Exception {
+ ResultsReporter.main(subsetArgs);
+ }
+
+ public static void main(String[] args) throws Exception {
+ TikaEvalCLI cli = new TikaEvalCLI();
+ if (args.length == 0) {
+ System.err.println(specifyTools());
+ return;
+ }
+ cli.execute(args);
+ }
+}
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/XMLErrorLogUpdater.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/XMLErrorLogUpdater.java
new file mode 100644
index 0000000..9a8862a
--- /dev/null
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/XMLErrorLogUpdater.java
@@ -0,0 +1,223 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval.app;
+
+
+import javax.xml.stream.XMLInputFactory;
+import javax.xml.stream.XMLStreamConstants;
+import javax.xml.stream.XMLStreamException;
+import javax.xml.stream.XMLStreamReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.StringReader;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.sql.Connection;
+import java.sql.ResultSet;
+import java.sql.SQLException;
+import java.sql.Statement;
+
+import org.apache.commons.io.IOExceptionWithCause;
+import org.apache.log4j.Level;
+import org.apache.tika.eval.app.db.Cols;
+import org.apache.tika.eval.app.db.H2Util;
+import org.apache.tika.eval.app.db.JDBCUtil;
+import org.apache.tika.eval.app.db.TableInfo;
+import org.apache.tika.eval.app.io.XMLLogMsgHandler;
+import org.apache.tika.eval.app.io.XMLLogReader;
+import org.apache.tika.eval.app.reports.ResultsReporter;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * This is a very task specific class that reads a log file and updates
+ * the "comparisons" table. It should not be run in a multithreaded environment.
+ */
+public class XMLErrorLogUpdater {
+ private static final Logger LOG = LoggerFactory.getLogger(XMLErrorLogUpdater.class);
+
+ private Statement statement;
+
+ public static void main(String[] args) throws Exception {
+ XMLErrorLogUpdater writer = new XMLErrorLogUpdater();
+ Path xmlLogFileA = Paths.get(args[0]);
+ Path xmlLogFileB = Paths.get(args[1]);
+ Path db = Paths.get(args[2]);
+ JDBCUtil dbUtil = new H2Util(db);
+ Connection connection = dbUtil.getConnection();
+ writer.update(connection, ExtractComparer.EXTRACT_EXCEPTION_TABLE_A, xmlLogFileA);
+ writer.update(connection, ExtractComparer.EXTRACT_EXCEPTION_TABLE_B, xmlLogFileB);
+ connection.commit();
+ }
+
+ public void update(Connection connection, TableInfo tableInfo, Path xmlLogFile) throws Exception {
+ statement = connection.createStatement();
+ XMLLogReader reader = new XMLLogReader();
+ try (InputStream is = Files.newInputStream(xmlLogFile)) {
+ reader.read(is, new ErrorMsgUpdater(tableInfo.getName()));
+ } catch (IOException e) {
+ throw new RuntimeException("Problem reading: "+xmlLogFile.toAbsolutePath().toString());
+ } finally {
+ try {
+ connection.commit();
+ statement.close();
+ } catch (SQLException e) {
+ throw new RuntimeException("Failed to close db connection!", e);
+ }
+ }
+ }
+
+ private class ErrorMsgUpdater implements XMLLogMsgHandler {
+ private final String errorTablename;
+
+ private ErrorMsgUpdater(String errorTablename) {
+ this.errorTablename = errorTablename;
+ }
+
+ @Override
+ public void handleMsg(Level level, String xml) throws SQLException, IOException {
+ if (! level.equals(Level.ERROR)) {
+ return;
+ }
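+ //only ERROR-level log events are inspected for the timed_out/oom elements below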
+ XMLStreamReader reader = null;
+ try {
+ reader = XMLInputFactory.newInstance().createXMLStreamReader(new StringReader(xml));
+ } catch (XMLStreamException e) {
+ throw new IOExceptionWithCause(e);
+ }
+ String resourceId = null;
+ try {
+ while (reader.hasNext() && resourceId == null) {
+ reader.next();
+ switch (reader.getEventType()) {
+ case XMLStreamConstants.START_ELEMENT:
+ if ("timed_out".equals(reader.getLocalName())) {
+ resourceId = reader.getAttributeValue("", "resourceId");
+ update(errorTablename, resourceId,
+ AbstractProfiler.PARSE_ERROR_TYPE.TIMEOUT);
+
+ } else if ("oom".equals(reader.getLocalName())) {
+ resourceId = reader.getAttributeValue("", "resourceId");
+ update(errorTablename, resourceId, AbstractProfiler.PARSE_ERROR_TYPE.OOM);
+ }
+ break;
+ }
+ }
+ reader.close();
+ } catch (XMLStreamException e) {
+ throw new IOExceptionWithCause(e);
+ }
+ }
+
+ private void update(String errorTableName,
+ String filePath, AbstractProfiler.PARSE_ERROR_TYPE type) throws SQLException {
+ int containerId = getContainerId(filePath);
+ String sql = "SELECT count(1) from "+errorTableName +
+ " where "+ Cols.CONTAINER_ID +
+ " = "+containerId + " or "+
+ Cols.FILE_PATH + "='"+filePath+"'";
+ ResultSet rs = statement.executeQuery(sql);
+
+ //now try to figure out if that file already exists
+ //in parse errors
+ int hitCount = 0;
+ while (rs.next()) {
+ hitCount = rs.getInt(1);
+ }
+
+ //if it does, update all records matching that path or container id
+ if (hitCount > 0) {
+ sql = "UPDATE " + errorTableName +
+ " SET " + Cols.PARSE_ERROR_ID +
+ " = " + type.ordinal() + ","+
+ Cols.FILE_PATH + "='" +filePath+"'"+
+ " where "+Cols.CONTAINER_ID +
+ "="+containerId + " or "+
+ Cols.FILE_PATH + "='"+filePath+"'";;
+
+ } else {
+ //if not and container id > -1
+ //insert full record
+ if (containerId > -1) {
+ sql = "INSERT INTO " + errorTableName +
+ " ("+Cols.CONTAINER_ID+","+Cols.FILE_PATH +","+Cols.PARSE_ERROR_ID +")"+
+ " values (" + containerId + ", '" + filePath + "'," +
+ type.ordinal() + ");";
+ } else {
+ //if container id == -1, insert only file path and parse error type id
+ sql = "INSERT INTO " + errorTableName +
+ " ("+Cols.FILE_PATH.name()+","+Cols.PARSE_ERROR_ID +")"+
+ "values ('" + filePath + "'," +
+ type.ordinal() + ");";
+ }
+
+ }
+ int updated = statement.executeUpdate(sql);
+ if (updated == 0) {
+ //TODO: log
+ LOG.warn("made no updates in xmlerrorlogupdater!");
+ } else if (updated > 1) {
+ LOG.warn("made too many updates");
+ }
+ }
+
+ private int getContainerId(String resourceId) throws SQLException {
+ int containerId = -1;
+ String sql = "SELECT " + Cols.CONTAINER_ID.name() +
+ " from " + ExtractProfiler.CONTAINER_TABLE.getName()+
+ " where " + Cols.FILE_PATH +
+ " ='"+resourceId+"'";
+ ResultSet rs = statement.executeQuery(sql);
+ int resultCount = 0;
+ while (rs.next()) {
+ containerId = rs.getInt(1);
+ resultCount++;
+ }
+ rs.close();
+
+ if (resultCount == 0) {
+ LOG.warn("Should have found a container for: {}", resourceId);
+ } else if (resultCount > 1) {
+ LOG.error("Records ids should be unique: {}", resourceId);
+ }
+/*
+ if (containerId < 0) {
+ System.err.println("CONTAINER ID < 0!!!");
+ sql = "SELECT MAX("+ Cols.CONTAINER_ID.name() +
+ ") from "+ExtractProfiler.CONTAINER_TABLE.getName();
+ rs = statement.executeQuery(sql);
+ while (rs.next()) {
+ containerId = rs.getInt(1);
+ }
+ rs.close();
+ if (containerId < 0) {
+ //log and abort
+ //return -1?
+ } else {
+ containerId++;
+ }
+
+ }*/
+ return containerId;
+ }
+
+
+ }
+
+}
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/batch/DBConsumersManager.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/batch/DBConsumersManager.java
new file mode 100644
index 0000000..7819893
--- /dev/null
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/batch/DBConsumersManager.java
@@ -0,0 +1,103 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval.app.batch;
+
+import java.io.IOException;
+import java.nio.file.Path;
+import java.sql.Connection;
+import java.sql.SQLException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.tika.batch.ConsumersManager;
+import org.apache.tika.batch.FileResourceConsumer;
+import org.apache.tika.eval.app.AbstractProfiler;
+import org.apache.tika.eval.app.XMLErrorLogUpdater;
+import org.apache.tika.eval.app.db.JDBCUtil;
+import org.apache.tika.eval.app.db.MimeBuffer;
+import org.apache.tika.eval.app.db.TableInfo;
+
+
+public class DBConsumersManager extends ConsumersManager {
+
+ private final Connection conn;
+ private final MimeBuffer mimeBuffer;
+ private final List<LogTablePair> errorLogs = new ArrayList<>();
+
+ public DBConsumersManager(JDBCUtil dbUtil, MimeBuffer mimeBuffer, List<FileResourceConsumer> consumers)
+ throws SQLException {
+ super(consumers);
+ this.conn = dbUtil.getConnection();
+ this.mimeBuffer = mimeBuffer;
+ }
+
+
+ @Override
+ public void shutdown() {
+
+ for (FileResourceConsumer consumer : getConsumers()) {
+ if (consumer instanceof AbstractProfiler) {
+ try{
+ ((AbstractProfiler)consumer).closeWriter();
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+ }
+ }
+
+ try {
+ mimeBuffer.close();
+ } catch (SQLException e) {
+ throw new RuntimeException(e);
+ }
+
+ //MUST HAPPEN AFTER consumers have closed and
+ //committed container information!!!
+ XMLErrorLogUpdater up = new XMLErrorLogUpdater();
+ for (LogTablePair p : errorLogs) {
+ try {
+ up.update(conn, p.tableInfo, p.log);
+ } catch (Exception e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+
+ try {
+ conn.commit();
+ } catch (SQLException e) {
+ throw new RuntimeException(e);
+ }
+ try {
+ conn.close();
+ } catch (SQLException e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+ public void addErrorLogTablePair(Path log, TableInfo tableInfo) {
+ LogTablePair p = new LogTablePair();
+ p.log = log;
+ p.tableInfo = tableInfo;
+ errorLogs.add(p);
+ }
+
+ class LogTablePair {
+ Path log;
+ TableInfo tableInfo;
+ }
+}
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/batch/EvalConsumerBuilder.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/batch/EvalConsumerBuilder.java
new file mode 100644
index 0000000..9db92a7
--- /dev/null
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/batch/EvalConsumerBuilder.java
@@ -0,0 +1,240 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.eval.app.batch;
+
+import java.io.IOException;
+import java.sql.Connection;
+import java.sql.ResultSet;
+import java.sql.SQLException;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.concurrent.ArrayBlockingQueue;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import org.apache.tika.batch.FileResource;
+import org.apache.tika.batch.FileResourceConsumer;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.eval.app.AbstractProfiler;
+import org.apache.tika.eval.app.db.Cols;
+import org.apache.tika.eval.app.db.JDBCUtil;
+import org.apache.tika.eval.app.db.MimeBuffer;
+import org.apache.tika.eval.app.db.TableInfo;
+import org.apache.tika.eval.app.io.DBWriter;
+import org.apache.tika.eval.app.io.ExtractReader;
+import org.apache.tika.eval.app.io.ExtractReaderException;
+import org.apache.tika.eval.app.io.IDBWriter;
+import org.apache.tika.util.PropsUtil;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public abstract class EvalConsumerBuilder {
+
+ private static final Logger LOG = LoggerFactory.getLogger(EvalConsumerBuilder.class);
+
+ protected ArrayBlockingQueue<FileResource> queue;
+ Map<String, String> localAttrs;
+ JDBCUtil dbUtil;
+ private MimeBuffer mimeBuffer;
+ AtomicInteger initialized = new AtomicInteger(0);
+
+ public MimeBuffer init(ArrayBlockingQueue<FileResource> queue, Map<String, String> localAttrs,
+ JDBCUtil dbUtil, boolean forceDrop) throws IOException, SQLException {
+ if (initialized.getAndIncrement() > 0) {
+ throw new RuntimeException("Can only init a consumer builder once!");
+ }
+ this.queue = queue;
+ this.localAttrs = localAttrs;
+ this.dbUtil = dbUtil;
+ //the order of the following is critical
+ //step 1. update the table names with prefixes
+ updateTableInfosWithPrefixes(localAttrs);
+
+ JDBCUtil.CREATE_TABLE createRegularTable = (forceDrop) ? JDBCUtil.CREATE_TABLE.DROP_IF_EXISTS :
+ JDBCUtil.CREATE_TABLE.THROW_EX_IF_EXISTS;
+
+ JDBCUtil.CREATE_TABLE createRefTable = (forceDrop) ? JDBCUtil.CREATE_TABLE.DROP_IF_EXISTS :
+ JDBCUtil.CREATE_TABLE.SKIP_IF_EXISTS;
+
+ //step 2. create the tables
+ dbUtil.createTables(getNonRefTableInfos(), createRegularTable);
+ dbUtil.createTables(getRefTableInfos(), createRefTable);
+
+ //step 3. create mime buffer
+ this.mimeBuffer = new MimeBuffer(dbUtil.getConnection(), getMimeTable(),
+ TikaConfig.getDefaultConfig());
+
+ //step 4. populate the reference tables
+ populateRefTables();
+
+ return mimeBuffer;
+ }
+
+ public abstract FileResourceConsumer build() throws IOException, SQLException;
+
+ protected abstract void updateTableInfosWithPrefixes(Map<String, String> attrs);
+
+ /**
+ *
+ * @return only the ref tables
+ */
+ protected abstract List<TableInfo> getRefTableInfos();
+
+ /**
+ *
+ * @return the main tables, not including the ref tables
+ */
+ protected abstract List<TableInfo> getNonRefTableInfos();
+
+ protected abstract TableInfo getMimeTable();
+
+ protected abstract void addErrorLogTablePairs(DBConsumersManager manager);
+
+ public void populateRefTables() throws IOException, SQLException {
+ boolean refTablesPopulated = true;
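+ //assume the ref tables are populated unless one of them
+ //turns out to be empty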
+ try{
+ Connection connection = dbUtil.getConnection();
+ for (TableInfo tableInfo : getRefTableInfos()) {
+ int rows = 0;
+ try (ResultSet rs = connection.createStatement().executeQuery("select * from "+
+ tableInfo.getName())) {
+ while (rs.next()) {
+ rows++;
+ }
+ }
+ if (rows == 0) {
+ refTablesPopulated = false;
+ break;
+ }
+
+ }
+ } catch (SQLException e) {
+ //swallow; leave refTablesPopulated as-is
+ }
+ if (refTablesPopulated) {
+ LOG.info("ref tables are already populated");
+ return;
+ }
+
+ IDBWriter writer = getDBWriter(getRefTableInfos());
+ Map<Cols, String> m = new HashMap<>();
+ for (AbstractProfiler.PARSE_ERROR_TYPE t : AbstractProfiler.PARSE_ERROR_TYPE.values()) {
+ m.clear();
+ m.put(Cols.PARSE_ERROR_ID, Integer.toString(t.ordinal()));
+ m.put(Cols.PARSE_ERROR_DESCRIPTION, t.name());
+ writer.writeRow(AbstractProfiler.REF_PARSE_ERROR_TYPES, m);
+ }
+
+ for (AbstractProfiler.EXCEPTION_TYPE t : AbstractProfiler.EXCEPTION_TYPE.values()) {
+ m.clear();
+ m.put(Cols.PARSE_EXCEPTION_ID, Integer.toString(t.ordinal()));
+ m.put(Cols.PARSE_EXCEPTION_DESCRIPTION, t.name());
+ writer.writeRow(AbstractProfiler.REF_PARSE_EXCEPTION_TYPES, m);
+ }
+
+ for (ExtractReaderException.TYPE t :
+ ExtractReaderException.TYPE.values()) {
+ m.clear();
+ m.put(Cols.EXTRACT_EXCEPTION_ID, Integer.toString(t.ordinal()));
+ m.put(Cols.EXTRACT_EXCEPTION_DESCRIPTION, t.name());
+ writer.writeRow(AbstractProfiler.REF_EXTRACT_EXCEPTION_TYPES, m);
+ }
+ writer.close();
+ }
+
+ protected IDBWriter getDBWriter(List<TableInfo> tableInfos) throws IOException, SQLException {
+ Connection conn = dbUtil.getConnection();
+ return new DBWriter(conn, tableInfos, dbUtil, mimeBuffer);
+ }
+
+ ExtractReader.ALTER_METADATA_LIST getAlterMetadata(Map<String, String> localAttrs) {
+
+ String alterExtractString = localAttrs.get("alterExtract");
+ ExtractReader.ALTER_METADATA_LIST alterExtractList;
+ if (alterExtractString == null || alterExtractString.equalsIgnoreCase("as_is")) {
+ alterExtractList = ExtractReader.ALTER_METADATA_LIST.AS_IS;
+ } else if (alterExtractString.equalsIgnoreCase("first_only")) {
+ alterExtractList = ExtractReader.ALTER_METADATA_LIST.FIRST_ONLY;
+ } else if (alterExtractString.equalsIgnoreCase("concatenate_content")) {
+ alterExtractList = ExtractReader.ALTER_METADATA_LIST.CONCATENATE_CONTENT_INTO_FIRST;
+ } else {
+ throw new RuntimeException("options for alterExtract: as_is, first_only, concatenate_content." +
+ " I don't understand:" + alterExtractString);
+ }
+ return alterExtractList;
+ }
+
+ protected ExtractReader buildExtractReader(Map<String, String> localAttrs) {
+ long minExtractLength = PropsUtil.getLong(localAttrs.get("minExtractLength"), -1L);
+ long maxExtractLength = PropsUtil.getLong(localAttrs.get("maxExtractLength"), -1L);
+
+ ExtractReader.ALTER_METADATA_LIST alterExtractList = getAlterMetadata(localAttrs);
+ return new ExtractReader(alterExtractList, minExtractLength, maxExtractLength);
+ }
+
+ FileResourceConsumer parameterizeProfiler(AbstractProfiler abstractProfiler) {
+
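+ //-2 is the "not specified" sentinel from PropsUtil.getInt; only
+ //override the profiler's defaults when the user supplied a value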
+ int maxContentLength = PropsUtil.getInt(localAttrs.get("maxContentLength"), -2);
+ if (maxContentLength > -2) {
+ abstractProfiler.setMaxContentLength(maxContentLength);
+ }
+
+ int maxContentLengthForLangId = PropsUtil.getInt(localAttrs.get("maxContentLengthForLangId"), -2);
+ if (maxContentLengthForLangId > -2) {
+ abstractProfiler.setMaxContentLengthForLangId(maxContentLengthForLangId);
+ }
+
+ int maxTokens = PropsUtil.getInt(localAttrs.get("maxTokens"), -2);
+ if (maxTokens > -2) {
+ abstractProfiler.setMaxTokens(maxTokens);
+ }
+
+
+ return abstractProfiler;
+ }
+
+
+/*
+ public abstract Map<String, String> getIndexInfo();
+
+ class ValueComparator implements Comparator<String> {
+
+ Map<String, ColInfo> map;
+
+ public ValueComparator(Map<String, ColInfo> base) {
+ this.map = base;
+ }
+
+ public int compare(String a, String b) {
+ Integer aVal = map.get(a).getDBColOffset();
+ Integer bVal = map.get(b).getDBColOffset();
+ if (aVal == null || bVal == null) {
+ throw new IllegalArgumentException("Column offset must be specified!");
+ }
+ if (aVal == bVal && ! map.get(a).equals(map.get(b))) {
+ throw new IllegalArgumentException("Column offsets must be unique: " + a + " and " + b + " both have: "+aVal);
+ }
+ if (aVal < bVal) {
+ return -1;
+ } else {
+ return 1;
+ }
+ }
+ }
+*/
+}
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/batch/EvalConsumersBuilder.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/batch/EvalConsumersBuilder.java
new file mode 100644
index 0000000..32c4bf4
--- /dev/null
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/batch/EvalConsumersBuilder.java
@@ -0,0 +1,130 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval.app.batch;
+
+
+import java.io.IOException;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.sql.SQLException;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+import java.util.concurrent.ArrayBlockingQueue;
+
+import org.apache.tika.batch.ConsumersManager;
+import org.apache.tika.batch.FileResource;
+import org.apache.tika.batch.FileResourceConsumer;
+import org.apache.tika.batch.builders.AbstractConsumersBuilder;
+import org.apache.tika.batch.builders.BatchProcessBuilder;
+import org.apache.tika.eval.app.AbstractProfiler;
+import org.apache.tika.eval.app.db.H2Util;
+import org.apache.tika.eval.app.db.JDBCUtil;
+import org.apache.tika.eval.app.db.MimeBuffer;
+import org.apache.tika.util.ClassLoaderUtil;
+import org.apache.tika.util.PropsUtil;
+import org.apache.tika.util.XMLDOMUtil;
+import org.w3c.dom.Node;
+
+public class EvalConsumersBuilder extends AbstractConsumersBuilder {
+
+ @Override
+ public ConsumersManager build(Node node, Map<String, String> runtimeAttributes,
+ ArrayBlockingQueue<FileResource> queue) {
+
+ List<FileResourceConsumer> consumers = new LinkedList<>();
+ int numConsumers = BatchProcessBuilder.getNumConsumers(runtimeAttributes);
+
+ Map<String, String> localAttrs = XMLDOMUtil.mapifyAttrs(node, runtimeAttributes);
+
+
+ Path db = getPath(localAttrs, "db");
+ String jdbcConnectionString = localAttrs.get("jdbc");
+
+
+ Path commonTokens = getPath(localAttrs, "commonTokens");
+ String defaultLangCode = localAttrs.get("defaultLangCode");
+ if (defaultLangCode == null) {
+ defaultLangCode = "";
+ }
+ //commonTokens can be null, in which case the default common tokens are loaded
+ try {
+ AbstractProfiler.loadCommonTokens(commonTokens, defaultLangCode);
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+
+ JDBCUtil jdbcUtil = null;
+ if (db != null) {
+ jdbcUtil = new H2Util(db);
+ } else if (jdbcConnectionString != null) {
+ jdbcUtil = new JDBCUtil(jdbcConnectionString, localAttrs.get("jdbcDriver"));
+ } else {
+ throw new RuntimeException("Must specify: -db or -jdbc");
+ }
+ EvalConsumerBuilder consumerBuilder = ClassLoaderUtil.buildClass(EvalConsumerBuilder.class,
+ PropsUtil.getString(localAttrs.get("consumerBuilderClass"), null));
+ if (consumerBuilder == null) {
+ throw new RuntimeException("Must specify consumerBuilderClass in config file");
+ }
+
+ boolean forceDrop = PropsUtil.getBoolean(localAttrs.get("drop"), false);
+ MimeBuffer mimeBuffer = null;
+ try {
+ mimeBuffer = consumerBuilder.init(queue, localAttrs, jdbcUtil, forceDrop);
+ } catch (IOException | SQLException e) {
+ throw new RuntimeException(e);
+ }
+
+ for (int i = 0; i < numConsumers; i++) {
+ try {
+ consumers.add(consumerBuilder.build());
+ } catch (IOException | SQLException e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+ DBConsumersManager manager;
+ try {
+ manager = new DBConsumersManager(jdbcUtil, mimeBuffer, consumers);
+ } catch (SQLException e) {
+ throw new RuntimeException(e);
+ }
+ consumerBuilder.addErrorLogTablePairs(manager);
+
+ return manager;
+ }
+
+ private Path getNonNullPath(Map<String, String> attrs, String key) {
+ Path p = getPath(attrs, key);
+ if (p == null) {
+ throw new RuntimeException("Must specify a file for this attribute: "+key);
+ }
+ return p;
+ }
+
+
+ protected Path getPath(Map<String, String> attrs, String key) {
+ String filePath = attrs.get(key);
+ if (filePath == null) {
+ return null;
+ }
+ return Paths.get(filePath);
+ }
+
+
+}
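
For readers following along: build() accepts either a "db" path (wrapped in
H2Util) or a full "jdbc" connection string. A minimal sketch of the two
equivalent ways to point tika-eval at the same H2 database, assuming the H2
driver is on the classpath and using a purely illustrative path:

    import java.nio.file.Paths;

    import org.apache.tika.eval.app.db.H2Util;
    import org.apache.tika.eval.app.db.JDBCUtil;

    public class DbTargetSketch {
        public static void main(String[] args) {
            //equivalent to passing db=/data/eval/profiles in the config
            JDBCUtil viaDb = new H2Util(Paths.get("/data/eval/profiles"));
            //equivalent to passing jdbc=...; the driver is inferred from
            //db.properties when jdbcDriver is omitted
            JDBCUtil viaJdbc = new JDBCUtil("jdbc:h2:/data/eval/profiles", "org.h2.Driver");
            System.out.println(viaDb.getConnectionString());
            System.out.println(viaJdbc.getConnectionString());
        }
    }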
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/batch/ExtractComparerBuilder.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/batch/ExtractComparerBuilder.java
new file mode 100644
index 0000000..b0252a9
--- /dev/null
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/batch/ExtractComparerBuilder.java
@@ -0,0 +1,165 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval.app.batch;
+
+
+import java.io.IOException;
+import java.nio.file.Path;
+import java.sql.SQLException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.tika.batch.FileResourceConsumer;
+import org.apache.tika.eval.app.AbstractProfiler;
+import org.apache.tika.eval.app.ExtractComparer;
+import org.apache.tika.eval.app.db.TableInfo;
+import org.apache.tika.util.PropsUtil;
+
+public class ExtractComparerBuilder extends EvalConsumerBuilder {
+ public final static String TABLE_PREFIX_A_KEY = "tablePrefixA";
+ public final static String TABLE_PREFIX_B_KEY = "tablePrefixB";
+
+ private final List<TableInfo> tableInfosA;
+ private final List<TableInfo> tableInfosB;
+ private final List<TableInfo> tableInfosAandB;
+ private final List<TableInfo> refTableInfos;
+
+ public ExtractComparerBuilder() {
+ List<TableInfo> tableInfosA = new ArrayList<>();
+ List<TableInfo> tableInfosB = new ArrayList<>();
+ List<TableInfo> tableInfosAandB = new ArrayList<>();
+ tableInfosA.add(ExtractComparer.PROFILES_A);
+ tableInfosA.add(ExtractComparer.EXCEPTION_TABLE_A);
+ tableInfosA.add(ExtractComparer.TAGS_TABLE_A);
+ tableInfosA.add(ExtractComparer.CONTENTS_TABLE_A);
+ tableInfosA.add(ExtractComparer.EXTRACT_EXCEPTION_TABLE_A);
+ tableInfosA.add(ExtractComparer.EMBEDDED_FILE_PATH_TABLE_A);
+
+ tableInfosB.add(ExtractComparer.PROFILES_B);
+ tableInfosB.add(ExtractComparer.EXCEPTION_TABLE_B);
+ tableInfosB.add(ExtractComparer.EXTRACT_EXCEPTION_TABLE_B);
+ tableInfosB.add(ExtractComparer.TAGS_TABLE_B);
+ tableInfosB.add(ExtractComparer.CONTENTS_TABLE_B);
+ tableInfosB.add(ExtractComparer.EMBEDDED_FILE_PATH_TABLE_B);
+
+ tableInfosAandB.add(ExtractComparer.COMPARISON_CONTAINERS);
+ tableInfosAandB.add(ExtractComparer.CONTENT_COMPARISONS);
+ tableInfosAandB.add(AbstractProfiler.MIME_TABLE);
+
+ List<TableInfo> refTableInfos = new ArrayList<>();
+ refTableInfos.add(ExtractComparer.REF_PAIR_NAMES);
+ refTableInfos.add(AbstractProfiler.REF_PARSE_ERROR_TYPES);
+ refTableInfos.add(AbstractProfiler.REF_PARSE_EXCEPTION_TYPES);
+ refTableInfos.add(AbstractProfiler.REF_EXTRACT_EXCEPTION_TYPES);
+
+ this.tableInfosA = Collections.unmodifiableList(tableInfosA);
+ this.tableInfosB = Collections.unmodifiableList(tableInfosB);
+ this.tableInfosAandB = Collections.unmodifiableList(tableInfosAandB);
+ this.refTableInfos = Collections.unmodifiableList(refTableInfos);
+ }
+
+ @Override
+ public FileResourceConsumer build() throws IOException, SQLException {
+ Path extractsA = PropsUtil.getPath(localAttrs.get("extractsA"), null);
+ if (extractsA == null) {
+ throw new RuntimeException("Must specify \"extractsA\" -- directory for 'A' extracts");
+ }
+ Path extractsB = PropsUtil.getPath(localAttrs.get("extractsB"), null);
+ if (extractsB == null) {
+ throw new RuntimeException("Must specify \"extractsB\" -- directory for 'B' extracts");
+ }
+
+ Path inputRootDir = PropsUtil.getPath(localAttrs.get("inputDir"), null);
+
+ if (inputRootDir == null) {
+ //this is for the sake of the crawler
+ throw new RuntimeException("Must specify an -inputDir");
+ }
+
+ return parameterizeProfiler(new ExtractComparer(queue, inputRootDir, extractsA, extractsB,
+ buildExtractReader(localAttrs),
+ getDBWriter(getNonRefTableInfos())));
+ }
+
+
+ @Override
+ protected void updateTableInfosWithPrefixes(Map<String, String> attrs) {
+ String tablePrefixA = localAttrs.get(TABLE_PREFIX_A_KEY);
+
+ String tablePrefixB = localAttrs.get(TABLE_PREFIX_B_KEY);
+
+ tablePrefixA = (tablePrefixA == null || tablePrefixA.endsWith("_")) ? tablePrefixA : tablePrefixA+"_";
+ tablePrefixB = (tablePrefixB == null || tablePrefixB.endsWith("_")) ? tablePrefixB : tablePrefixB+"_";
+
+ if (tablePrefixA != null) {
+ for (TableInfo tableInfo : tableInfosA) {
+ tableInfo.setNamePrefix(tablePrefixA);
+ }
+ }
+
+ if (tablePrefixB != null) {
+ for (TableInfo tableInfo : tableInfosB) {
+ tableInfo.setNamePrefix(tablePrefixB);
+ }
+ }
+
+ if (tablePrefixA != null || tablePrefixB != null) {
+ String aAndB = (tablePrefixA == null) ? "" : tablePrefixA;
+ aAndB = (tablePrefixB == null) ? aAndB : aAndB+tablePrefixB;
+ for (TableInfo tableInfo : tableInfosAandB) {
+ tableInfo.setNamePrefix(aAndB);
+ }
+ }
+ }
+
+ @Override
+ protected List<TableInfo> getRefTableInfos() {
+ return refTableInfos;
+ }
+
+ @Override
+ protected List<TableInfo> getNonRefTableInfos() {
+ List<TableInfo> allNonRefTables = new ArrayList<>();
+ allNonRefTables.addAll(tableInfosA);
+ allNonRefTables.addAll(tableInfosB);
+ allNonRefTables.addAll(tableInfosAandB);
+ return Collections.unmodifiableList(allNonRefTables);
+ }
+
+ @Override
+ protected TableInfo getMimeTable() {
+ return AbstractProfiler.MIME_TABLE;
+ }
+
+ @Override
+ protected void addErrorLogTablePairs(DBConsumersManager manager) {
+ Path errorLogA = PropsUtil.getPath(localAttrs.get("errorLogFileA"), null);
+ if (errorLogA == null) {
+ return;
+ }
+ manager.addErrorLogTablePair(errorLogA, ExtractComparer.EXTRACT_EXCEPTION_TABLE_A);
+ Path errorLogB = PropsUtil.getPath(localAttrs.get("errorLogFileB"), null);
+ if (errorLogB == null) {
+ return;
+ }
+ manager.addErrorLogTablePair(errorLogB, ExtractComparer.EXTRACT_EXCEPTION_TABLE_B);
+
+ }
+
+}
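
The prefix normalization in updateTableInfosWithPrefixes() appends a trailing
underscore unless the prefix is null or already ends with one; the shared
A-and-B tables get the concatenation of both prefixes. A tiny self-contained
sketch of that rule, with illustrative values:

    public class PrefixSketch {
        static String normalize(String p) {
            return (p == null || p.endsWith("_")) ? p : p + "_";
        }

        public static void main(String[] args) {
            System.out.println(normalize("runA"));   //runA_
            System.out.println(normalize("runA_"));  //runA_
            System.out.println(normalize(null));     //null
        }
    }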
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/batch/ExtractProfilerBuilder.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/batch/ExtractProfilerBuilder.java
new file mode 100644
index 0000000..69c34e1
--- /dev/null
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/batch/ExtractProfilerBuilder.java
@@ -0,0 +1,122 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval.app.batch;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.sql.SQLException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.tika.batch.FileResourceConsumer;
+import org.apache.tika.eval.app.AbstractProfiler;
+import org.apache.tika.eval.app.ExtractProfiler;
+import org.apache.tika.eval.app.db.TableInfo;
+import org.apache.tika.util.PropsUtil;
+
+
+public class ExtractProfilerBuilder extends EvalConsumerBuilder {
+
+ public final static String TABLE_PREFIX_KEY = "tablePrefix";
+
+ private final List<TableInfo> tableInfos;
+ private final List<TableInfo> refTableInfos;
+ public ExtractProfilerBuilder() {
+ List<TableInfo> tableInfos = new ArrayList<>();
+ tableInfos.add(AbstractProfiler.MIME_TABLE);
+ tableInfos.add(ExtractProfiler.CONTAINER_TABLE);
+ tableInfos.add(ExtractProfiler.PROFILE_TABLE);
+ tableInfos.add(ExtractProfiler.EXTRACT_EXCEPTION_TABLE);
+ tableInfos.add(ExtractProfiler.EXCEPTION_TABLE);
+ tableInfos.add(ExtractProfiler.CONTENTS_TABLE);
+ tableInfos.add(ExtractProfiler.TAGS_TABLE);
+ tableInfos.add(ExtractProfiler.EMBEDDED_FILE_PATH_TABLE);
+ this.tableInfos = Collections.unmodifiableList(tableInfos);
+
+ List<TableInfo> refTableInfos = new ArrayList<>();
+ refTableInfos.add(AbstractProfiler.REF_PARSE_ERROR_TYPES);
+ refTableInfos.add(AbstractProfiler.REF_PARSE_EXCEPTION_TYPES);
+ refTableInfos.add(AbstractProfiler.REF_EXTRACT_EXCEPTION_TYPES);
+ this.refTableInfos = Collections.unmodifiableList(refTableInfos);
+ }
+
+ @Override
+ public FileResourceConsumer build() throws IOException, SQLException {
+ Path extracts = PropsUtil.getPath(localAttrs.get("extracts"), null);
+ if (extracts == null) {
+ throw new RuntimeException("Must specify \"extracts\" -- directory to crawl");
+ }
+ if (!Files.isDirectory(extracts)) {
+ throw new RuntimeException("ROOT DIRECTORY DOES NOT EXIST: " +
+ extracts.toAbsolutePath());
+ }
+
+ Path inputDir = PropsUtil.getPath(localAttrs.get("inputDir"), null);
+
+ //we _could_ default inputDir to extracts (if not null)
+ //here, but the Crawler defaults to "input" if nothing is passed,
+ //so that won't work
+ if (inputDir == null) {
+ throw new RuntimeException("Must specify -inputDir");
+ }
+ return parameterizeProfiler(new ExtractProfiler(queue, inputDir, extracts,
+ buildExtractReader(localAttrs),
+ getDBWriter(tableInfos)));
+ }
+
+
+ @Override
+ protected void updateTableInfosWithPrefixes(Map<String, String> attrs) {
+ String tableNamePrefix = attrs.get(TABLE_PREFIX_KEY);
+ if (tableNamePrefix != null && !tableNamePrefix.equals("null")) {
+ for (TableInfo tableInfo : tableInfos) {
+ tableInfo.setNamePrefix(tableNamePrefix);
+ }
+ }
+ }
+
+
+ @Override
+ protected List<TableInfo> getRefTableInfos() {
+ return refTableInfos;
+ }
+
+ @Override
+ protected List<TableInfo> getNonRefTableInfos() {
+ return tableInfos;
+ }
+
+ @Override
+ protected TableInfo getMimeTable() {
+ return AbstractProfiler.MIME_TABLE;
+ }
+
+ @Override
+ protected void addErrorLogTablePairs(DBConsumersManager manager) {
+ Path errorLog = PropsUtil.getPath(localAttrs.get("errorLogFile"), null);
+ if (errorLog == null) {
+ return;
+ }
+ manager.addErrorLogTablePair(errorLog, ExtractProfiler.EXTRACT_EXCEPTION_TABLE);
+ }
+}
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/batch/FileProfilerBuilder.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/batch/FileProfilerBuilder.java
new file mode 100644
index 0000000..29412d3
--- /dev/null
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/batch/FileProfilerBuilder.java
@@ -0,0 +1,98 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval.app.batch;
+
+import java.io.IOException;
+import java.nio.file.Path;
+import java.sql.SQLException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.tika.batch.FileResourceConsumer;
+import org.apache.tika.eval.app.ExtractProfiler;
+import org.apache.tika.eval.app.FileProfiler;
+import org.apache.tika.eval.app.db.TableInfo;
+import org.apache.tika.util.PropsUtil;
+
+
+public class FileProfilerBuilder extends EvalConsumerBuilder {
+
+ public final static String TABLE_PREFIX_KEY = "tablePrefix";
+
+ private final List<TableInfo> tableInfos;
+
+ public FileProfilerBuilder() {
+ List<TableInfo> tableInfos = new ArrayList<>();
+ tableInfos.add(FileProfiler.FILE_MIME_TABLE);
+ tableInfos.add(FileProfiler.FILE_PROFILES);
+ this.tableInfos = Collections.unmodifiableList(tableInfos);
+
+ }
+
+ @Override
+ public FileResourceConsumer build() throws IOException, SQLException {
+
+ Path inputDir = PropsUtil.getPath(localAttrs.get("inputDir"), null);
+
+ //we _could_ set this to extracts (if not null)
+ //here, but the Crawler defaults to "input" if nothing is passed
+ //so this won't work
+ if (inputDir == null) {
+ throw new RuntimeException("Must specify -inputDir");
+ }
+ return parameterizeProfiler(new FileProfiler(queue, inputDir,
+ getDBWriter(tableInfos)));
+ }
+
+
+ @Override
+ protected void updateTableInfosWithPrefixes(Map<String, String> attrs) {
+ String tableNamePrefix = attrs.get(TABLE_PREFIX_KEY);
+ if (tableNamePrefix != null && !tableNamePrefix.equals("null")) {
+ for (TableInfo tableInfo : tableInfos) {
+ tableInfo.setNamePrefix(tableNamePrefix);
+ }
+ }
+ }
+
+ @Override
+ protected List<TableInfo> getRefTableInfos() {
+ return Collections.emptyList();
+ }
+
+ @Override
+ protected List<TableInfo> getNonRefTableInfos() {
+ return tableInfos;
+ }
+
+ @Override
+ protected TableInfo getMimeTable() {
+ return FileProfiler.FILE_MIME_TABLE;
+ }
+
+ @Override
+ protected void addErrorLogTablePairs(DBConsumersManager manager) {
+ Path errorLog = PropsUtil.getPath(localAttrs.get("errorLogFile"), null);
+ if (errorLog == null) {
+ return;
+ }
+ manager.addErrorLogTablePair(errorLog, ExtractProfiler.EXTRACT_EXCEPTION_TABLE);
+ }
+}
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/db/AbstractDBBuffer.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/db/AbstractDBBuffer.java
new file mode 100644
index 0000000..03e122c
--- /dev/null
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/db/AbstractDBBuffer.java
@@ -0,0 +1,77 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval.app.db;
+
+import java.sql.SQLException;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.concurrent.locks.Lock;
+import java.util.concurrent.locks.ReentrantReadWriteLock;
+
+
+/**
+ * Abstract buffer for map of values and unique ids.
+ * <p>
+ * Use this for fast in memory lookups of smallish sets of values.
+ *
+ */
+abstract class AbstractDBBuffer {
+
+ private final Map<String, Integer> m = new HashMap<>();
+ private final ReentrantReadWriteLock rwl = new ReentrantReadWriteLock();
+ private final Lock r = rwl.readLock();
+ private final Lock w = rwl.writeLock();
+
+ private int numWrites = 0;
+
+ public int getId(String key) {
+ r.lock();
+ try {
+ Integer v = m.get(key);
+ if (v != null) {
+ return v;
+ }
+ } finally {
+ r.unlock();
+ }
+
+ w.lock();
+ try {
+ Integer v = m.get(key);
+ if (v != null) {
+ return v;
+ }
+ v = m.size()+1;
+ m.put(key, v);
+ write(v, key);
+ numWrites++;
+ return v;
+ } finally {
+ w.unlock();
+ }
+ }
+
+ public int getNumWrites() {
+ return numWrites;
+ }
+
+ //Odd to throw RuntimeException, I know. It should be
+ //catastrophic if this buffer can't write to the db.
+ public abstract void write(int id, String value) throws RuntimeException;
+
+ public abstract void close() throws SQLException;
+}
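
The double-checked read/write locking in getId() above assigns ids 1..n in
first-seen order and calls write() exactly once per new key. A minimal
in-memory sketch of that contract; since AbstractDBBuffer is package-private,
the sketch declares the same package:

    package org.apache.tika.eval.app.db;

    public class InMemoryBuffer extends AbstractDBBuffer {
        @Override
        public void write(int id, String value) {
            //a real subclass would persist the (id, value) pair here
        }

        @Override
        public void close() {
        }

        public static void main(String[] args) {
            InMemoryBuffer b = new InMemoryBuffer();
            System.out.println(b.getId("application/pdf")); //1, write() called once
            System.out.println(b.getId("text/plain"));      //2, write() called once
            System.out.println(b.getId("application/pdf")); //1 again, served from the cache
            System.out.println(b.getNumWrites());           //2
        }
    }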
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/db/ColInfo.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/db/ColInfo.java
new file mode 100644
index 0000000..a251e25
--- /dev/null
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/db/ColInfo.java
@@ -0,0 +1,127 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval.app.db;
+
+
+import java.sql.Types;
+import java.util.Objects;
+
+public class ColInfo {
+ private final Cols name;
+ private final int type;
+ private final Integer precision;
+ private final String constraints;
+
+ public ColInfo(Cols name, int type) {
+ this(name, type, null, null);
+ }
+
+ public ColInfo(Cols name, int type, String constraints) {
+ this(name, type, null, constraints);
+ }
+
+ public ColInfo(Cols name, int type, Integer precision) {
+ this(name, type, precision, null);
+ }
+
+
+ public ColInfo(Cols name, int type, Integer precision, String constraints) {
+ this.name = name;
+ this.type = type;
+ this.precision = precision;
+ this.constraints = constraints;
+ }
+
+ public int getType() {
+ return type;
+ }
+
+ public Cols getName() {
+ return name;
+ }
+ /**
+ *
+ * @return constraints string or null
+ */
+ public String getConstraints() {
+ return constraints;
+ }
+
+ /**
+ * Gets the precision. This can be null!
+ * @return precision or null
+ */
+ public Integer getPrecision() {
+ return precision;
+ }
+
+ public String getSqlDef() {
+ switch (type) {
+ case Types.VARCHAR :
+ return "VARCHAR(" + precision + ")";
+ case Types.CHAR :
+ return "CHAR(" + precision + ")";
+ case Types.FLOAT :
+ return "FLOAT";
+ case Types.DOUBLE :
+ return "DOUBLE";
+ case Types.BLOB :
+ return "BLOB";
+ case Types.INTEGER :
+ return "INTEGER";
+ case Types.BIGINT :
+ return "BIGINT";
+ case Types.BOOLEAN :
+ return "BOOLEAN";
+ }
+ throw new UnsupportedOperationException("Don't yet recognize a type for: " + type);
+ }
+
+ @Override
+ public boolean equals(Object o) {
+ if (this == o) {
+ return true;
+ }
+ if (o == null || getClass() != o.getClass()) {
+ return false;
+ }
+
+ ColInfo colInfo = (ColInfo) o;
+
+ if (type != colInfo.type) {
+ return false;
+ }
+ if (name != colInfo.name) {
+ return false;
+ }
+ if (!Objects.equals(precision, colInfo.precision)) {
+ return false;
+ }
+ return Objects.equals(constraints, colInfo.constraints);
+
+ }
+
+ @Override
+ public int hashCode() {
+ int result = name != null ? name.hashCode() : 0;
+ result = 31 * result + type;
+ result = 31 * result + (precision != null ? precision.hashCode() : 0);
+ result = 31 * result + (constraints != null ? constraints.hashCode() : 0);
+ return result;
+ }
+}
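
As a quick illustration of how ColInfo renders column definitions (the
"PRIMARY KEY" constraint string is illustrative, not something this commit
mandates):

    import java.sql.Types;

    import org.apache.tika.eval.app.db.ColInfo;
    import org.apache.tika.eval.app.db.Cols;

    public class ColInfoSketch {
        public static void main(String[] args) {
            System.out.println(new ColInfo(Cols.FILE_PATH, Types.VARCHAR, 1024).getSqlDef()); //VARCHAR(1024)
            System.out.println(new ColInfo(Cols.LENGTH, Types.BIGINT).getSqlDef());           //BIGINT
            System.out.println(new ColInfo(Cols.ID, Types.INTEGER, "PRIMARY KEY").getConstraints()); //PRIMARY KEY
        }
    }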
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/db/Cols.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/db/Cols.java
new file mode 100644
index 0000000..883210f
--- /dev/null
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/db/Cols.java
@@ -0,0 +1,115 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval.app.db;
+
+public enum Cols {
+ //container table
+ CONTAINER_ID,
+ FILE_PATH,
+ EXTRACT_FILE_LENGTH,
+
+ EXTRACT_FILE_LENGTH_A, //for comparisons
+ EXTRACT_FILE_LENGTH_B,
+
+ //profile table
+ ID,
+ LENGTH,
+ FILE_NAME,
+ FILE_EXTENSION,
+ ELAPSED_TIME_MILLIS,
+ NUM_METADATA_VALUES,
+ IS_EMBEDDED,
+ EMBEDDED_FILE_PATH,
+ MIME_ID,
+ TIKA_MIME_ID,
+ FILE_MIME_ID,
+ SHA256,
+ MD5,
+ NUM_ATTACHMENTS,
+ HAS_CONTENT,
+
+ //content
+ CONTENT_LENGTH,
+ NUM_UNIQUE_TOKENS,
+ NUM_TOKENS,
+ NUM_UNIQUE_ALPHABETIC_TOKENS,
+ NUM_ALPHABETIC_TOKENS, //alphabetic or ideographic tokens
+ COMMON_TOKENS_LANG, //which language was used for the common tokens metric?
+ NUM_UNIQUE_COMMON_TOKENS,
+ NUM_COMMON_TOKENS,
+ TOP_N_TOKENS,
+ LANG_ID_1,
+ LANG_ID_PROB_1,
+ LANG_ID_2,
+ LANG_ID_PROB_2,
+ TOKEN_ENTROPY_RATE,
+ TOKEN_LENGTH_SUM,
+ TOKEN_LENGTH_MEAN,
+ TOKEN_LENGTH_STD_DEV,
+ UNICODE_CHAR_BLOCKS,
+ NUM_PAGES, //number of pages a document alleges it has
+ CONTENT_TRUNCATED_AT_MAX_LEN, // was the string truncated at AbstractProfiler.MAX_STRING_LENGTH
+
+ //content comparisons
+ TOP_10_UNIQUE_TOKEN_DIFFS_A,
+ TOP_10_UNIQUE_TOKEN_DIFFS_B,
+ TOP_10_MORE_IN_A,
+ TOP_10_MORE_IN_B,
+ OVERLAP,
+ DICE_COEFFICIENT,
+
+ //errors
+ PARSE_ERROR_ID,
+
+ PARSE_ERROR_DESCRIPTION,
+ PARSE_EXCEPTION_DESCRIPTION,
+
+ EXTRACT_EXCEPTION_ID,
+ EXTRACT_EXCEPTION_DESCRIPTION,
+
+
+ //exceptions
+ ORIG_STACK_TRACE,
+ SORT_STACK_TRACE,
+ PARSE_EXCEPTION_ID,
+
+
+ MIME_STRING,//string representation of mime type
+
+ DIR_NAME_A,//for comparisons in REF_PAIR_NAMES
+ DIR_NAME_B,
+
+ //structure tags
+ TAGS_A,
+ TAGS_B,
+ TAGS_DIV,
+ TAGS_I,
+ TAGS_IMG,
+ TAGS_LI,
+ TAGS_P,
+ TAGS_OL,
+ TAGS_TABLE,
+ TAGS_TD,
+ TAGS_TITLE,
+ TAGS_TR,
+ TAGS_UL,
+ TAGS_U,
+ TAGS_PARSE_EXCEPTION, //if there was a SAX|IO|TikaException while parsing the html or xhtml
+ ;
+
+}
+
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/db/DBBuffer.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/db/DBBuffer.java
new file mode 100644
index 0000000..2cc8cf7
--- /dev/null
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/db/DBBuffer.java
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval.app.db;
+
+
+import java.sql.Connection;
+import java.sql.PreparedStatement;
+import java.sql.SQLException;
+
+public class DBBuffer extends AbstractDBBuffer {
+
+ private final PreparedStatement st;
+
+ public DBBuffer(Connection connection, String tableName,
+ String idColumnName, String valueColumnName) throws SQLException {
+ st = connection.prepareStatement("insert into "+tableName+ "( "+
+ idColumnName + ", " + valueColumnName+") values (?,?);");
+ }
+
+ @Override
+ public void write(int id, String value) throws RuntimeException {
+ try {
+ st.clearParameters();
+ st.setInt(1, id);
+ st.setString(2, value);
+ st.execute();
+
+ } catch (SQLException e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+ @Override
+ public void close() throws SQLException {
+ st.close();
+
+ }
+
+
+}
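
A hedged usage sketch for DBBuffer, assuming an in-memory H2 database and
illustrative table/column names (the table must exist before the buffer
writes to it):

    import java.sql.Connection;
    import java.sql.DriverManager;

    import org.apache.tika.eval.app.db.DBBuffer;

    public class DBBufferSketch {
        public static void main(String[] args) throws Exception {
            Connection conn = DriverManager.getConnection("jdbc:h2:mem:sketch");
            conn.createStatement().execute(
                    "create table parse_error_types (parse_error_id integer, parse_error_description varchar(128))");
            DBBuffer buffer = new DBBuffer(conn, "parse_error_types",
                    "parse_error_id", "parse_error_description");
            int id = buffer.getId("OOM");    //1; a row is inserted
            int same = buffer.getId("OOM");  //1; cached, no second insert
            buffer.close();
        }
    }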
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/db/H2Util.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/db/H2Util.java
new file mode 100644
index 0000000..87501c6
--- /dev/null
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/db/H2Util.java
@@ -0,0 +1,89 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval.app.db;
+
+import java.nio.file.Path;
+import java.sql.Connection;
+import java.sql.DriverManager;
+import java.sql.ResultSet;
+import java.sql.SQLException;
+import java.sql.Statement;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.commons.io.FilenameUtils;
+
+
+public class H2Util extends JDBCUtil {
+
+ private final static String DRIVER_CLASS = "org.h2.Driver";
+
+ private final Path db;
+
+ public H2Util(Path db) {
+ super(getConnectionString(db, true), DRIVER_CLASS);
+ this.db = db;
+ }
+
+ public static boolean databaseExists(Path db) {
+ String connString = getConnectionString(db, false);
+ try (Connection conn = DriverManager.getConnection(connString)) {
+ } catch (SQLException e) {
+ return false;
+ }
+ return true;
+ }
+
+ @Override
+ public String getJDBCDriverClass() {
+ return DRIVER_CLASS;
+ }
+
+ @Override
+ public boolean dropTableIfExists(Connection conn, String tableName) throws SQLException {
+ String sql = "drop table if exists " + tableName;
+ try (Statement st = conn.createStatement()) {
+ return st.execute(sql);
+ }
+ }
+
+ @Override
+ public String getConnectionString() {
+ return getConnectionString(db, true);
+ }
+
+ private static String getConnectionString(Path db, boolean createDBIfItDoesntExist) {
+ String s = "jdbc:h2:"+ FilenameUtils.separatorsToUnix(db.toAbsolutePath().toString());
+ if (! createDBIfItDoesntExist) {
+ s += ";IFEXISTS=TRUE";
+ }
+ return s;
+ }
+
+ @Override
+ public Set<String> getTables(Connection connection) throws SQLException {
+ String sql = "SHOW TABLES";
+ Set<String> tables = new HashSet<>();
+ try (Statement st = connection.createStatement();
+ ResultSet rs = st.executeQuery(sql)) {
+ while (rs.next()) {
+ tables.add(rs.getString(1));
+ }
+ }
+ return tables;
+ }
+}
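
To make the IFEXISTS handling concrete: the constructor's connection string
omits IFEXISTS, so H2 will create the database on first connect, while
databaseExists() appends IFEXISTS=TRUE so a missing db fails fast instead of
being created. A small sketch, with an illustrative path:

    import java.nio.file.Paths;

    import org.apache.tika.eval.app.db.H2Util;

    public class H2UtilSketch {
        public static void main(String[] args) {
            H2Util h2 = new H2Util(Paths.get("/data/eval/profiles"));
            System.out.println(h2.getConnectionString());
            //e.g. jdbc:h2:/data/eval/profiles
            System.out.println(H2Util.databaseExists(Paths.get("/data/eval/profiles")));
            //false until something actually creates the db
        }
    }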
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/db/JDBCUtil.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/db/JDBCUtil.java
new file mode 100644
index 0000000..0211b53
--- /dev/null
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/db/JDBCUtil.java
@@ -0,0 +1,316 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.eval.app.db;
+
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.sql.Connection;
+import java.sql.DatabaseMetaData;
+import java.sql.DriverManager;
+import java.sql.PreparedStatement;
+import java.sql.ResultSet;
+import java.sql.SQLException;
+import java.sql.Statement;
+import java.sql.Types;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+import java.util.Properties;
+import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public class JDBCUtil {
+ private static final Logger LOG = LoggerFactory.getLogger(JDBCUtil.class);
+
+ public enum CREATE_TABLE {
+ DROP_IF_EXISTS,
+ SKIP_IF_EXISTS,
+ THROW_EX_IF_EXISTS,
+ }
+
+ private final String connectionString;
+ private String driverClass;
+ private Connection connection = null;
+
+ public JDBCUtil(String connectionString, String driverClass) {
+ this.connectionString = connectionString;
+ this.driverClass = driverClass;
+ if (driverClass == null || driverClass.length() == 0) {
+ //if the user has specified the driver on the command line
+ //via -Djdbc.drivers, stop now; otherwise, try to use the
+ //mappings in db.properties to determine the class
+ if (System.getProperty("jdbc.drivers") == null) {
+ try (InputStream is = JDBCUtil.class.getResourceAsStream("/db.properties")) {
+ Properties properties = new Properties();
+ properties.load(is);
+ for (String k : properties.stringPropertyNames()) {
+ Matcher m = Pattern.compile("(?i)jdbc:" + k).matcher(connectionString);
+ if (m.find()) {
+ this.driverClass = properties.getProperty(k);
+ }
+ }
+ } catch (IOException e) {
+ LOG.warn("couldn't load db.properties to determine the jdbc driver class", e);
+ }
+ }
+ }
+ }
+
+ /**
+ * Override this to apply any optimizations you want to the db
+ * before writing/reading.
+ *
+ * @return a cached connection, created on first call
+ * @throws SQLException
+ */
+ public Connection getConnection() throws SQLException {
+ if (connection != null) {
+ return connection;
+ }
+ String connectionString = getConnectionString();
+ String jdbcDriver = getJDBCDriverClass();
+ if (jdbcDriver != null) {
+ try {
+ Class.forName(getJDBCDriverClass());
+ } catch (ClassNotFoundException e) {
+ throw new RuntimeException(e);
+ }
+ }
+ connection = DriverManager.getConnection(connectionString);
+ connection.setAutoCommit(false);
+
+ return connection;
+ }
+
+ /**
+ * JDBC driver class. Override as necessary.
+ *
+ * @return the JDBC driver class name, or null if unknown
+ */
+ public String getJDBCDriverClass() {
+ return driverClass;
+ }
+
+
+ public boolean dropTableIfExists(Connection conn, String tableName) throws SQLException {
+ if (containsTable(tableName)) {
+ try (Statement st = conn.createStatement()) {
+ String sql = "drop table " + tableName;
+ return st.execute(sql);
+ }
+ }
+ return true;
+ }
+
+
+ public String getConnectionString() {
+ return connectionString;
+ }
+
+
+ public Set<String> getTables(Connection connection) throws SQLException {
+ Set<String> tables = new HashSet<>();
+
+ DatabaseMetaData dbMeta = connection.getMetaData();
+
+ try (ResultSet rs = dbMeta.getTables(null, null, "%", null)) {
+ while (rs.next()) {
+ tables.add(rs.getString(3).toLowerCase(Locale.US));
+ }
+ }
+ return tables;
+ }
+
+ /**
+ * @deprecated use {@link #batchInsert(PreparedStatement, TableInfo, Map)}
+ */
+ @Deprecated
+ public static int insert(PreparedStatement insertStatement,
+ TableInfo table,
+ Map<Cols, String> data) throws SQLException {
+
+ //clear parameters before setting
+ insertStatement.clearParameters();
+ try {
+ int i = 1;
+ for (ColInfo colInfo : table.getColInfos()) {
+ updateInsertStatement(i, insertStatement, colInfo, data.get(colInfo.getName()));
+ i++;
+ }
+ for (Cols c : data.keySet()) {
+ if (!table.containsColumn(c)) {
+ throw new IllegalArgumentException("Can't add data to " + c +
+ " because it doesn't exist in the table: " + table.getName());
+ }
+ }
+ return insertStatement.executeUpdate();
+ } catch (SQLException e) {
+ LOG.warn("couldn't insert data for this row: {}", e.getMessage());
+ return -1;
+ }
+ }
+
+ public static void batchInsert(PreparedStatement insertStatement,
+ TableInfo table,
+ Map<Cols, String> data) throws SQLException {
+
+ try {
+ int i = 1;
+ for (ColInfo colInfo : table.getColInfos()) {
+ updateInsertStatement(i, insertStatement, colInfo, data.get(colInfo.getName()));
+ i++;
+ }
+ for (Cols c : data.keySet()) {
+ if (!table.containsColumn(c)) {
+ throw new IllegalArgumentException("Can't add data to " + c +
+ " because it doesn't exist in the table: " + table.getName());
+ }
+ }
+ insertStatement.addBatch();
+ } catch (SQLException e) {
+ LOG.warn("couldn't insert data for this row: {}", e.getMessage());
+ }
+ }
+
+ public static void updateInsertStatement(int dbColOffset, PreparedStatement st,
+ ColInfo colInfo, String value) throws SQLException {
+ if (value == null) {
+ st.setNull(dbColOffset, colInfo.getType());
+ return;
+ }
+ try {
+ switch (colInfo.getType()) {
+ case Types.VARCHAR:
+ if (value.length() > colInfo.getPrecision()) {
+ value = value.substring(0, colInfo.getPrecision());
+ LOG.warn("truncated varchar value in {} : {}", colInfo.getName(), value);
+ }
+ //postgres doesn't allow the null character (u+0000)
+ value = value.replaceAll("\u0000", " ");
+ st.setString(dbColOffset, value);
+ break;
+ case Types.CHAR:
+ //postgres doesn't allow the null character (u+0000)
+ value = value.replaceAll("\u0000", " ");
+ st.setString(dbColOffset, value);
+ break;
+ case Types.DOUBLE:
+ st.setDouble(dbColOffset, Double.parseDouble(value));
+ break;
+ case Types.FLOAT:
+ st.setDouble(dbColOffset, Float.parseFloat(value));
+ break;
+ case Types.INTEGER:
+ st.setInt(dbColOffset, Integer.parseInt(value));
+ break;
+ case Types.BIGINT:
+ st.setLong(dbColOffset, Long.parseLong(value));
+ break;
+ case Types.BOOLEAN:
+ st.setBoolean(dbColOffset, Boolean.parseBoolean(value));
+ break;
+ default:
+ throw new UnsupportedOperationException("Don't yet support type: " + colInfo.getType());
+ }
+ } catch (NumberFormatException e) {
+ if (!"".equals(value)) {
+ LOG.warn("number format exception: {} : {}", colInfo.getName(), value);
+ }
+ st.setNull(dbColOffset, colInfo.getType());
+ } catch (SQLException e) {
+ LOG.warn("sqlexception: {} : {}", colInfo, value);
+ st.setNull(dbColOffset, colInfo.getType());
+ }
+ }
+
+ public void createTables(List<TableInfo> tableInfos, CREATE_TABLE createTable) throws SQLException, IOException {
+
+ Connection conn = getConnection();
+ for (TableInfo tableInfo : tableInfos) {
+
+ if (createTable.equals(CREATE_TABLE.DROP_IF_EXISTS)) {
+ dropTableIfExists(conn, tableInfo.getName());
+ } else if (createTable.equals(CREATE_TABLE.SKIP_IF_EXISTS)) {
+ if (containsTable(tableInfo.getName())) {
+ continue;
+ }
+ }
+ createTable(conn, tableInfo);
+ }
+ conn.commit();
+
+ }
+
+ public boolean containsTable(String tableName) throws SQLException {
+ Connection connection = getConnection();
+ Set<String> tables = getTables(connection);
+ return tables.contains(normalizeTableName(tableName));
+ }
+
+ /**
+ * Override for custom behavior
+ *
+ * @param tableName table name to normalize
+ * @return the normalized (lowercased) table name
+ */
+ String normalizeTableName(String tableName) {
+ return tableName.toLowerCase(Locale.US);
+ }
+
+ //does not close the connection
+ private void createTable(Connection conn, TableInfo tableInfo) throws SQLException {
+ StringBuilder createSql = new StringBuilder();
+ createSql.append("CREATE TABLE " + tableInfo.getName());
+ createSql.append("(");
+
+ int last = 0;
+ for (ColInfo col : tableInfo.getColInfos()) {
+ last++;
+ if (last > 1) {
+ createSql.append(", ");
+ }
+ createSql.append(col.getName());
+ createSql.append(" ");
+ createSql.append(col.getSqlDef());
+ String constraints = col.getConstraints();
+ if (constraints != null) {
+ createSql.append(" ");
+ createSql.append(constraints);
+ }
+ }
+ createSql.append(")");
+ try (Statement st = conn.createStatement()) {
+ st.execute(createSql.toString());
+ }
+ conn.commit();
+ }
+}
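
Pulling the pieces together, a hedged sketch of createTables() plus
batchInsert() against an in-memory H2 db (the column choices and the
PRIMARY KEY constraint are illustrative):

    import java.sql.PreparedStatement;
    import java.sql.Types;
    import java.util.Collections;
    import java.util.HashMap;
    import java.util.Map;

    import org.apache.tika.eval.app.db.ColInfo;
    import org.apache.tika.eval.app.db.Cols;
    import org.apache.tika.eval.app.db.JDBCUtil;
    import org.apache.tika.eval.app.db.TableInfo;

    public class JDBCUtilSketch {
        public static void main(String[] args) throws Exception {
            JDBCUtil jdbc = new JDBCUtil("jdbc:h2:mem:sketch", "org.h2.Driver");
            TableInfo table = new TableInfo("profiles",
                    new ColInfo(Cols.ID, Types.INTEGER, "PRIMARY KEY"),
                    new ColInfo(Cols.FILE_PATH, Types.VARCHAR, 1024));
            jdbc.createTables(Collections.singletonList(table), JDBCUtil.CREATE_TABLE.DROP_IF_EXISTS);

            PreparedStatement st = jdbc.getConnection().prepareStatement(
                    "insert into profiles (ID, FILE_PATH) values (?,?)");
            Map<Cols, String> row = new HashMap<>();
            row.put(Cols.ID, "1");
            row.put(Cols.FILE_PATH, "input/doc.pdf");
            JDBCUtil.batchInsert(st, table, row);  //adds to the batch; logs and skips bad rows
            st.executeBatch();
            jdbc.getConnection().commit();         //getConnection() turns autocommit off
        }
    }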
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/db/MimeBuffer.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/db/MimeBuffer.java
new file mode 100644
index 0000000..235f282
--- /dev/null
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/db/MimeBuffer.java
@@ -0,0 +1,147 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval.app.db;
+
+import java.sql.Connection;
+import java.sql.PreparedStatement;
+import java.sql.SQLException;
+import java.sql.Types;
+
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.mime.MimeType;
+import org.apache.tika.mime.MimeTypeException;
+import org.apache.tika.mime.MimeTypes;
+
+
+public class MimeBuffer extends AbstractDBBuffer {
+
+ private final PreparedStatement st;
+ private final TikaConfig config;
+ private final Connection connection;
+
+
+ public MimeBuffer(Connection connection, TableInfo mimeTable, TikaConfig config) throws SQLException {
+ st = connection.prepareStatement("insert into " + mimeTable.getName() + "( " +
+ Cols.MIME_ID.name() + ", " +
+ Cols.MIME_STRING.name() + ", " +
+ Cols.FILE_EXTENSION.name() + ") values (?,?,?)");
+ this.config = config;
+ this.connection = connection;
+ }
+
+ @Override
+ public void write(int id, String value) throws RuntimeException {
+ try {
+ st.clearParameters();
+ st.setInt(1, id);
+ st.setString(2, value);
+ try {
+ String ext = MimeUtil.getExtension(value, config);
+ if (ext == null || ext.length() == 0) {
+ st.setNull(3, Types.VARCHAR);
+ } else {
+ st.setString(3, ext);
+ }
+ } catch (MimeTypeException e) {
+ st.setNull(3, Types.VARCHAR);
+ }
+ st.execute();
+
+ } catch (SQLException e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+ @Override
+ public void close() throws SQLException {
+ st.close();
+ connection.commit();
+ }
+
+ private static class MimeUtil {
+ //TODO: see if MimeType now works for these
+ private static final String APPLICATION = "application";
+ private static final String TEXT = "text";
+ private static final String HTML = "html";
+ private static final String XML = "xml";
+ private static final String XHTML_XML = "xhtml+xml";
+ private static final String CSS = "css";
+ private static final String CSV = "csv";
+ private static final String PLAIN = "plain";
+ private static final String EMPTY_STRING = "";
+
+ /**
+ * Utility method to convert from a string value representing a content type
+ * (e.g. "application/pdf") into the most common extension for that file type
+ * (e.g. "pdf").
+ * <p>
+ * This has special handling for texty filetypes whose MimeTypes
+ * don't currently return anything for {@link MimeType#getExtension}.
+ *
+ * @param contentType string representing a content type, for example: "application/pdf"
+ * @param config config from which to get MimeRepository
+ * @return extension or empty string
+ * @throws MimeTypeException thrown if MimeTypes can't parse the contentType
+ */
+ public static String getExtension(String contentType, TikaConfig config)
+ throws MimeTypeException {
+ MimeTypes types = config.getMimeRepository();
+ MimeType mime = types.forName(contentType);
+ return getExtension(mime);
+ }
+
+ public static String getExtension(MimeType mime) {
+
+ String ext = mime.getExtension();
+ if (ext.startsWith(".")) {
+ ext = ext.substring(1);
+ }
+
+ //special handling for text/html/xml
+ if (ext.length() == 0) {
+ ext = tryTextyTypes(mime.getType());
+ }
+ return ext;
+ }
+
+ private static String tryTextyTypes(MediaType mediaType) {
+
+ String type = mediaType.getType();
+ String subtype = mediaType.getSubtype();
+ if (type.equals(TEXT)) {
+ if (subtype.equals(HTML)) {
+ return HTML;
+ } else if (subtype.equals(PLAIN)) {
+ return "txt";
+ } else if (subtype.equals(CSS)) {
+ return CSS;
+ } else if (subtype.equals(CSV)) {
+ return CSV;
+ }
+ } else if (type.equals(APPLICATION)) {
+ if (subtype.equals(XML)) {
+ return XML;
+ } else if (subtype.equals(XHTML_XML)) {
+ return "html";
+ }
+ }
+ return EMPTY_STRING;
+ }
+ }
+
+}
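
A hedged sketch of MimeBuffer in action against an in-memory H2 db; per the
javadoc above, "application/pdf" should map to the extension "pdf". The table
name is illustrative; the column names follow the insert statement above:

    import java.sql.Connection;
    import java.sql.DriverManager;

    import org.apache.tika.config.TikaConfig;
    import org.apache.tika.eval.app.db.MimeBuffer;
    import org.apache.tika.eval.app.db.TableInfo;

    public class MimeBufferSketch {
        public static void main(String[] args) throws Exception {
            Connection conn = DriverManager.getConnection("jdbc:h2:mem:sketch");
            conn.setAutoCommit(false);
            conn.createStatement().execute(
                    "create table mimes (mime_id integer, mime_string varchar(256), file_extension varchar(24))");
            MimeBuffer mimes = new MimeBuffer(conn, new TableInfo("mimes"),
                    TikaConfig.getDefaultConfig());
            int pdf = mimes.getId("application/pdf");   //inserts (1, "application/pdf", "pdf")
            int again = mimes.getId("application/pdf"); //1 again; cached
            mimes.close();                              //closes the statement and commits
        }
    }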
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/db/TableInfo.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/db/TableInfo.java
new file mode 100644
index 0000000..a99cea4
--- /dev/null
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/db/TableInfo.java
@@ -0,0 +1,73 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval.app.db;
+
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+public class TableInfo {
+
+ private final String name;
+ private final List<ColInfo> colInfos = new ArrayList<>();
+ private final Set<Cols> colNames = new HashSet<>();
+
+ private String prefix;
+
+ public TableInfo(String name, ColInfo... cols) {
+ Collections.addAll(colInfos, cols);
+ this.name = name;
+ for (ColInfo c : colInfos) {
+ assert (!colNames.contains(c.getName()));
+ colNames.add(c.getName());
+ }
+ }
+
+ public TableInfo(String name, List<ColInfo> cols) {
+ colInfos.addAll(cols);
+ this.name = name;
+ for (ColInfo c : colInfos) {
+ assert (!colNames.contains(c.getName()));
+ colNames.add(c.getName());
+ }
+ }
+
+ public String getName() {
+ if (prefix == null) {
+ return name;
+ }
+ return prefix+name;
+ }
+
+ public void setNamePrefix(String prefix) {
+ this.prefix = prefix;
+ }
+
+ public List<ColInfo> getColInfos() {
+ return Collections.unmodifiableList(colInfos);
+ }
+
+ public boolean containsColumn(Cols cols) {
+ return colNames.contains(cols);
+ }
+}
+
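
A quick sketch of the table-name prefixing that the *Builder classes above
rely on:

    import java.sql.Types;

    import org.apache.tika.eval.app.db.ColInfo;
    import org.apache.tika.eval.app.db.Cols;
    import org.apache.tika.eval.app.db.TableInfo;

    public class TableInfoSketch {
        public static void main(String[] args) {
            TableInfo profiles = new TableInfo("profiles",
                    new ColInfo(Cols.ID, Types.INTEGER),
                    new ColInfo(Cols.FILE_PATH, Types.VARCHAR, 1024));
            System.out.println(profiles.getName());  //profiles
            profiles.setNamePrefix("a_");            //as ExtractComparerBuilder does for tablePrefixA
            System.out.println(profiles.getName());  //a_profiles
        }
    }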
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/io/DBWriter.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/io/DBWriter.java
new file mode 100644
index 0000000..4a304c2
--- /dev/null
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/io/DBWriter.java
@@ -0,0 +1,162 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval.app.io;
+
+import java.io.IOException;
+import java.sql.Connection;
+import java.sql.PreparedStatement;
+import java.sql.SQLException;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import org.apache.commons.io.IOExceptionWithCause;
+import org.apache.tika.eval.app.db.ColInfo;
+import org.apache.tika.eval.app.db.Cols;
+import org.apache.tika.eval.app.db.JDBCUtil;
+import org.apache.tika.eval.app.db.MimeBuffer;
+import org.apache.tika.eval.app.db.TableInfo;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * This is still in its early stages. The idea is to
+ * get something working with h2 and then add to that
+ * as necessary.
+ *
+ * Beware, this deletes the db file with each initialization.
+ *
+ * Each thread must construct its own DBWriter because each
+ * DBWriter creates its own PreparedStatements at initialization.
+ */
+public class DBWriter implements IDBWriter {
+
+ private static final Logger LOG = LoggerFactory.getLogger(DBWriter.class);
+
+ private static final AtomicInteger WRITER_ID = new AtomicInteger();
+ private final long commitEveryXRows = 10000L;
+ //private final Long commitEveryXMS = 60000L;
+
+ private final Connection conn;
+ private final JDBCUtil dbUtil;
+ private final MimeBuffer mimeBuffer;
+ private final int myId = WRITER_ID.getAndIncrement();
+
+ //<tableName, preparedStatement>
+ private final Map<String, PreparedStatement> inserts = new HashMap<>();
+ private final Map<String, LastInsert> lastInsertMap = new HashMap<>();
+ public DBWriter(Connection connection, List<TableInfo> tableInfos, JDBCUtil dbUtil, MimeBuffer mimeBuffer)
+ throws IOException, SQLException {
+
+ this.conn = connection;
+ this.mimeBuffer = mimeBuffer;
+ this.dbUtil = dbUtil;
+ for (TableInfo tableInfo : tableInfos) {
+ try {
+ PreparedStatement st = createPreparedInsert(tableInfo);
+ inserts.put(tableInfo.getName(), st);
+ lastInsertMap.put(tableInfo.getName(), new LastInsert());
+ } catch (SQLException e) {
+ throw new RuntimeException(e);
+ }
+ }
+ }
+
+ public int getMimeId(String mimeString) {
+ return mimeBuffer.getId(mimeString);
+ }
+
+ private PreparedStatement createPreparedInsert(TableInfo tableInfo) throws SQLException {
+ StringBuilder sb = new StringBuilder();
+ sb.append("INSERT INTO ").append(tableInfo.getName());
+ sb.append("(");
+ int i = 0;
+ for (ColInfo c : tableInfo.getColInfos()) {
+ if (i++ > 0) {
+ sb.append(", ");
+ }
+ sb.append(c.getName());
+ }
+ sb.append(") ");
+
+ sb.append("VALUES");
+ sb.append("(");
+ for (int j = 0; j < i; j++) {
+ if (j > 0) {
+ sb.append(", ");
+ }
+ sb.append("?");
+ }
+ sb.append(")");
+
+ return conn.prepareStatement(sb.toString());
+ }
+
+
+ @Override
+ public void writeRow(TableInfo table, Map<Cols, String> data) throws IOException {
+ try {
+ PreparedStatement p = inserts.get(table.getName());
+ if (p == null) {
+ throw new RuntimeException("Failed to create prepared statement for: "+
+ table.getName());
+ }
+ dbUtil.batchInsert(p, table, data);
+ LastInsert lastInsert = lastInsertMap.get(table.getName());
+ lastInsert.rowCount++;
+ long elapsed = System.currentTimeMillis()-lastInsert.lastInsert;
+ if (
+ //elapsed > commitEveryXMS ||
+ lastInsert.rowCount % commitEveryXRows == 0) {
+ LOG.info("writer ({}) on table ({}) is committing after {} rows and {} ms", myId,
+ table.getName(),
+ lastInsert.rowCount, elapsed);
+ p.executeBatch();
+ conn.commit();
+ lastInsert.lastInsert = System.currentTimeMillis();
+ }
+ } catch (SQLException e) {
+ throw new IOException(e);
+ }
+ }
+
+ /**
+ * This closes the writer by executing batch and
+ * committing changes. This DOES NOT close the connection
+ * @throws IOException
+ */
+ public void close() throws IOException {
+ for (PreparedStatement p : inserts.values()) {
+ try {
+ p.executeBatch();
+ } catch (SQLException e) {
+ throw new IOExceptionWithCause(e);
+ }
+ }
+ try {
+ conn.commit();
+ } catch (SQLException e){
+ throw new IOExceptionWithCause(e);
+ }
+ }
+
+ private class LastInsert {
+ private long lastInsert = System.currentTimeMillis();
+ private long rowCount = 0;
+ }
+}
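
Per the class javadoc, each consumer thread constructs its own DBWriter. A
hedged sketch of the per-thread write/close cycle; the writer, table, and
value are assumed to be set up elsewhere:

    import java.io.IOException;
    import java.util.HashMap;
    import java.util.Map;

    import org.apache.tika.eval.app.db.Cols;
    import org.apache.tika.eval.app.db.TableInfo;
    import org.apache.tika.eval.app.io.DBWriter;

    public class DBWriterSketch {
        static void writeOne(DBWriter writer, TableInfo table) throws IOException {
            Map<Cols, String> row = new HashMap<>();
            row.put(Cols.FILE_PATH, "input/doc.pdf");  //illustrative value
            writer.writeRow(table, row);  //batched; the writer commits every 10,000 rows
            writer.close();               //flushes and commits; the connection stays open
        }
    }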
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/io/ExtractReader.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/io/ExtractReader.java
new file mode 100644
index 0000000..eb6be14
--- /dev/null
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/io/ExtractReader.java
@@ -0,0 +1,243 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval.app.io;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.Reader;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Locale;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream;
+import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
+import org.apache.commons.compress.compressors.z.ZCompressorInputStream;
+import org.apache.commons.io.IOUtils;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.metadata.serialization.JsonMetadataList;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler;
+import org.apache.tika.sax.RecursiveParserWrapperHandler;
+import org.apache.tika.sax.ToTextContentHandler;
+import org.apache.tika.sax.ToXMLContentHandler;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+
+public class ExtractReader {
+ private static final Logger LOG = LoggerFactory.getLogger(ExtractReader.class);
+
+ public static final long IGNORE_LENGTH = -1L;
+
+ public enum ALTER_METADATA_LIST {
+ AS_IS, //leave the metadata list as is
+ FIRST_ONLY, //take only the metadata list for the "container" document
+ CONCATENATE_CONTENT_INTO_FIRST // concatenate all of the content into the first
+ }
+
+ private TikaConfig tikaConfig = TikaConfig.getDefaultConfig();
+
+ private final ALTER_METADATA_LIST alterMetadataList;
+ private final long minExtractLength;
+ private final long maxExtractLength;
+
+ /**
+ * Reads full extract, no modification of metadata list, no min or max extract length checking
+ */
+ public ExtractReader() {
+ this(ALTER_METADATA_LIST.AS_IS, IGNORE_LENGTH, IGNORE_LENGTH);
+ }
+
+ public ExtractReader(ALTER_METADATA_LIST alterMetadataList) {
+ this(alterMetadataList, IGNORE_LENGTH, IGNORE_LENGTH);
+ }
+
+ public ExtractReader(ALTER_METADATA_LIST alterMetadataList, long minExtractLength, long maxExtractLength) {
+ this.alterMetadataList = alterMetadataList;
+ this.minExtractLength = minExtractLength;
+ this.maxExtractLength = maxExtractLength;
+ if (maxExtractLength > IGNORE_LENGTH && minExtractLength >= maxExtractLength) {
+ throw new IllegalArgumentException("minExtractLength("+minExtractLength+
+ ") must be < maxExtractLength("+maxExtractLength+")");
+ }
+ }
+ public List<Metadata> loadExtract(Path extractFile) throws ExtractReaderException {
+
+ List<Metadata> metadataList = null;
+ if (extractFile == null || !Files.isRegularFile(extractFile)) {
+ throw new ExtractReaderException(ExtractReaderException.TYPE.NO_EXTRACT_FILE);
+ }
+
+ FileSuffixes fileSuffixes = parseSuffixes(extractFile.getFileName().toString());
+ if (fileSuffixes.format == null) {
+ throw new ExtractReaderException(ExtractReaderException.TYPE.INCORRECT_EXTRACT_FILE_SUFFIX);
+ }
+
+ long length = -1L;
+ try {
+ length = Files.size(extractFile);
+ } catch (IOException e) {
+ throw new ExtractReaderException(ExtractReaderException.TYPE.IO_EXCEPTION);
+ }
+
+ if (length == 0L) {
+ throw new ExtractReaderException(ExtractReaderException.TYPE.ZERO_BYTE_EXTRACT_FILE);
+ }
+
+ if (minExtractLength > IGNORE_LENGTH && length < minExtractLength) {
+ throw new ExtractReaderException(ExtractReaderException.TYPE.EXTRACT_FILE_TOO_SHORT);
+ }
+ if (maxExtractLength > IGNORE_LENGTH && length > maxExtractLength) {
+ throw new ExtractReaderException(ExtractReaderException.TYPE.EXTRACT_FILE_TOO_LONG);
+ }
+
+ Reader reader = null;
+ InputStream is = null;
+ try {
+ is = Files.newInputStream(extractFile);
+ if (fileSuffixes.compression != null) {
+ if (fileSuffixes.compression.equals("bz2")) {
+ is = new BZip2CompressorInputStream(is);
+ } else if (fileSuffixes.compression.equals("gz")
+ || fileSuffixes.compression.equals("gzip")) {
+ is = new GzipCompressorInputStream(is);
+ } else if (fileSuffixes.compression.equals("zip")) {
+ is = new ZCompressorInputStream(is);
+ } else {
+ LOG.warn("Can't yet process compression of type: {}", fileSuffixes.compression);
+ return metadataList;
+ }
+ }
+ reader = new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8));
+ } catch (IOException e) {
+ //don't leak the partially opened stream
+ IOUtils.closeQuietly(is);
+ throw new ExtractReaderException(ExtractReaderException.TYPE.IO_EXCEPTION);
+ }
+
+ try {
+ if (fileSuffixes.format == FileSuffixes.FORMAT.JSON) {
+ metadataList = JsonMetadataList.fromJson(reader);
+ if (alterMetadataList.equals(ALTER_METADATA_LIST.FIRST_ONLY) && metadataList.size() > 1) {
+ while (metadataList.size() > 1) {
+ metadataList.remove(metadataList.size()-1);
+ }
+ } else if (alterMetadataList.equals(ALTER_METADATA_LIST.CONCATENATE_CONTENT_INTO_FIRST) &&
+ metadataList.size() > 1) {
+ StringBuilder sb = new StringBuilder();
+ Metadata containerMetadata = metadataList.get(0);
+ for (Metadata m : metadataList) {
+ String c = m.get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT);
+ if (c != null) {
+ sb.append(c);
+ sb.append(" ");
+ }
+ }
+ containerMetadata.set(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT, sb.toString());
+ while (metadataList.size() > 1) {
+ metadataList.remove(metadataList.size()-1);
+ }
+ }
+ } else {
+ metadataList = generateListFromTextFile(reader, fileSuffixes);
+ }
+ } catch (IOException e) {
+ throw new ExtractReaderException(ExtractReaderException.TYPE.IO_EXCEPTION);
+ } catch (TikaException e) {
+ throw new ExtractReaderException(ExtractReaderException.TYPE.EXTRACT_PARSE_EXCEPTION);
+ } finally {
+ IOUtils.closeQuietly(reader);
+ IOUtils.closeQuietly(is);
+ }
+ return metadataList;
+ }
+
+ private List<Metadata> generateListFromTextFile(Reader reader,
+ FileSuffixes fileSuffixes) throws IOException {
+ List<Metadata> metadataList = new ArrayList<>();
+ String content = IOUtils.toString(reader);
+ Metadata m = new Metadata();
+ m.set(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT, content);
+ if (fileSuffixes.format == FileSuffixes.FORMAT.HTML) {
+ m.set(RecursiveParserWrapperHandler.TIKA_CONTENT_HANDLER, ToXMLContentHandler.class.getSimpleName());
+ } else if (fileSuffixes.format == FileSuffixes.FORMAT.TXT) {
+ m.set(RecursiveParserWrapperHandler.TIKA_CONTENT_HANDLER, ToTextContentHandler.class.getSimpleName());
+ }
+ //Let's hope the file name has a suffix that can
+ //be used to determine the mime. Could be wrong or missing,
+ //but better than nothing.
+ m.set(TikaCoreProperties.RESOURCE_NAME_KEY, fileSuffixes.originalFileName);
+
+ MediaType mimeType = tikaConfig.getMimeRepository().detect(null, m);
+ if (mimeType != null) {
+ m.set(Metadata.CONTENT_TYPE, mimeType.toString());
+ }
+ metadataList.add(m);
+ return metadataList;
+ }
+
+ protected static FileSuffixes parseSuffixes(String fName) {
+ FileSuffixes fileSuffixes = new FileSuffixes();
+ if (fName == null) {
+ return fileSuffixes;
+ }
+ Matcher m = Pattern.compile("(?i)^(.*?)\\.(json|txt|x?html)(?:\\.(bz2|gz(?:ip)?|zip))?$").matcher(fName);
+ if (m.find()) {
+ fileSuffixes.originalFileName = m.group(1);
+ fileSuffixes.setFormat(m.group(2));
+ fileSuffixes.compression = m.group(3);
+ }
+ return fileSuffixes;
+ }
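+
+ /* Illustrative parses (the file names are hypothetical):
+ * "foo.pdf.json.gz" -> originalFileName "foo.pdf", format JSON, compression "gz"
+ * "bar.html" -> originalFileName "bar", format HTML, no compression
+ */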
+
+ private static class FileSuffixes {
+
+ enum FORMAT {
+ TXT,
+ HTML,
+ JSON
+ }
+ String compression;
+ FORMAT format;
+ String originalFileName;
+
+ public void setFormat(String fmt) {
+ String lc = fmt.toLowerCase(Locale.ENGLISH);
+ if (lc.equals("json")) {
+ format = FORMAT.JSON;
+ } else if (lc.equals("txt")) {
+ format = FORMAT.TXT;
+ } else if (lc.contains("html")) {
+ format = FORMAT.HTML;
+ } else {
+ throw new IllegalArgumentException("extract must end in .json, .txt or .xhtml");
+ }
+ }
+ }
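+
+ /* A minimal usage sketch (the extract path is hypothetical):
+ *
+ * ExtractReader extractReader = new ExtractReader(
+ * ExtractReader.ALTER_METADATA_LIST.CONCATENATE_CONTENT_INTO_FIRST);
+ * List<Metadata> metadataList =
+ * extractReader.loadExtract(Paths.get("extracts/test.pdf.json"));
+ */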
+}
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/io/ExtractReaderException.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/io/ExtractReaderException.java
new file mode 100644
index 0000000..61e1f35
--- /dev/null
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/io/ExtractReaderException.java
@@ -0,0 +1,48 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval.app.io;
+
+import java.io.IOException;
+
+/**
+ * Exception thrown when an extract cannot be read
+ */
+public class ExtractReaderException extends IOException {
+
+ public enum TYPE {
+ //what do you see when you look at the extract file
+ NO_EXTRACT_FILE,
+ ZERO_BYTE_EXTRACT_FILE,
+ IO_EXCEPTION,
+ EXTRACT_PARSE_EXCEPTION,
+ EXTRACT_FILE_TOO_SHORT,
+ EXTRACT_FILE_TOO_LONG,
+ INCORRECT_EXTRACT_FILE_SUFFIX;//extract file must have a suffix of .json, .txt, .html or .xhtml,
+ // optionally followed by .gz(ip), .zip or .bz2
+ }
+
+ private final TYPE type;
+
+ public ExtractReaderException(TYPE exceptionType) {
+ this.type = exceptionType;
+ }
+
+ public TYPE getType() {
+ return type;
+ }
+
+}
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/io/IDBWriter.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/io/IDBWriter.java
new file mode 100644
index 0000000..bb1d4ae
--- /dev/null
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/io/IDBWriter.java
@@ -0,0 +1,31 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.eval.app.io;
+
+
+import java.io.IOException;
+import java.util.Map;
+
+import org.apache.tika.eval.app.db.Cols;
+import org.apache.tika.eval.app.db.TableInfo;
+
+public interface IDBWriter {
+ void writeRow(TableInfo table, Map<Cols, String> data) throws IOException;
+ void close() throws IOException;
+ int getMimeId(String mimeString);
+}
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/io/XMLLogMsgHandler.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/io/XMLLogMsgHandler.java
new file mode 100644
index 0000000..547784b
--- /dev/null
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/io/XMLLogMsgHandler.java
@@ -0,0 +1,26 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval.app.io;
+
+import java.io.IOException;
+import java.sql.SQLException;
+
+import org.apache.log4j.Level;
+
+public interface XMLLogMsgHandler {
+ void handleMsg(Level level, String xml) throws IOException, SQLException;
+}
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/io/XMLLogReader.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/io/XMLLogReader.java
new file mode 100644
index 0000000..408ed97
--- /dev/null
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/io/XMLLogReader.java
@@ -0,0 +1,115 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval.app.io;
+
+import javax.xml.stream.XMLInputFactory;
+import javax.xml.stream.XMLStreamConstants;
+import javax.xml.stream.XMLStreamException;
+import javax.xml.stream.XMLStreamReader;
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.StandardCharsets;
+import java.sql.SQLException;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.log4j.Level;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.utils.XMLReaderUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+
+public class XMLLogReader {
+ private static final Logger LOG = LoggerFactory.getLogger(XMLLogReader.class);
+
+ public void read(InputStream xmlLogFileIs, XMLLogMsgHandler handler) throws XMLStreamException {
+ InputStream is = new LogXMLWrappingInputStream(xmlLogFileIs);
+ XMLInputFactory factory = XMLReaderUtils.getXMLInputFactory();
+ XMLStreamReader reader = factory.createXMLStreamReader(is);
+
+ Level level = null;
+ while (reader.hasNext()) {
+ reader.next();
+ switch (reader.getEventType()) {
+ case XMLStreamConstants.START_ELEMENT :
+ if ("event".equals(reader.getLocalName())) {
+ level = Level.toLevel(reader.getAttributeValue("", "level"), Level.DEBUG);
+ } else if ("message".equals(reader.getLocalName())) {
+ try {
+ handler.handleMsg(level, reader.getElementText());
+ } catch (IOException e) {
+ LOG.warn("Error parsing: {}", reader.getElementText());
+ } catch (SQLException e) {
+ LOG.warn("SQLException: {}", e.getMessage());
+ }
+ }
+ break;
+ case XMLStreamConstants.END_ELEMENT :
+ if ("event".equals(reader.getLocalName())) {
+ level = null;
+ } else if ("message".equals(reader.getLocalName())) {
+ //do we care any more?
+ }
+ break;
+ }
+ }
+ }
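+
+ /* A minimal wiring sketch; the log file name and the filtering are
+ * hypothetical. XMLLogMsgHandler has a single abstract method, so a
+ * lambda works:
+ *
+ * try (InputStream is = Files.newInputStream(Paths.get("batch-debug.xml"))) {
+ * new XMLLogReader().read(is, (level, xml) -> {
+ * if (level.isGreaterOrEqual(Level.ERROR)) {
+ * System.out.println(xml);
+ * }
+ * });
+ * }
+ */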
+
+
+
+ //wraps a logger's xml output into a single xml parseable input stream;
+ //plagiarized from log4j's chainsaw
+ private static class LogXMLWrappingInputStream extends InputStream {
+ private final static String HEADER =
+ "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>"
+ + "<log4j:eventSet version=\"1.2\" "
+ + "xmlns:log4j=\"http://jakarta.apache.org/log4j/\">";
+ private static final String FOOTER = "</log4j:eventSet>";
+
+ private InputStream[] streams;
+ int currentStreamIndex = 0;
+
+ private LogXMLWrappingInputStream(InputStream xmlLogFileIs){
+ streams = new InputStream[3];
+ streams[0] = new ByteArrayInputStream(HEADER.getBytes(StandardCharsets.UTF_8));
+ streams[1] = xmlLogFileIs;
+ streams[2] = new ByteArrayInputStream(FOOTER.getBytes(StandardCharsets.UTF_8));
+
+ }
+
+ @Override
+ public int read() throws IOException {
+ int c = streams[currentStreamIndex].read();
+ if (c < 0) {
+ IOUtils.closeQuietly(streams[currentStreamIndex]);
+ while (currentStreamIndex < streams.length-1) {
+ currentStreamIndex++;
+ int tmpC = streams[currentStreamIndex].read();
+ if (tmpC < 0) {
+ IOUtils.closeQuietly(streams[currentStreamIndex]);
+ } else {
+ return tmpC;
+ }
+ }
+ return -1;
+ }
+ return c;
+ }
+ }
+}
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/reports/Report.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/reports/Report.java
new file mode 100644
index 0000000..58d28a8
--- /dev/null
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/reports/Report.java
@@ -0,0 +1,198 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval.app.reports;
+
+import java.io.IOException;
+import java.io.OutputStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.sql.Connection;
+import java.sql.ResultSet;
+import java.sql.ResultSetMetaData;
+import java.sql.SQLException;
+import java.sql.Statement;
+import java.sql.Types;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.poi.ss.usermodel.Cell;
+import org.apache.poi.ss.usermodel.CellStyle;
+import org.apache.poi.ss.usermodel.Row;
+import org.apache.poi.ss.usermodel.VerticalAlignment;
+import org.apache.poi.xssf.streaming.SXSSFSheet;
+import org.apache.poi.xssf.streaming.SXSSFWorkbook;
+import org.apache.poi.xssf.usermodel.XSSFWorkbook;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * This class represents a single report.
+ */
+public class Report {
+ private static final Logger LOG = LoggerFactory.getLogger(Report.class);
+
+ final String NULL_VALUE = "";//TODO: make this configurable!!!
+ Map<String, XSLXCellFormatter> cellFormatters = new HashMap<>();
+ private XLSXNumFormatter defaultDoubleFormatter = new XLSXNumFormatter("0.000");
+ private XLSXNumFormatter defaultIntegerFormatter = new XLSXNumFormatter("0");
+ private CellStyle sqlCellStyle;
+
+ String sql;
+ String reportFilename;
+ boolean includeSql = true;
+
+ String reportName;
+
+ public void writeReport(Connection c, Path reportsRoot) throws SQLException, IOException {
+ LOG.info("Writing report: {} to {}", reportName, reportFilename);
+ dumpXLSX(c, reportsRoot);
+ }
+
+ private void dumpXLSX(Connection c, Path reportsRoot) throws IOException, SQLException {
+ Path out = reportsRoot.resolve(reportFilename);
+ Files.createDirectories(out.getParent());
+
+ SXSSFWorkbook wb = new SXSSFWorkbook(new XSSFWorkbook(), 100, true, true);
+ wb.setCompressTempFiles(true);
+ defaultIntegerFormatter.reset(wb.getXSSFWorkbook());
+ defaultDoubleFormatter.reset(wb.getXSSFWorkbook());
+ sqlCellStyle = wb.createCellStyle();
+ sqlCellStyle.setVerticalAlignment(VerticalAlignment.TOP);
+ sqlCellStyle.setWrapText(true);
+
+
+ try (Statement st = c.createStatement()) {
+ dumpReportToWorkbook(st, wb);
+ } finally {
+ try (OutputStream os = Files.newOutputStream(out)) {
+ wb.write(os);
+ } finally {
+ wb.dispose();
+ }
+ }
+ }
+
+ private void dumpReportToWorkbook(Statement st, SXSSFWorkbook wb) throws IOException, SQLException {
+ ResultSet rs = st.executeQuery(sql);
+
+ SXSSFSheet sheet = wb.createSheet("tika-eval Report");
+ sheet.trackColumnForAutoSizing(0);
+
+ int rowCount = 0;
+ ResultSetMetaData meta = rs.getMetaData();
+
+ Row xssfRow = sheet.createRow(rowCount++);
+ //write headers
+ for (int i = 1; i <= meta.getColumnCount(); i++) {
+ Cell cell = xssfRow.createCell(i-1);
+ cell.setCellValue(meta.getColumnLabel(i));
+ }
+
+ while (rs.next()) {
+ xssfRow = sheet.createRow(rowCount++);
+ for (int i = 1; i <= meta.getColumnCount(); i++) {
+ Cell cell = xssfRow.createCell(i-1);
+ XSLXCellFormatter formatter = cellFormatters.get(meta.getColumnLabel(i));
+ if (formatter == null) {
+ formatter = getDefaultFormatter(meta.getColumnType(i));
+ }
+ if (formatter != null) {
+ formatter.applyStyleAndValue(i, rs, cell);
+ } else {
+ writeCell(meta, i, rs, cell);
+ }
+ }
+ }
+ sheet.autoSizeColumn(0);
+
+ if (!includeSql) {
+ return;
+ }
+
+ SXSSFSheet sqlSheet = wb.createSheet("tika-eval SQL");
+ sqlSheet.setColumnWidth(0, 100*250);
+ Row sqlRow = sqlSheet.createRow(0);
+ short height = 5000;
+ sqlRow.setHeight(height);
+ Cell cell = sqlRow.createCell(0);
+ cell.setCellStyle(sqlCellStyle);
+
+ cell.setCellValue(sql.trim());
+ }
+
+ private XSLXCellFormatter getDefaultFormatter(int columnType) {
+ switch (columnType) {
+ case Types.INTEGER :
+ return defaultIntegerFormatter;
+ case Types.DOUBLE:
+ case Types.FLOAT:
+ case Types.DECIMAL:
+ return defaultDoubleFormatter;
+ default:
+ return null;
+ }
+ }
+
+ private void writeCell(ResultSetMetaData meta, int colIndex, ResultSet rs,
+ Cell cell) throws SQLException {
+
+ switch(meta.getColumnType(colIndex)) {
+ //fall through on numerics
+ case Types.BIGINT:
+ case Types.SMALLINT:
+ case Types.INTEGER:
+ case Types.DOUBLE:
+ case Types.FLOAT:
+ case Types.DECIMAL:
+ case Types.REAL:
+ case Types.NUMERIC:
+ double dbl = rs.getDouble(colIndex);
+ if (rs.wasNull()) {
+ cell.setCellValue(NULL_VALUE);
+ } else {
+ cell.setCellValue(dbl);
+ }
+ break;
+ //fall through strings
+ case Types.BOOLEAN:
+ case Types.CHAR:
+ case Types.VARCHAR:
+ case Types.LONGNVARCHAR:
+ String val = rs.getString(colIndex);
+ if (rs.wasNull()) {
+ cell.setCellValue(NULL_VALUE);
+ } else {
+ cell.setCellValue(val);
+ }
+ break;
+ default:
+ if (rs.wasNull()) {
+ cell.setCellValue(NULL_VALUE);
+ } else {
+ cell.setCellValue(rs.getString(colIndex));
+ }
+ LOG.warn("Couldn't find type for: {}. Defaulting to String", meta.getColumnType(colIndex));
+ }
+ }
+
+}
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/reports/ResultsReporter.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/reports/ResultsReporter.java
new file mode 100644
index 0000000..4f765d1
--- /dev/null
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/reports/ResultsReporter.java
@@ -0,0 +1,313 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval.app.reports;
+
+
+import javax.xml.parsers.DocumentBuilder;
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.nio.file.StandardCopyOption;
+import java.sql.Connection;
+import java.sql.DatabaseMetaData;
+import java.sql.ResultSet;
+import java.sql.SQLException;
+import java.sql.Statement;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.DefaultParser;
+import org.apache.commons.cli.HelpFormatter;
+import org.apache.commons.cli.Options;
+import org.apache.commons.cli.ParseException;
+import org.apache.poi.common.usermodel.HyperlinkType;
+import org.apache.tika.eval.app.ExtractComparer;
+import org.apache.tika.eval.app.ExtractProfiler;
+import org.apache.tika.eval.app.db.H2Util;
+import org.apache.tika.eval.app.db.JDBCUtil;
+import org.apache.tika.utils.XMLReaderUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.w3c.dom.Document;
+import org.w3c.dom.NamedNodeMap;
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
+
+public class ResultsReporter {
+ private static final Logger LOG = LoggerFactory.getLogger(ResultsReporter.class);
+
+ private static final Options OPTIONS = new Options();
+
+ static {
+ OPTIONS.addOption("rd", "reportsDir", true, "directory for the reports. " +
+ "If not specified, will write to 'reports'" +
+ "BEWARE: Will overwrite existing reports without warning!")
+ .addOption("rf", "reportsFile", true, "xml specifying sql to call for the reports." +
+ "If not specified, will use default reports in resources/tika-eval-*-config.xml")
+ .addOption("db", true, "default database (in memory H2). Specify a file name for the H2 database.")
+ .addOption("jdbc", true, "EXPERT: full jdbc connection string. Specify this or use -db <h2db_name>")
+ .addOption("jdbcdriver", true, "EXPERT: specify the jdbc driver class if all else fails")
+ .addOption("tablePrefix", true, "EXPERT: if not using the default tables, specify your table name prefix");
+
+ }
+
+ public static void USAGE() {
+ HelpFormatter helpFormatter = new HelpFormatter();
+ helpFormatter.printHelp(
+ 80,
+ "java -jar tika-eval-x.y.jar Report -db mydb [-rd myreports] [-rf myreports.xml]",
+ "Tool: Report",
+ ResultsReporter.OPTIONS,
+ "Note: for h2 db, do not include the .mv.db at the end of the db name.");
+
+ }
+
+
+ List<String> before = new ArrayList<>();
+ List<String> after = new ArrayList<>();
+ List<Report> reports = new ArrayList<>();
+
+
+ private void addBefore(String b) {
+ before.add(b);
+ }
+
+ private void addAfter(String a) {
+ after.add(a);
+ }
+
+ private void addReport(Report r) {
+ reports.add(r);
+ }
+
+ public static ResultsReporter build(Path p) throws Exception {
+
+ ResultsReporter r = new ResultsReporter();
+
+ DocumentBuilder docBuilder = XMLReaderUtils.getDocumentBuilder();
+ Document doc;
+ try (InputStream is = Files.newInputStream(p)) {
+ doc = docBuilder.parse(is);
+ }
+ Node docElement = doc.getDocumentElement();
+ assert (docElement.getNodeName().equals("reports"));
+ NodeList children = docElement.getChildNodes();
+ for (int i = 0; i < children.getLength(); i++) {
+ Node n = children.item(i);
+ if ("before".equals(n.getNodeName())) {
+ for (String before : getSql(n)) {
+ r.addBefore(before);
+ }
+ } else if ("after".equals(n.getNodeName())) {
+ for (String after : getSql(n)) {
+ r.addAfter(after);
+ }
+ } else if ("report".equals(n.getNodeName())) {
+ Report report = buildReport(n);
+ r.addReport(report);
+ }
+ }
+
+ return r;
+ }
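+
+ /* A minimal sketch of the reports xml that build() expects; the element
+ * and attribute names are taken from the parsing code below, and the sql
+ * itself is illustrative:
+ *
+ * <reports>
+ * <before><sql>drop table if exists tmp_report</sql></before>
+ * <report reportName="mimes" reportFilename="mimes.xlsx" includeSql="true">
+ * <sql>select mime_string, count(1) cnt from mimes group by mime_string</sql>
+ * </report>
+ * <after><sql>drop table if exists tmp_report</sql></after>
+ * </reports>
+ */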
+
+ private static Report buildReport(Node n) {
+ NodeList children = n.getChildNodes();
+ Report r = new Report();
+ NamedNodeMap attrs = n.getAttributes();
+
+ r.includeSql = Boolean.parseBoolean(attrs.getNamedItem("includeSql").getNodeValue());
+ r.reportFilename = attrs.getNamedItem("reportFilename").getNodeValue();
+ r.reportName = attrs.getNamedItem("reportName").getNodeValue();
+
+ for (int i = 0; i < children.getLength(); i++) {
+ Node child = children.item(i);
+ if (child.getNodeType() != Node.ELEMENT_NODE) {
+ continue;
+ }
+ if ("sql".equals(child.getNodeName())) {
+ if (r.sql != null) {
+ throw new IllegalArgumentException("Can only have one sql statement per report");
+ }
+ r.sql = child.getTextContent();
+ } else if ("colformats".equals(child.getNodeName())) {
+ r.cellFormatters = getCellFormatters(child);
+ } else {
+ throw new IllegalArgumentException("Not expecting to see:" + child.getNodeName());
+ }
+ }
+ return r;
+ }
+
+ private static Map<String, XSLXCellFormatter> getCellFormatters(Node n) {
+ NodeList children = n.getChildNodes();
+ Map<String, XSLXCellFormatter> ret = new HashMap<>();
+ for (int i = 0; i < children.getLength(); i++) {
+ Node child = children.item(i);
+ if (child.getNodeType() != Node.ELEMENT_NODE) {
+ continue;
+ }
+ NamedNodeMap attrs = child.getAttributes();
+ String columnName = attrs.getNamedItem("name").getNodeValue();
+ assert (!ret.containsKey(columnName));
+ String type = attrs.getNamedItem("type").getNodeValue();
+ if ("numberFormatter".equals(type)) {
+ String format = attrs.getNamedItem("format").getNodeValue();
+ XSLXCellFormatter f = new XLSXNumFormatter(format);
+ ret.put(columnName, f);
+ } else if ("urlLink".equals(type)) {
+ String base = "";
+ Node baseNode = attrs.getNamedItem("base");
+ if (baseNode != null) {
+ base = baseNode.getNodeValue();
+ }
+ XLSXHREFFormatter f = new XLSXHREFFormatter(base, HyperlinkType.URL);
+ ret.put(columnName, f);
+ } else if ("fileLink".equals(type)) {
+ String base = "";
+ Node baseNode = attrs.getNamedItem("base");
+ if (baseNode != null) {
+ base = baseNode.getNodeValue();
+ }
+ XLSXHREFFormatter f = new XLSXHREFFormatter(base, HyperlinkType.FILE);
+ ret.put(columnName, f);
+ }
+ }
+ return ret;
+ }
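+
+ /* A hypothetical colformats snippet; the parser reads the name, type,
+ * format and base attributes and does not check the child element's name:
+ *
+ * <colformats>
+ * <colformat name="overlap" type="numberFormatter" format="0.00"/>
+ * <colformat name="file_path" type="fileLink" base="extracts/"/>
+ * </colformats>
+ */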
+
+ private static List<String> getSql(Node n) {
+ List<String> ret = new ArrayList<>();
+
+ NodeList children = n.getChildNodes();
+
+ for (int i = 0; i < children.getLength(); i++) {
+ Node child = children.item(i);
+ if (child.getNodeType() != Node.ELEMENT_NODE) {
+ continue;
+ }
+ ret.add(child.getTextContent());
+ }
+ return ret;
+ }
+
+ public static void main(String[] args) throws Exception {
+
+ DefaultParser defaultCLIParser = new DefaultParser();
+ CommandLine commandLine = null;
+ try {
+ commandLine = defaultCLIParser.parse(OPTIONS, args);
+ } catch (ParseException e) {
+ System.out.println(e.getMessage());
+ USAGE();
+ return;
+ }
+ JDBCUtil dbUtil = null;
+ if (commandLine.hasOption("db")) {
+ String dbString = commandLine.getOptionValue("db");
+ if (dbString.endsWith(".mv.db")) {
+ dbString = dbString.substring(0, dbString.length()-6);
+ LOG.debug("trimming .mv.db from db name");
+ }
+ Path db = Paths.get(dbString);
+ if (!H2Util.databaseExists(db)) {
+ throw new RuntimeException("I'm sorry, but I couldn't find this h2 database: " + db);
+ }
+ dbUtil = new H2Util(db);
+ } else if (commandLine.hasOption("jdbc")) {
+ String driverClass = null;
+ if (commandLine.hasOption("jdbcdriver")) {
+ driverClass = commandLine.getOptionValue("jdbcdriver");
+ }
+ dbUtil = new JDBCUtil(commandLine.getOptionValue("jdbc"), driverClass);
+ } else {
+ System.err.println("Must specify either -db for the default in-memory h2 database\n" +
+ "or -jdbc for a full jdbc connection string");
+ USAGE();
+ return;
+ }
+ try (Connection c = dbUtil.getConnection()) {
+ Path tmpReportsFile = null;
+ try {
+ ResultsReporter resultsReporter = null;
+ String reportsFile = commandLine.getOptionValue("rf");
+ if (reportsFile == null) {
+ tmpReportsFile = getDefaultReportsConfig(c);
+ resultsReporter = ResultsReporter.build(tmpReportsFile);
+ } else {
+ resultsReporter = ResultsReporter.build(Paths.get(reportsFile));
+ }
+
+ Path reportsRootDirectory = Paths.get(commandLine.getOptionValue("rd", "reports"));
+ if (Files.isDirectory(reportsRootDirectory)) {
+ LOG.warn("'Reports' directory exists. Will overwrite existing reports.");
+ }
+
+ resultsReporter.execute(c, reportsRootDirectory);
+ } finally {
+ if (tmpReportsFile != null) {
+ Files.delete(tmpReportsFile);
+ }
+ }
+ }
+ }
+
+ private static Path getDefaultReportsConfig(Connection c) throws IOException, SQLException {
+ DatabaseMetaData md = c.getMetaData();
+ String internalPath = null;
+ try (ResultSet rs = md.getTables(null, null, "%", null)) {
+ while (rs.next()) {
+ String tName = rs.getString(3);
+ if (ExtractComparer.CONTENTS_TABLE_B.getName().equalsIgnoreCase(tName)) {
+ internalPath = "/comparison-reports.xml";
+ break;
+ } else if (ExtractProfiler.PROFILE_TABLE.getName().equalsIgnoreCase(tName)) {
+ internalPath = "/profile-reports.xml";
+ break;
+ }
+ }
+ }
+
+ if (internalPath == null) {
+ throw new RuntimeException("Couldn't determine if this database was a 'profiler' or 'comparison' db");
+ }
+ Path tmp = Files.createTempFile("tmp-tika-reports", ".xml");
+ try (InputStream is = ResultsReporter.class.getResourceAsStream(internalPath)) {
+ Files.copy(is, tmp, StandardCopyOption.REPLACE_EXISTING);
+ }
+ return tmp;
+ }
+
+ public void execute(Connection c, Path reportsDirectory) throws IOException, SQLException {
+ try (Statement st = c.createStatement()) {
+ for (String sql : before) {
+ LOG.info("processing before: {}", sql);
+ st.execute(sql);
+ }
+ for (Report r : reports) {
+ r.writeReport(c, reportsDirectory);
+ }
+ for (String sql : after) {
+ LOG.info("processing after: {}", sql);
+ st.execute(sql);
+ }
+ }
+ }
+}
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/reports/XLSXHREFFormatter.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/reports/XLSXHREFFormatter.java
new file mode 100644
index 0000000..dd4f52a
--- /dev/null
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/reports/XLSXHREFFormatter.java
@@ -0,0 +1,78 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval.app.reports;
+
+import java.nio.file.Paths;
+import java.sql.ResultSet;
+import java.sql.SQLException;
+
+import org.apache.poi.common.usermodel.HyperlinkType;
+import org.apache.poi.ss.usermodel.Cell;
+import org.apache.poi.ss.usermodel.CellStyle;
+import org.apache.poi.ss.usermodel.Font;
+import org.apache.poi.ss.usermodel.Hyperlink;
+import org.apache.poi.ss.usermodel.IndexedColors;
+import org.apache.poi.xssf.usermodel.XSSFWorkbook;
+
+
+public class XLSXHREFFormatter implements XSLXCellFormatter {
+ //xlsx files can only have this many hyperlinks
+ //if they have more Excel can't read the file
+ private static final int MAX_HYPERLINKS = 65000;
+
+
+ private final String urlBase;
+ private final HyperlinkType linkType;
+ private XSSFWorkbook workbook;
+ private CellStyle style;
+ private int links = 0;
+
+ public XLSXHREFFormatter(String urlBase,
+ HyperlinkType hyperlinkType) {
+ this.urlBase = urlBase;
+ this.linkType = hyperlinkType;
+ }
+
+ @Override
+ public void reset(XSSFWorkbook workbook) {
+ this.workbook = workbook;
+ style = workbook.createCellStyle();
+ Font hlinkFont = workbook.createFont();
+ hlinkFont.setUnderline(Font.U_SINGLE);
+ hlinkFont.setColor(IndexedColors.BLUE.getIndex());
+ style.setFont(hlinkFont);
+ links = 0;
+
+ }
+
+ @Override
+ public void applyStyleAndValue(int dbColNum, ResultSet resultSet, Cell cell) throws SQLException {
+ if (links < MAX_HYPERLINKS) {
+ Hyperlink hyperlink = workbook.getCreationHelper().createHyperlink(linkType);
+ String path = resultSet.getString(dbColNum);
+ String address = urlBase+path;
+ hyperlink.setAddress(address);
+ cell.setHyperlink(hyperlink);
+ cell.setCellStyle(style);
+ String fName = Paths.get(path).getFileName().toString();
+ cell.setCellValue(fName);
+ links++;
+ } else {
+ //silently stop adding hyperlinks
+ }
+ }
+}
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/reports/XLSXNumFormatter.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/reports/XLSXNumFormatter.java
new file mode 100644
index 0000000..09c9712
--- /dev/null
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/reports/XLSXNumFormatter.java
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.eval.app.reports;
+
+import java.sql.ResultSet;
+import java.sql.SQLException;
+
+import org.apache.poi.ss.usermodel.Cell;
+import org.apache.poi.ss.usermodel.CellStyle;
+import org.apache.poi.xssf.usermodel.XSSFWorkbook;
+
+class XLSXNumFormatter implements XSLXCellFormatter {
+
+ private final String formatString;
+ private CellStyle style;
+
+ XLSXNumFormatter(String formatString) {
+ this.formatString = formatString;
+ }
+
+
+ @Override
+ public void reset(XSSFWorkbook workbook) {
+ style = workbook.createCellStyle();
+ style.setDataFormat(workbook.getCreationHelper()
+ .createDataFormat().getFormat(formatString));
+ }
+
+ @Override
+ public void applyStyleAndValue(int dbColNum, ResultSet resultSet, Cell cell) throws SQLException {
+ double d = resultSet.getDouble(dbColNum);
+ //leave the cell blank for SQL NULL rather than writing 0.0
+ if (!resultSet.wasNull()) {
+ cell.setCellStyle(style);
+ cell.setCellValue(d);
+ }
+ }
+}
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/reports/XSLXCellFormatter.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/reports/XSLXCellFormatter.java
new file mode 100644
index 0000000..e5d1d6c
--- /dev/null
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/reports/XSLXCellFormatter.java
@@ -0,0 +1,30 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval.app.reports;
+
+
+import java.sql.ResultSet;
+import java.sql.SQLException;
+
+import org.apache.poi.ss.usermodel.Cell;
+import org.apache.poi.xssf.usermodel.XSSFWorkbook;
+
+interface XSLXCellFormatter {
+
+ void reset(XSSFWorkbook workbook);
+ void applyStyleAndValue(int dbColNum, ResultSet resultSet, Cell cell) throws SQLException;
+}
\ No newline at end of file
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/tools/BatchTopCommonTokenCounter.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/tools/BatchTopCommonTokenCounter.java
new file mode 100644
index 0000000..db792cb
--- /dev/null
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/tools/BatchTopCommonTokenCounter.java
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval.app.tools;
+
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.tika.utils.ProcessUtils;
+
+/**
+ * Utility class that runs TopCommonTokenCounter against a directory
+ * of table files (named {lang}_table.gz or leipzig-like afr_...-sentences.txt)
+ * and outputs a common-tokens file for each language in the output directory.
+ */
+public class BatchTopCommonTokenCounter {
+
+ public static void main(String[] args) throws Exception {
+
+ Path commonTokensDir = Paths.get(args[0]);
+ Path tableFileDir = Paths.get(args[1]);
+ Map<String, List<Path>> langFiles = LeipzigHelper.getFiles(tableFileDir);
+
+ for (Map.Entry<String, List<Path>> e : langFiles.entrySet()) {
+
+ String[] cmd = new String[e.getValue().size()+1];
+ Path commonTokensFile = commonTokensDir.resolve(e.getKey());
+ cmd[0] = ProcessUtils.escapeCommandLine(commonTokensFile.toAbsolutePath().toString());
+ for (int i = 0; i < e.getValue().size(); i++) {
+ cmd[i+1] =
+ ProcessUtils.escapeCommandLine(
+ e.getValue().get(i).toAbsolutePath().toString());
+ }
+ TopCommonTokenCounter.main(cmd);
+ }
+ }
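+
+ /* Hypothetical invocation (the directory names are illustrative):
+ *
+ * java ... BatchTopCommonTokenCounter /data/common_tokens /data/leipzig
+ *
+ * For each language found in /data/leipzig, a common-tokens file named
+ * for that language is written to /data/common_tokens.
+ */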
+}
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/tools/CommonTokenOverlapCounter.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/tools/CommonTokenOverlapCounter.java
new file mode 100644
index 0000000..0221297
--- /dev/null
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/tools/CommonTokenOverlapCounter.java
@@ -0,0 +1,69 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval.app.tools;
+
+import java.io.File;
+import java.io.IOException;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Locale;
+import java.util.Set;
+
+import org.apache.tika.eval.core.tokens.CommonTokenCountManager;
+
+public class CommonTokenOverlapCounter {
+
+ public static void main(String[] args) throws Exception {
+ Path commonTokensDir = Paths.get(args[0]);
+ CommonTokenOverlapCounter counter = new CommonTokenOverlapCounter();
+ counter.execute(commonTokensDir);
+ }
+
+ private void execute(Path commonTokensDir) throws IOException {
+ List<String> langs = new ArrayList<>();
+ for (File f : commonTokensDir.toFile().listFiles()) {
+ langs.add(f.getName());
+ }
+ CommonTokenCountManager mgr = new CommonTokenCountManager(commonTokensDir, "");
+ for (int i = 0; i < langs.size()-1; i++) {
+ for (int j = i+1; j < langs.size(); j++) {
+ compare(langs.get(i), langs.get(j), mgr);
+ }
+ }
+ }
+
+ private void compare(String langA, String langB, CommonTokenCountManager mgr) {
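+ //the score below is the Dice coefficient: 2*|A∩B| / (|A| + |B|);
+ //e.g. |A|=100, |B|=60 with 40 shared tokens -> 80/160 = 0.50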
+ int overlap = 0;
+ int denom = 0;
+ Set<String> setA = mgr.getTokens(langA);
+ Set<String> setB = mgr.getTokens(langB);
+ for (String a : setA) {
+ if (setB.contains(a)) {
+ overlap += 2;
+ }
+ }
+ denom = setA.size()+setB.size();
+ double percent = (double)overlap/(double)denom;
+ if (percent > 0.01) {
+ System.out.println(String.format(Locale.US, "%s %s %.2f", langA, langB, percent));
+ }
+ }
+
+
+}
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/tools/LeipzigHelper.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/tools/LeipzigHelper.java
new file mode 100644
index 0000000..bec0a05
--- /dev/null
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/tools/LeipzigHelper.java
@@ -0,0 +1,57 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval.app.tools;
+
+import java.io.File;
+import java.io.IOException;
+import java.nio.file.Path;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+import java.util.TreeMap;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+public class LeipzigHelper {
+
+ static Map<String, List<Path>> getFiles(Path leipzigDir) throws IOException {
+ Matcher tableMatcher = Pattern.compile("([a-z]+)_table(\\.txt)?(\\.gz)?$").matcher("");
+ Matcher leipzigMatcher = Pattern.compile("([a-z]{3})[-_].*$").matcher("");
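+ //e.g. (hypothetical names) "eng_table.txt.gz" -> "eng" via the table
+ //pattern; "afr_news_2010-sentences.txt" -> "afr" via the leipzig pattern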
+
+ Map<String, List<Path>> m = new TreeMap<>();
+ for (File f : leipzigDir.toFile().listFiles()) {
+ System.err.println(f);
+ String lang = null;
+ if (tableMatcher.reset(f.getName()).find()) {
+ lang = tableMatcher.group(1);
+ } else if (leipzigMatcher.reset(f.getName()).find()) {
+ lang = leipzigMatcher.group(1);
+ }
+ if (lang == null) {
+ System.err.println("couldn't find a lang: " + f);
+ continue;
+ }
+ m.computeIfAbsent(lang, k -> new ArrayList<>()).add(f.toPath());
+ }
+ return m;
+ }
+}
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/tools/LeipzigSampler.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/tools/LeipzigSampler.java
new file mode 100644
index 0000000..b49afe7
--- /dev/null
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/tools/LeipzigSampler.java
@@ -0,0 +1,76 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval.app.tools;
+
+import java.io.BufferedReader;
+import java.io.BufferedWriter;
+import java.io.IOException;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+
+public class LeipzigSampler {
+
+ public static void main(String[] args) throws Exception {
+ int sentsPerLanguage = 10;
+ Path leipzigDir = Paths.get(args[0]);
+ Path sampleFile = Paths.get(args[1]);
+ LeipzigSampler leipzigSampler = new LeipzigSampler();
+ try (BufferedWriter writer = Files.newBufferedWriter(sampleFile, StandardCharsets.UTF_8)) {
+ leipzigSampler.execute(leipzigDir, sentsPerLanguage, writer);
+ }
+ }
+
+ private void execute(Path leipzigDir, int sentsPerLang, BufferedWriter writer) throws IOException {
+ Map<String, List<Path>> fileMap = LeipzigHelper.getFiles(leipzigDir);
+ for (Map.Entry<String, List<Path>> e : fileMap.entrySet()) {
+ List<String> sentences = new ArrayList<>();
+ for (Path p : e.getValue()) {
+ addSentences(p, sentences);
+ }
+ Collections.shuffle(sentences);
+ String lang = e.getKey();
+ for (int i = 0; i < sentsPerLang && i < sentences.size(); i++) {
+ writer.write(row(lang, sentences.get(i)));
+ }
+ }
+ }
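+
+ /* Output is one sample per line: lang + "\t" + sentence, e.g.
+ * "eng\tAn illustrative sentence." (the sentence text is hypothetical).
+ */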
+
+ private void addSentences(Path p, List<String> sentences) throws IOException {
+ try (BufferedReader reader = Files.newBufferedReader(p, StandardCharsets.UTF_8)) {
+ String line = reader.readLine();
+ while (line != null) {
+ int tab = line.indexOf("\t");
+ if (tab > -1) {
+ line = line.substring(tab + 1);
+ }
+ sentences.add(line);
+ line = reader.readLine();
+ }
+ }
+ }
+
+ private String row(String lang, String s) {
+ s = s.replaceAll("\\s+", " ");
+ return lang+"\t"+s+"\n";
+ }
+}
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/tools/SlowCompositeReaderWrapper.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/tools/SlowCompositeReaderWrapper.java
new file mode 100644
index 0000000..181f329
--- /dev/null
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/tools/SlowCompositeReaderWrapper.java
@@ -0,0 +1,327 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval.app.tools;
+
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.concurrent.ConcurrentHashMap;
+
+
+import org.apache.lucene.index.BinaryDocValues;
+import org.apache.lucene.index.CompositeReader;
+import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.index.DocValues;
+import org.apache.lucene.index.DocValuesType;
+import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.FieldInfos;
+import org.apache.lucene.index.Fields;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.LeafMetaData;
+import org.apache.lucene.index.LeafReader;
+import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.index.MultiBits;
+import org.apache.lucene.index.MultiDocValues;
+import org.apache.lucene.index.MultiDocValues.MultiSortedDocValues;
+import org.apache.lucene.index.MultiReader;
+import org.apache.lucene.index.MultiTerms;
+import org.apache.lucene.index.NumericDocValues;
+import org.apache.lucene.index.OrdinalMap;
+import org.apache.lucene.index.PointValues;
+import org.apache.lucene.index.SortedDocValues;
+import org.apache.lucene.index.SortedNumericDocValues;
+import org.apache.lucene.index.SortedSetDocValues;
+import org.apache.lucene.index.StoredFieldVisitor;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.util.Bits;
+import org.apache.lucene.util.Version;
+
+/**
+ * <b>COPIED VERBATIM FROM LUCENE</b>
+ * This class forces a composite reader (eg a {@link
+ * MultiReader} or {@link DirectoryReader}) to emulate a
+ * {@link LeafReader}. This requires implementing the postings
+ * APIs on-the-fly, using the static methods in {@link
+ * MultiTerms}, {@link MultiDocValues}, by stepping through
+ * the sub-readers to merge fields/terms, appending docs, etc.
+ *
+ * <p><b>NOTE</b>: this class almost always results in a
+ * performance hit. If this is important to your use case,
+ * you'll get better performance by gathering the sub readers using
+ * {@link IndexReader#getContext()} to get the
+ * leaves and then operate per-LeafReader,
+ * instead of using this class.
+ */
+
+public final class SlowCompositeReaderWrapper extends LeafReader {
+
+ private final CompositeReader in;
+ private final LeafMetaData metaData;
+
+ // Cached copy of FieldInfos to prevent it from being re-created on each
+ // getFieldInfos call. Most (if not all) other LeafReader implementations
+ // also have a cached FieldInfos instance so this is consistent. SOLR-12878
+ private final FieldInfos fieldInfos;
+
+ final Map<String, Terms> cachedTerms = new ConcurrentHashMap<>();
+
+ // TODO: consider ConcurrentHashMap ?
+ // TODO: this could really be a weak map somewhere else on the coreCacheKey,
+ // but do we really need to optimize slow-wrapper any more?
+ final Map<String, OrdinalMap> cachedOrdMaps = new HashMap<>();
+
+ /** This method is sugar for getting an {@link LeafReader} from
+ * an {@link IndexReader} of any kind. If the reader is already atomic,
+ * it is returned unchanged, otherwise wrapped by this class.
+ */
+ public static LeafReader wrap(IndexReader reader) throws IOException {
+ if (reader instanceof CompositeReader) {
+ return new SlowCompositeReaderWrapper((CompositeReader) reader);
+ } else {
+ assert reader instanceof LeafReader;
+ return (LeafReader) reader;
+ }
+ }
+
+ SlowCompositeReaderWrapper(CompositeReader reader) throws IOException {
+ in = reader;
+ in.registerParentReader(this);
+ if (reader.leaves().isEmpty()) {
+ metaData = new LeafMetaData(Version.LATEST.major, Version.LATEST, null);
+ } else {
+ Version minVersion = Version.LATEST;
+ for (LeafReaderContext leafReaderContext : reader.leaves()) {
+ Version leafVersion = leafReaderContext.reader().getMetaData().getMinVersion();
+ if (leafVersion == null) {
+ minVersion = null;
+ break;
+ } else if (minVersion.onOrAfter(leafVersion)) {
+ minVersion = leafVersion;
+ }
+ }
+ metaData = new LeafMetaData(reader.leaves().get(0).reader().getMetaData().getCreatedVersionMajor(), minVersion, null);
+ }
+ fieldInfos = FieldInfos.getMergedFieldInfos(in);
+ }
+
+ @Override
+ public String toString() {
+ return "SlowCompositeReaderWrapper(" + in + ")";
+ }
+
+ @Override
+ public CacheHelper getReaderCacheHelper() {
+ return in.getReaderCacheHelper();
+ }
+
+ @Override
+ public CacheHelper getCoreCacheHelper() {
+ // TODO: this is trappy as the expectation is that core keys live for a long
+ // time, but here we need to bound it to the lifetime of the wrapped
+ // composite reader? Unfortunately some features seem to rely on this...
+ return in.getReaderCacheHelper();
+ }
+
+ @Override
+ public Terms terms(String field) throws IOException {
+ ensureOpen();
+ try {
+ return cachedTerms.computeIfAbsent(field, f -> {
+ try {
+ return MultiTerms.getTerms(in, f);
+ } catch (IOException e) { // yuck! ...sigh... checked exceptions with built-in lambdas are a pain
+ throw new RuntimeException("unwrapMe", e);
+ }
+ });
+ } catch (RuntimeException e) {
+ if (e.getMessage().equals("unwrapMe") && e.getCause() instanceof IOException) {
+ throw (IOException) e.getCause();
+ }
+ throw e;
+ }
+ }
+
+ @Override
+ public NumericDocValues getNumericDocValues(String field) throws IOException {
+ ensureOpen();
+ return MultiDocValues.getNumericValues(in, field); // TODO cache?
+ }
+
+ @Override
+ public BinaryDocValues getBinaryDocValues(String field) throws IOException {
+ ensureOpen();
+ return MultiDocValues.getBinaryValues(in, field); // TODO cache?
+ }
+
+ @Override
+ public SortedNumericDocValues getSortedNumericDocValues(String field) throws IOException {
+ ensureOpen();
+ return MultiDocValues.getSortedNumericValues(in, field); // TODO cache?
+ }
+
+ @Override
+ public SortedDocValues getSortedDocValues(String field) throws IOException {
+ ensureOpen();
+ OrdinalMap map = null;
+ synchronized (cachedOrdMaps) {
+ map = cachedOrdMaps.get(field);
+ if (map == null) {
+ // uncached, or not a multi dv
+ SortedDocValues dv = MultiDocValues.getSortedValues(in, field);
+ if (dv instanceof MultiSortedDocValues) {
+ map = ((MultiSortedDocValues)dv).mapping;
+ CacheHelper cacheHelper = getReaderCacheHelper();
+ if (cacheHelper != null && map.owner == cacheHelper.getKey()) {
+ cachedOrdMaps.put(field, map);
+ }
+ }
+ return dv;
+ }
+ }
+ int size = in.leaves().size();
+ final SortedDocValues[] values = new SortedDocValues[size];
+ final int[] starts = new int[size+1];
+ long totalCost = 0;
+ for (int i = 0; i < size; i++) {
+ LeafReaderContext context = in.leaves().get(i);
+ final LeafReader reader = context.reader();
+ final FieldInfo fieldInfo = reader.getFieldInfos().fieldInfo(field);
+ if (fieldInfo != null && fieldInfo.getDocValuesType() != DocValuesType.SORTED) {
+ return null;
+ }
+ SortedDocValues v = reader.getSortedDocValues(field);
+ if (v == null) {
+ v = DocValues.emptySorted();
+ }
+ totalCost += v.cost();
+ values[i] = v;
+ starts[i] = context.docBase;
+ }
+ starts[size] = maxDoc();
+ return new MultiSortedDocValues(values, starts, map, totalCost);
+ }
+
+ @Override
+ public SortedSetDocValues getSortedSetDocValues(String field) throws IOException {
+ ensureOpen();
+ OrdinalMap map = null;
+ synchronized (cachedOrdMaps) {
+ map = cachedOrdMaps.get(field);
+ if (map == null) {
+ // uncached, or not a multi dv
+ SortedSetDocValues dv = MultiDocValues.getSortedSetValues(in, field);
+ if (dv instanceof MultiDocValues.MultiSortedSetDocValues) {
+ map = ((MultiDocValues.MultiSortedSetDocValues)dv).mapping;
+ CacheHelper cacheHelper = getReaderCacheHelper();
+ if (cacheHelper != null && map.owner == cacheHelper.getKey()) {
+ cachedOrdMaps.put(field, map);
+ }
+ }
+ return dv;
+ }
+ }
+
+ assert map != null;
+ int size = in.leaves().size();
+ final SortedSetDocValues[] values = new SortedSetDocValues[size];
+ final int[] starts = new int[size+1];
+ long cost = 0;
+ for (int i = 0; i < size; i++) {
+ LeafReaderContext context = in.leaves().get(i);
+ final LeafReader reader = context.reader();
+ final FieldInfo fieldInfo = reader.getFieldInfos().fieldInfo(field);
+ if(fieldInfo != null && fieldInfo.getDocValuesType() != DocValuesType.SORTED_SET){
+ return null;
+ }
+ SortedSetDocValues v = reader.getSortedSetDocValues(field);
+ if (v == null) {
+ v = DocValues.emptySortedSet();
+ }
+ values[i] = v;
+ starts[i] = context.docBase;
+ cost += v.cost();
+ }
+ starts[size] = maxDoc();
+ return new MultiDocValues.MultiSortedSetDocValues(values, starts, map, cost);
+ }
+
+ @Override
+ public NumericDocValues getNormValues(String field) throws IOException {
+ ensureOpen();
+ return MultiDocValues.getNormValues(in, field); // TODO cache?
+ }
+
+ @Override
+ public Fields getTermVectors(int docID) throws IOException {
+ ensureOpen();
+ return in.getTermVectors(docID);
+ }
+
+ @Override
+ public int numDocs() {
+ // Don't call ensureOpen() here (it could affect performance)
+ return in.numDocs();
+ }
+
+ @Override
+ public int maxDoc() {
+ // Don't call ensureOpen() here (it could affect performance)
+ return in.maxDoc();
+ }
+
+ @Override
+ public void document(int docID, StoredFieldVisitor visitor) throws IOException {
+ ensureOpen();
+ in.document(docID, visitor);
+ }
+
+ @Override
+ public Bits getLiveDocs() {
+ ensureOpen();
+ return MultiBits.getLiveDocs(in); // TODO cache?
+ }
+
+ @Override
+ public PointValues getPointValues(String field) {
+ ensureOpen();
+ return null; // because not supported. Throw UOE?
+ }
+
+ @Override
+ public FieldInfos getFieldInfos() {
+ return fieldInfos;
+ }
+
+ @Override
+ protected void doClose() throws IOException {
+ // TODO: as this is a wrapper, should we really close the delegate?
+ in.close();
+ }
+
+ @Override
+ public void checkIntegrity() throws IOException {
+ ensureOpen();
+ for (LeafReaderContext ctx : in.leaves()) {
+ ctx.reader().checkIntegrity();
+ }
+ }
+
+ @Override
+ public LeafMetaData getMetaData() {
+ return metaData;
+ }
+}
\ No newline at end of file
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/tools/TopCommonTokenCounter.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/tools/TopCommonTokenCounter.java
new file mode 100644
index 0000000..e632346
--- /dev/null
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/tools/TopCommonTokenCounter.java
@@ -0,0 +1,401 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval.app.tools;
+
+import java.io.BufferedReader;
+import java.io.BufferedWriter;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Objects;
+import java.util.Set;
+
+import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
+import org.apache.commons.io.FileUtils;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.TextField;
+import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.index.LeafReader;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.FSDirectory;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.PriorityQueue;
+import org.apache.tika.eval.core.tokens.AnalyzerManager;
+import org.apache.tika.eval.core.tokens.URLEmailNormalizingFilterFactory;
+import org.apache.tika.utils.ProcessUtils;
+
+/**
+ * Utility class that reads in UTF-8 input file(s) with one document per row
+ * and outputs the TOP_N (currently 30,000) tokens with the highest document
+ * frequencies.
+ *
+ * The CommonTokensAnalyzer intentionally drops tokens shorter than 4 characters,
+ * but includes bigrams for CJK.
+ *
+ * It also has an include list for __email__ and __url__ and a skip list
+ * for common html markup terms.
+ */
+public class TopCommonTokenCounter {
+
+ private static final String LICENSE =
+ "# Licensed to the Apache Software Foundation (ASF) under one or more\n" +
+ "# contributor license agreements. See the NOTICE file distributed with\n" +
+ "# this work for additional information regarding copyright ownership.\n" +
+ "# The ASF licenses this file to You under the Apache License, Version 2.0\n" +
+ "# (the \"License\"); you may not use this file except in compliance with\n" +
+ "# the License. You may obtain a copy of the License at\n" +
+ "#\n" +
+ "# http://www.apache.org/licenses/LICENSE-2.0\n" +
+ "#\n" +
+ "# Unless required by applicable law or agreed to in writing, software\n" +
+ "# distributed under the License is distributed on an \"AS IS\" BASIS,\n" +
+ "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n" +
+ "# See the License for the specific language governing permissions and\n" +
+ "# limitations under the License.\n"+
+ "#\n";
+
+ private static final String FIELD = "f";
+ private static final int TOP_N = 30000;
+ private static final int MIN_DOC_FREQ = 10;
+ //these should exist in every list
+ static final Set<String> INCLUDE_LIST = new HashSet<>(Arrays.asList(
+ URLEmailNormalizingFilterFactory.URL,
+ URLEmailNormalizingFilterFactory.EMAIL
+ ));
+
+ //words to ignore
+ //these are common html markup words that we do
+ //not want to count in case of failed markup processing.
+ //see: https://issues.apache.org/jira/browse/TIKA-2267?focusedCommentId=15872055&page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel#comment-15872055
+ static final Set<String> SKIP_LIST = new HashSet<>(Arrays.asList(
+ "span",
+ "table",
+ "href",
+ "head",
+ "title",
+ "body",
+ "html",
+ "tagname",
+ "lang",
+ "style",
+ "script",
+ "strong",
+ "blockquote",
+ "form",
+ "iframe",
+ "section",
+ "colspan",
+ "rowspan"
+ ));
+
+ public static void main(String[] args) throws Exception {
+ Path commonTokensFile = Paths.get(args[0]);
+ List<Path> inputFiles = new ArrayList<>();
+ for (int i = 1; i < args.length; i++) {
+ inputFiles.add(Paths.get(
+ ProcessUtils.unescapeCommandLine(args[i])));
+ }
+ TopCommonTokenCounter counter = new TopCommonTokenCounter();
+ if (Files.exists(commonTokensFile)) {
+ System.err.println(commonTokensFile.getFileName().toString()+
+ " exists. I'm skipping this.");
+ return;
+ }
+ counter.execute(commonTokensFile, inputFiles);
+ }
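+ // Hypothetical invocation (jar and file names are examples, not part of
+ // this commit):
+ //
+ // java -cp tika-eval-app.jar org.apache.tika.eval.app.tools.TopCommonTokenCounter \
+ // common_tokens/eng eng_wikipedia_2016_1M-sentences.txt.gz
+ //
+ // args[0] is the output common-tokens file (the run is skipped if it
+ // already exists); the remaining args are input files with one document
+ // per row, optionally gzipped. Files whose names contain "-sentences.txt"
+ // get the Leipzig tab-stripping treatment in execute().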
+
+ private void execute(Path commonTokensFile, List<Path> inputFiles) throws Exception {
+ Path luceneDir = Files.createTempDirectory("tika-eval-lucene-");
+ AbstractTokenTFDFPriorityQueue queue = new TokenDFPriorityQueue(TOP_N);
+ long totalDocs = -1;
+ long sumDocFreqs = -1;
+ long sumTotalTermFreqs = -1;
+ long uniqueTerms = 0;
+ try (Directory directory = FSDirectory.open(luceneDir)) {
+
+ AnalyzerManager analyzerManager = AnalyzerManager.newInstance(-1);
+
+ Analyzer analyzer = analyzerManager.getCommonTokensAnalyzer();
+ IndexWriterConfig indexWriterConfig = new IndexWriterConfig(analyzer);
+ int maxLen = 1000000;
+ int len = 0;
+ try (IndexWriter writer = new IndexWriter(directory, indexWriterConfig)) {
+ List<Document> docs = new ArrayList<>();
+ for (Path inputFile : inputFiles) {
+ //total hack
+ boolean isLeipzig = false;
+ if (inputFile.getFileName().toString().contains("-sentences.txt")) {
+ isLeipzig = true;
+ }
+ int lines = 0;
+ try (BufferedReader reader = getReader(inputFile)) {
+ String line = reader.readLine();
+ while (line != null) {
+ if (isLeipzig) {
+ int tab = line.indexOf("\t");
+ if (tab > -1) {
+ line = line.substring(tab+1);
+ }
+ }
+ len += line.length();
+ Document document = new Document();
+ document.add(new TextField(FIELD, line, Field.Store.NO));
+ docs.add(document);
+ if (len > maxLen) {
+ writer.addDocuments(docs);
+ docs.clear();
+ len = 0;
+ }
+ line = reader.readLine();
+ if (++lines % 100000 == 0) {
+ System.out.println("processed "+lines +
+ " for "+inputFile.getFileName()
+ + " :: "+ commonTokensFile.toAbsolutePath());
+ }
+ }
+ }
+ }
+ if (!docs.isEmpty()) {
+ writer.addDocuments(docs);
+ }
+ writer.commit();
+ writer.flush();
+ }
+
+ try (IndexReader reader = DirectoryReader.open(directory)) {
+ LeafReader wrappedReader = SlowCompositeReaderWrapper.wrap(reader);
+ totalDocs = wrappedReader.getDocCount(FIELD);
+ sumDocFreqs = wrappedReader.getSumDocFreq(FIELD);
+ sumTotalTermFreqs = wrappedReader.getSumTotalTermFreq(FIELD);
+
+ Terms terms = wrappedReader.terms(FIELD);
+ TermsEnum termsEnum = terms.iterator();
+ BytesRef bytesRef = termsEnum.next();
+ while (bytesRef != null) {
+ uniqueTerms++;
+ int df = termsEnum.docFreq();
+ long tf = termsEnum.totalTermFreq();
+ if (MIN_DOC_FREQ > -1 && df < MIN_DOC_FREQ) {
+ bytesRef = termsEnum.next();
+ continue;
+ }
+
+ if (queue.top() == null || queue.size() < TOP_N ||
+ df >= queue.top().df) {
+ String t = bytesRef.utf8ToString();
+ if (! SKIP_LIST.contains(t)) {
+ queue.insertWithOverflow(new TokenDFTF(t, df, tf));
+ }
+
+ }
+ bytesRef = termsEnum.next();
+ }
+ }
+ } finally {
+ FileUtils.deleteDirectory(luceneDir.toFile());
+ }
+
+ writeTopN(commonTokensFile, totalDocs,
+ sumDocFreqs, sumTotalTermFreqs, uniqueTerms, queue);
+
+
+ }
+
+ private BufferedReader getReader(Path inputFile) throws IOException {
+ InputStream is = Files.newInputStream(inputFile);
+ if (inputFile.toString().endsWith(".gz")) {
+ is = new GzipCompressorInputStream(is);
+ }
+ return new BufferedReader(
+ new InputStreamReader(is, StandardCharsets.UTF_8)
+ );
+ }
+
+ private static void writeTopN(Path path,
+ long totalDocs, long sumDocFreqs,
+ long sumTotalTermFreqs,
+ long uniqueTerms, AbstractTokenTFDFPriorityQueue queue) throws IOException {
+ if (Files.isRegularFile(path)) {
+ System.err.println("File "+path.getFileName() + " already exists. Skipping.");
+ return;
+ }
+ Files.createDirectories(path.getParent());
+ try (BufferedWriter writer =
+ Files.newBufferedWriter(path, StandardCharsets.UTF_8)) {
+ StringBuilder sb = new StringBuilder();
+ writer.write(LICENSE);
+ writer.write("#DOC_COUNT\t"+totalDocs+"\n");
+ writer.write("#SUM_DOC_FREQS\t"+sumDocFreqs+"\n");
+ writer.write("#SUM_TERM_FREQS\t"+sumTotalTermFreqs+"\n");
+ writer.write("#UNIQUE_TERMS\t"+uniqueTerms+"\n");
+ writer.write("#TOKEN\tDOCFREQ\tTERMFREQ\n");
+ //add these tokens no matter what
+ for (String t : INCLUDE_LIST) {
+ writer.write(t);
+ writer.newLine();
+ }
+ for (TokenDFTF tp : queue.getArray()) {
+ writer.write(getRow(sb, tp)+"\n");
+ }
+ }
+ }
+
+ private static String getRow(StringBuilder sb, TokenDFTF tp) {
+ sb.setLength(0);
+ sb.append(clean(tp.token));
+ sb.append("\t").append(tp.df);
+ sb.append("\t").append(tp.tf);
+ return sb.toString();
+ }
+
+ private static String clean(String s) {
+ if (s == null) {
+ return "";
+ }
+ return s.replaceAll("\\s+", " ").trim();
+ }
+
+ private abstract class AbstractTokenTFDFPriorityQueue extends PriorityQueue<TokenDFTF> {
+
+ AbstractTokenTFDFPriorityQueue(int maxSize) {
+ super(maxSize);
+ }
+
+ public TokenDFTF[] getArray() {
+ TokenDFTF[] topN = new TokenDFTF[size()];
+ //now we reverse the queue
+ TokenDFTF term = pop();
+ int i = topN.length-1;
+ while (term != null && i > -1) {
+ topN[i--] = term;
+ term = pop();
+ }
+ return topN;
+ }
+ }
+
+ private class TokenDFTF {
+
+ final String token;
+ final int df;
+ final long tf;
+
+ public TokenDFTF(String token, int df, long tf) {
+ this.token = token;
+ this.df = df;
+ this.tf = tf;
+ }
+
+
+ public long getTF() {
+ return tf;
+ }
+
+ public int getDF() {
+ return df;
+ }
+
+ public String getToken() {
+ return token;
+ }
+
+ @Override
+ public boolean equals(Object o) {
+ if (this == o) {
+ return true;
+ }
+ if (o == null || getClass() != o.getClass()) {
+ return false;
+ }
+
+ TokenDFTF tokenDFTF = (TokenDFTF) o;
+
+ if (df != tokenDFTF.df) {
+ return false;
+ }
+ if (tf != tokenDFTF.tf) {
+ return false;
+ }
+ return Objects.equals(token, tokenDFTF.token);
+ }
+
+ @Override
+ public int hashCode() {
+ int result = token != null ? token.hashCode() : 0;
+ result = 31 * result + df;
+ result = 31 * result + (int) (tf ^ (tf >>> 32));
+ return result;
+ }
+
+ @Override
+ public String toString() {
+ return "TokenDFTF{" +
+ "token='" + token + '\'' +
+ ", df=" + df +
+ ", tf=" + tf +
+ '}';
+ }
+ }
+
+ private class TokenDFPriorityQueue extends AbstractTokenTFDFPriorityQueue {
+
+ TokenDFPriorityQueue(int maxSize) {
+ super(maxSize);
+ }
+
+ @Override
+ protected boolean lessThan(TokenDFTF arg0, TokenDFTF arg1) {
+ if (arg0.df < arg1.df) {
+ return true;
+ } else if (arg0.df > arg1.df) {
+ return false;
+ }
+ return arg1.token.compareTo(arg0.token) < 0;
+ }
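+ // Worked example of the tie-break above (tokens are illustrative):
+ // lessThan(("apple", df=5), ("zebra", df=5)) compares the tokens because
+ // the doc frequencies are equal; "zebra".compareTo("apple") < 0 is false,
+ // so "apple" is not ranked lower. On equal df, the lexicographically
+ // later token sits nearer the top of the queue and is evicted first.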
+ }
+}
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/tools/TrainTestSplit.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/tools/TrainTestSplit.java
new file mode 100644
index 0000000..9df2144
--- /dev/null
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/tools/TrainTestSplit.java
@@ -0,0 +1,114 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval.app.tools;
+
+import java.io.BufferedOutputStream;
+import java.io.BufferedReader;
+import java.io.BufferedWriter;
+import java.io.File;
+import java.io.IOException;
+import java.io.OutputStream;
+import java.io.OutputStreamWriter;
+import java.io.Writer;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Random;
+
+import org.apache.commons.compress.compressors.gzip.GzipCompressorOutputStream;
+
+public class TrainTestSplit {
+
+ private static final String TRAINING = "train";
+ private static final String TESTING = "test";
+ private static final String DEVTEST = "devtest";
+
+ private float trainingP = 0.7f;
+ private float devTestP = 0.1f;
+ private float testP = 0.2f;
+ private Random random = new Random();
+
+ public static void main(String[] args) throws Exception {
+ Path leipzigDir = Paths.get(args[0]);
+ Path outputDir = Paths.get(args[1]);
+ TrainTestSplit splitter = new TrainTestSplit();
+ splitter.execute(leipzigDir, outputDir);
+ }
+
+ private void execute(Path leipzigDir, Path outputDir) throws Exception {
+ initOutDirs(outputDir);
+ File[] files = leipzigDir.toFile().listFiles();
+ if (files == null) {
+ throw new IOException("couldn't list files in " + leipzigDir);
+ }
+ for (File f : files) {
+ if (f.isDirectory()) {
+ continue;
+ }
+ processFile(f, outputDir);
+ }
+ }
+
+ private void initOutDirs(Path outputDir) throws Exception {
+ for (String which : new String[] { TRAINING, DEVTEST, TESTING}) {
+ Path target = outputDir.resolve(which);
+ if (! Files.isDirectory(target)) {
+ Files.createDirectories(target);
+ }
+ }
+
+ }
+
+ private void processFile(File f, Path outputDir) throws Exception {
+ Map<String, BufferedWriter> writers = getWriters(outputDir, f);
+ System.err.println("working on "+f);
+ try (BufferedReader reader = Files.newBufferedReader(f.toPath(), StandardCharsets.UTF_8)) {
+ String line = reader.readLine();
+ while (line != null) {
+ float r = random.nextFloat();
+ if (r <= trainingP) {
+ writers.get(TRAINING).write(line + "\n");
+ } else if (r < trainingP + devTestP) {
+ writers.get(DEVTEST).write(line + "\n");
+ } else {
+ writers.get(TESTING).write(line+"\n");
+ }
+ line = reader.readLine();
+ }
+ }
+
+
+ for (Writer w : writers.values()) {
+ w.flush();
+ w.close();
+ }
+ }
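+ // Worked example of the split thresholds above, assuming the default
+ // 0.7/0.1/0.2 proportions: a line drawing r=0.65 goes to "train"
+ // (r <= 0.7); r=0.75 goes to "devtest" (0.7 < r < 0.8); r=0.9 falls
+ // through to "test". testP itself is implied by the else branch.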
+
+ private Map<String, BufferedWriter> getWriters(Path outputDir, File f) throws IOException {
+ Map<String, BufferedWriter> writers = new HashMap<>();
+ for (String which : new String[] { TRAINING, DEVTEST, TESTING}) {
+ writers.put(which, getWriter(outputDir, which, f));
+ }
+ return writers;
+ }
+
+ private BufferedWriter getWriter(Path outputDir, String which, File f) throws IOException {
+ OutputStream os = new GzipCompressorOutputStream(
+ new BufferedOutputStream(
+ Files.newOutputStream(outputDir.resolve(which).resolve(f.getName()+".gz"))));
+ return new BufferedWriter(new OutputStreamWriter(os, StandardCharsets.UTF_8));
+ }
+}
diff --git a/tika-eval/tika-eval-app/src/main/resources/comparison-reports-pg.xml b/tika-eval/tika-eval-app/src/main/resources/comparison-reports-pg.xml
new file mode 100644
index 0000000..5bcf88e
--- /dev/null
+++ b/tika-eval/tika-eval-app/src/main/resources/comparison-reports-pg.xml
@@ -0,0 +1,1753 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no" ?>
+
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+-->
+
+<reports>
+
+
+ <before>
+
+ <sql>drop table if exists md5_multiples_tmp_a</sql>
+ <sql>create table md5_multiples_tmp_a (MD5, cnt)
+ as
+ select md5, count(1) as cnt
+ from profiles_a
+ where md5 is not null
+ group by md5
+ having count(1) > 1
+ order by cnt desc
+ </sql>
+
+ <sql>drop table if exists md5_multiples_tmp_b</sql>
+ <sql>create table md5_multiples_tmp_b (MD5, cnt)
+ as
+ select md5, count(1) cnt
+ from profiles_b
+ where md5 is not null
+ group by md5
+ having count(1) > 1
+ order by cnt desc
+ </sql>
+ <!-- build mime indexes -->
+
+ <sql>create index if not exists pa_m_idx
+ on profiles_a (mime_id);
+ </sql>
+
+ <sql>
+ create index if not exists pb_m_idx
+ on profiles_b (mime_id);
+ </sql>
+
+ <!-- build exceptions comparison table -->
+ <sql>drop table if exists exceptions_compared</sql>
+ <sql>
+ create table exceptions_compared (
+ mime_id_a integer,
+ mime_id_b integer,
+ total integer,
+ exc_cnt_a integer,
+ exc_cnt_b integer,
+ exc_prcnt_a float,
+ exc_prcnt_b float,
+ notes varchar(12)
+ );
+ </sql>
+ <sql>
+ insert into exceptions_compared (
+ select ma.mime_id, mb.mime_id, count(1) as total, 0, 0, 0.0, 0.0, ''
+ from profiles_a pa
+ join profiles_b pb on pa.id=pb.id
+ join mimes ma on pa.mime_id = ma.mime_id
+ join mimes mb on pb.mime_id = mb.mime_id
+ group by ma.mime_id, mb.mime_id
+ order by total desc );
+ </sql>
+
+ <sql>
+ update exceptions_compared ec set
+ exc_cnt_a = (
+ select count(1) as cnt
+ from exceptions_a ea
+ join profiles_a pa on ea.id=pa.id
+ join profiles_b pb on pb.id=pa.id
+ join mimes ma on pa.mime_id=ma.mime_id
+ join mimes mb on pb.mime_id=mb.mime_id
+ where ma.mime_id= ec.mime_id_a and mb.mime_id=ec.mime_id_b
+ group by ma.mime_id, mb.mime_id);
+ </sql>
+ <sql>
+ update exceptions_compared ec set
+ exc_cnt_b = (
+ select count(1) as cnt
+ from exceptions_b eb
+ join profiles_b pb on eb.id=pb.id
+ join profiles_a pa on pa.id=pb.id
+ join mimes ma on pa.mime_id=ma.mime_id
+ join mimes mb on pb.mime_id=mb.mime_id
+ where ma.mime_id= ec.mime_id_a and mb.mime_id=ec.mime_id_b
+ group by mb.mime_id, ma.mime_id);
+ </sql>
+ <sql>
+ update exceptions_compared
+ set exc_prcnt_a = cast(exc_cnt_a as decimal)/cast(total as decimal)
+ where total > 0;
+ </sql>
+ <sql>
+ update exceptions_compared
+ set exc_prcnt_b = cast(exc_cnt_b as decimal)/cast(total as decimal)
+ where total > 0;
+ </sql>
+
+ <sql>
+ update exceptions_compared
+ set notes = 'YAY!'
+ where total > 100 and (exc_prcnt_a-exc_prcnt_b) > 0.10;
+ </sql>
+ <sql>
+ update exceptions_compared
+ set notes = 'YIKES!'
+ where total > 100 and (exc_prcnt_b-exc_prcnt_a) > 0.10;
+ </sql>
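+ <!-- Worked example (illustrative numbers only): for a mime pair with
+ total=200, exc_cnt_a=30, exc_cnt_b=5, the updates above yield
+ exc_prcnt_a=0.15 and exc_prcnt_b=0.025; since total > 100 and
+ exc_prcnt_a - exc_prcnt_b = 0.125 > 0.10, notes is set to 'YAY!'
+ (fewer exceptions in B). The 'YIKES!' branch is the mirror image. -->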
+
+ <!-- build tmp common words table -->
+ <sql>drop table if exists token_counts_compared</sql>
+ <sql>
+ create table token_counts_compared
+ (mime_id_a integer,
+ mime_id_b integer,
+ num_tokens_a bigint default 0,
+ num_tokens_b bigint default 0,
+ num_alphabetic_tokens_a bigint default 0,
+ num_alphabetic_tokens_b bigint default 0,
+ num_common_tokens_a bigint default 0,
+ num_common_tokens_b bigint default 0
+ );
+ </sql>
+ <sql>
+ insert into token_counts_compared (mime_id_a, mime_id_b)
+ select ma.mime_id, mb.mime_id
+ from profiles_a a
+ join profiles_b b on a.id=b.id
+ join mimes ma on ma.mime_id=a.mime_id
+ join mimes mb on mb.mime_id=b.mime_id
+ group by ma.mime_id, mb.mime_id
+
+ </sql>
+
+ <sql>
+ update token_counts_compared tcc set num_tokens_a=(
+ select sum(num_tokens) as cnt from profiles_a pa
+ join profiles_b pb on pa.id=pb.id
+ join contents_a c on c.id = pa.id
+ where pb.mime_id= tcc.mime_id_b
+ and pa.mime_id=tcc.mime_id_a
+ group by mime_id_a, mime_id_b
+ );
+ </sql>
+
+ <sql>
+ update token_counts_compared tcc set num_tokens_b=(
+ select sum(num_tokens) as cnt from profiles_b pb
+ join profiles_a pa on pa.id=pb.id
+ join contents_b c on c.id = pb.id
+ where pb.mime_id= tcc.mime_id_b
+ and pa.mime_id=tcc.mime_id_a
+ group by mime_id_a, mime_id_b
+ );
+ </sql>
+
+ <sql>
+ update token_counts_compared tcc set num_alphabetic_tokens_a=(
+ select sum(num_alphabetic_tokens) as cnt from profiles_a pa
+ join profiles_b pb on pa.id=pb.id
+ join contents_a c on c.id = pa.id
+ where pb.mime_id= tcc.mime_id_b
+ and pa.mime_id=tcc.mime_id_a
+ group by mime_id_a, mime_id_b
+ );
+ </sql>
+
+ <sql>
+ update token_counts_compared tcc set num_alphabetic_tokens_b=(
+ select sum(num_alphabetic_tokens) as cnt from profiles_b pb
+ join profiles_a pa on pb.id=pa.id
+ join contents_b c on c.id = pb.id
+ where pb.mime_id= tcc.mime_id_b
+ and pa.mime_id=tcc.mime_id_a
+ group by mime_id_a, mime_id_b
+ );
+ </sql>
+
+ <sql>
+ update token_counts_compared tcc set num_common_tokens_a=(
+ select sum(num_common_tokens) as cnt from profiles_a pa
+ join profiles_b pb on pa.id=pb.id
+ join contents_a c on c.id = pa.id
+ where pb.mime_id= tcc.mime_id_b
+ and pa.mime_id=tcc.mime_id_a
+ group by mime_id_a, mime_id_b
+ );
+ </sql>
+
+ <sql>
+ update token_counts_compared tcc set num_common_tokens_b=(
+ select sum(num_common_tokens) as cnt from profiles_b pb
+ join profiles_a pa on pa.id=pb.id
+ join contents_b c on c.id = pb.id
+ where pb.mime_id= tcc.mime_id_b
+ and pa.mime_id=tcc.mime_id_a
+ group by mime_id_a, mime_id_b
+ );
+ </sql>
+
+ <sql>drop table if exists tags_by_mime</sql>
+ <sql>create table tags_by_mime (
+ mime_id_a integer,
+ mime_id_b integer,
+ tags_a_a integer,
+ tags_b_a integer,
+ tags_div_a integer,
+ tags_i_a integer,
+ tags_img_a integer,
+ tags_li_a integer,
+ tags_ol_a integer,
+ tags_p_a integer,
+ tags_table_a integer,
+ tags_td_a integer,
+ tags_title_a integer,
+ tags_tr_a integer,
+ tags_u_a integer,
+ tags_ul_a integer,
+ tags_a_b integer,
+ tags_b_b integer,
+ tags_div_b integer,
+ tags_i_b integer,
+ tags_img_b integer,
+ tags_li_b integer,
+ tags_ol_b integer,
+ tags_p_b integer,
+ tags_table_b integer,
+ tags_td_b integer,
+ tags_title_b integer,
+ tags_tr_b integer,
+ tags_u_b integer,
+ tags_ul_b integer
+ );
+ </sql>
+ <sql>
+ insert into tags_by_mime (mime_id_a, mime_id_b)
+ select ma.mime_id, mb.mime_id
+ from profiles_a a
+ join profiles_b b on a.id=b.id
+ join mimes ma on ma.mime_id=a.mime_id
+ join mimes mb on mb.mime_id=b.mime_id
+ group by ma.mime_id, mb.mime_id
+ </sql>
+ <sql>
+ update tags_by_mime tbm set tags_a_a=(
+ select sum(ta.tags_a) as cnt from tags_a ta
+ join tags_b tb on ta.id=tb.id
+ join profiles_a pa on pa.id=ta.id
+ join profiles_b pb on pa.id=pb.id
+ where pb.mime_id= tbm.mime_id_b
+ and pa.mime_id=tbm.mime_id_a
+ and ta.tags_parse_exception=false
+ and tb.tags_parse_exception=false
+ group by mime_id_a, mime_id_b
+ );
+ </sql>
+ <sql>
+ update tags_by_mime tbm set tags_b_a=(
+ select sum(ta.tags_b) as cnt from tags_a ta
+ join tags_b tb on ta.id=tb.id
+ join profiles_a pa on pa.id=ta.id
+ join profiles_b pb on pa.id=pb.id
+ where pb.mime_id= tbm.mime_id_b
+ and pa.mime_id=tbm.mime_id_a
+ and ta.tags_parse_exception=false
+ and tb.tags_parse_exception=false
+ group by mime_id_a, mime_id_b
+ );
+ </sql>
+ <sql>
+ update tags_by_mime tbm set tags_div_a=(
+ select sum(ta.tags_div) as cnt from tags_a ta
+ join tags_b tb on ta.id=tb.id
+ join profiles_a pa on pa.id=ta.id
+ join profiles_b pb on pa.id=pb.id
+ where pb.mime_id= tbm.mime_id_b
+ and pa.mime_id=tbm.mime_id_a
+ and ta.tags_parse_exception=false
+ and tb.tags_parse_exception=false
+ group by mime_id_a, mime_id_b
+ );
+ </sql>
+ <sql>
+ update tags_by_mime tbm set tags_i_a=(
+ select sum(ta.tags_i) as cnt from tags_a ta
+ join tags_b tb on ta.id=tb.id
+ join profiles_a pa on pa.id=ta.id
+ join profiles_b pb on pa.id=pb.id
+ where pb.mime_id= tbm.mime_id_b
+ and pa.mime_id=tbm.mime_id_a
+ and ta.tags_parse_exception=false
+ and tb.tags_parse_exception=false
+ group by mime_id_a, mime_id_b
+ );
+ </sql>
+ <sql>
+ update tags_by_mime tbm set tags_img_a=(
+ select sum(ta.tags_img) as cnt from tags_a ta
+ join tags_b tb on ta.id=tb.id
+ join profiles_a pa on pa.id=ta.id
+ join profiles_b pb on pa.id=pb.id
+ where pb.mime_id= tbm.mime_id_b
+ and pa.mime_id=tbm.mime_id_a
+ and ta.tags_parse_exception=false
+ and tb.tags_parse_exception=false
+ group by mime_id_a, mime_id_b
+ );
+ </sql>
+ <sql>
+ update tags_by_mime tbm set tags_li_a=(
+ select sum(ta.tags_li) as cnt from tags_a ta
+ join tags_b tb on ta.id=tb.id
+ join profiles_a pa on pa.id=ta.id
+ join profiles_b pb on pa.id=pb.id
+ where pb.mime_id= tbm.mime_id_b
+ and pa.mime_id=tbm.mime_id_a
+ and ta.tags_parse_exception=false
+ and tb.tags_parse_exception=false
+ group by mime_id_a, mime_id_b
+ );
+ </sql>
+ <sql>
+ update tags_by_mime tbm set tags_ol_a=(
+ select sum(ta.tags_ol) as cnt from tags_a ta
+ join tags_b tb on ta.id=tb.id
+ join profiles_a pa on pa.id=ta.id
+ join profiles_b pb on pa.id=pb.id
+ where pb.mime_id= tbm.mime_id_b
+ and pa.mime_id=tbm.mime_id_a
+ and ta.tags_parse_exception=false
+ and tb.tags_parse_exception=false
+ group by mime_id_a, mime_id_b
+ );
+ </sql>
+ <sql>
+ update tags_by_mime tbm set tags_p_a=(
+ select sum(ta.tags_p) as cnt from tags_a ta
+ join tags_b tb on ta.id=tb.id
+ join profiles_a pa on pa.id=ta.id
+ join profiles_b pb on pa.id=pb.id
+ where pb.mime_id= tbm.mime_id_b
+ and pa.mime_id=tbm.mime_id_a
+ and ta.tags_parse_exception=false
+ and tb.tags_parse_exception=false
+ group by mime_id_a, mime_id_b
+ );
+ </sql>
+ <sql>
+ update tags_by_mime tbm set tags_table_a=(
+ select sum(ta.tags_table) as cnt from tags_a ta
+ join tags_b tb on ta.id=tb.id
+ join profiles_a pa on pa.id=ta.id
+ join profiles_b pb on pa.id=pb.id
+ where pb.mime_id= tbm.mime_id_b
+ and pa.mime_id=tbm.mime_id_a
+ and ta.tags_parse_exception=false
+ and tb.tags_parse_exception=false
+ group by mime_id_a, mime_id_b
+ );
+ </sql>
+ <sql>
+ update tags_by_mime tbm set tags_td_a=(
+ select sum(ta.tags_td) as cnt from tags_a ta
+ join tags_b tb on ta.id=tb.id
+ join profiles_a pa on pa.id=ta.id
+ join profiles_b pb on pa.id=pb.id
+ where pb.mime_id= tbm.mime_id_b
+ and pa.mime_id=tbm.mime_id_a
+ and ta.tags_parse_exception=false
+ and tb.tags_parse_exception=false
+ group by mime_id_a, mime_id_b
+ );
+ </sql>
+ <sql>
+ update tags_by_mime tbm set tags_title_a=(
+ select sum(ta.tags_title) as cnt from tags_a ta
+ join tags_b tb on ta.id=tb.id
+ join profiles_a pa on pa.id=ta.id
+ join profiles_b pb on pa.id=pb.id
+ where pb.mime_id= tbm.mime_id_b
+ and pa.mime_id=tbm.mime_id_a
+ and ta.tags_parse_exception=false
+ and tb.tags_parse_exception=false
+ group by mime_id_a, mime_id_b
+ );
+ </sql>
+ <sql>
+ update tags_by_mime tbm set tags_tr_a=(
+ select sum(ta.tags_tr) as cnt from tags_a ta
+ join tags_b tb on ta.id=tb.id
+ join profiles_a pa on pa.id=ta.id
+ join profiles_b pb on pa.id=pb.id
+ where pb.mime_id= tbm.mime_id_b
+ and pa.mime_id=tbm.mime_id_a
+ and ta.tags_parse_exception=false
+ and tb.tags_parse_exception=false
+ group by mime_id_a, mime_id_b
+ );
+ </sql>
+ <sql>
+ update tags_by_mime tbm set tags_u_a=(
+ select sum(ta.tags_u) as cnt from tags_a ta
+ join tags_b tb on ta.id=tb.id
+ join profiles_a pa on pa.id=ta.id
+ join profiles_b pb on pa.id=pb.id
+ where pb.mime_id= tbm.mime_id_b
+ and pa.mime_id=tbm.mime_id_a
+ and ta.tags_parse_exception=false
+ and tb.tags_parse_exception=false
+ group by mime_id_a, mime_id_b
+ );
+ </sql>
+ <sql>
+ update tags_by_mime tbm set tags_ul_a=(
+ select sum(ta.tags_ul) as cnt from tags_a ta
+ join tags_b tb on ta.id=tb.id
+ join profiles_a pa on pa.id=ta.id
+ join profiles_b pb on pa.id=pb.id
+ where pb.mime_id= tbm.mime_id_b
+ and pa.mime_id=tbm.mime_id_a
+ and ta.tags_parse_exception=false
+ and tb.tags_parse_exception=false
+ group by mime_id_a, mime_id_b
+ );
+ </sql>
+ <!-- now update tags_b counts -->
+ <sql>
+ update tags_by_mime tbm set tags_a_b=(
+ select sum(tb.tags_a) as cnt from tags_b tb
+ join tags_a ta on ta.id=tb.id
+ join profiles_a pa on pa.id=ta.id
+ join profiles_b pb on pa.id=pb.id
+ where pb.mime_id= tbm.mime_id_b
+ and pa.mime_id=tbm.mime_id_a
+ and ta.tags_parse_exception=false
+ and tb.tags_parse_exception=false
+ group by mime_id_a, mime_id_b
+ );
+ </sql>
+ <sql>
+ update tags_by_mime tbm set tags_b_b=(
+ select sum(tb.tags_b) as cnt from tags_b tb
+ join tags_a ta on ta.id=tb.id
+ join profiles_a pa on pa.id=ta.id
+ join profiles_b pb on pa.id=pb.id
+ where pb.mime_id= tbm.mime_id_b
+ and pa.mime_id=tbm.mime_id_a
+ and ta.tags_parse_exception=false
+ and tb.tags_parse_exception=false
+ group by mime_id_a, mime_id_b
+ );
+ </sql>
+ <sql>
+ update tags_by_mime tbm set tags_div_b=(
+ select sum(tb.tags_div) as cnt from tags_b tb
+ join tags_a ta on tb.id=ta.id
+ join profiles_a pa on pa.id=ta.id
+ join profiles_b pb on pa.id=pb.id
+ where pb.mime_id= tbm.mime_id_b
+ and pa.mime_id=tbm.mime_id_a
+ and ta.tags_parse_exception=false
+ and tb.tags_parse_exception=false
+ group by mime_id_a, mime_id_b
+ );
+ </sql>
+ <sql>
+ update tags_by_mime tbm set tags_i_b=(
+ select sum(tb.tags_i) as cnt from tags_b tb
+ join tags_a ta on tb.id=ta.id
+ join profiles_a pa on pa.id=ta.id
+ join profiles_b pb on pa.id=pb.id
+ where pb.mime_id= tbm.mime_id_b
+ and pa.mime_id=tbm.mime_id_a
+ and ta.tags_parse_exception=false
+ and tb.tags_parse_exception=false
+ group by mime_id_a, mime_id_b
+ );
+ </sql>
+ <sql>
+ update tags_by_mime tbm set tags_img_b=(
+ select sum(tb.tags_img) as cnt from tags_b tb
+ join tags_a ta on tb.id=ta.id
+ join profiles_a pa on pa.id=ta.id
+ join profiles_b pb on pa.id=pb.id
+ where pb.mime_id= tbm.mime_id_b
+ and pa.mime_id=tbm.mime_id_a
+ and ta.tags_parse_exception=false
+ and tb.tags_parse_exception=false
+ group by mime_id_a, mime_id_b
+ );
+ </sql>
+ <sql>
+ update tags_by_mime tbm set tags_li_b=(
+ select sum(tb.tags_li) as cnt from tags_b tb
+ join tags_a ta on ta.id=tb.id
+ join profiles_a pa on pa.id=ta.id
+ join profiles_b pb on pa.id=pb.id
+ where pb.mime_id= tbm.mime_id_b
+ and pa.mime_id=tbm.mime_id_a
+ and ta.tags_parse_exception=false
+ and tb.tags_parse_exception=false
+ group by mime_id_a, mime_id_b
+ );
+ </sql>
+ <sql>
+ update tags_by_mime tbm set tags_ol_b=(
+ select sum(tb.tags_ol) as cnt from tags_b tb
+ join tags_a ta on tb.id=ta.id
+ join profiles_a pa on pa.id=ta.id
+ join profiles_b pb on pa.id=pb.id
+ where pb.mime_id= tbm.mime_id_b
+ and pa.mime_id=tbm.mime_id_a
+ and ta.tags_parse_exception=false
+ and tb.tags_parse_exception=false
+ group by mime_id_a, mime_id_b
+ );
+ </sql>
+ <sql>
+ update tags_by_mime tbm set tags_p_b=(
+ select sum(tb.tags_p) as cnt from tags_b tb
+ join tags_a ta on tb.id=ta.id
+ join profiles_a pa on pa.id=ta.id
+ join profiles_b pb on pa.id=pb.id
+ where pb.mime_id= tbm.mime_id_b
+ and pa.mime_id=tbm.mime_id_a
+ and ta.tags_parse_exception=false
+ and tb.tags_parse_exception=false
+ group by mime_id_a, mime_id_b
+ );
+ </sql>
+ <sql>
+ update tags_by_mime tbm set tags_table_b=(
+ select sum(tb.tags_table) as cnt from tags_b tb
+ join tags_a ta on tb.id=ta.id
+ join profiles_a pa on pa.id=ta.id
+ join profiles_b pb on pa.id=pb.id
+ where pb.mime_id= tbm.mime_id_b
+ and pa.mime_id=tbm.mime_id_a
+ and ta.tags_parse_exception=false
+ and tb.tags_parse_exception=false
+ group by mime_id_a, mime_id_b
+ );
+ </sql>
+ <sql>
+ update tags_by_mime tbm set tags_td_b=(
+ select sum(tb.tags_td) as cnt from tags_b tb
+ join tags_a ta on tb.id=ta.id
+ join profiles_a pa on pa.id=ta.id
+ join profiles_b pb on pa.id=pb.id
+ where pb.mime_id= tbm.mime_id_b
+ and pa.mime_id=tbm.mime_id_a
+ and ta.tags_parse_exception=false
+ and tb.tags_parse_exception=false
+ group by mime_id_a, mime_id_b
+ );
+ </sql>
+ <sql>
+ update tags_by_mime tbm set tags_title_b=(
+ select sum(tb.tags_title) as cnt from tags_b tb
+ join tags_a ta on tb.id=ta.id
+ join profiles_a pa on pa.id=ta.id
+ join profiles_b pb on pa.id=pb.id
+ where pb.mime_id= tbm.mime_id_b
+ and pa.mime_id=tbm.mime_id_a
+ and ta.tags_parse_exception=false
+ and tb.tags_parse_exception=false
+ group by mime_id_a, mime_id_b
+ );
+ </sql>
+ <sql>
+ update tags_by_mime tbm set tags_tr_b=(
+ select sum(tb.tags_tr) as cnt from tags_b tb
+ join tags_a ta on tb.id=ta.id
+ join profiles_a pa on pa.id=ta.id
+ join profiles_b pb on pa.id=pb.id
+ where pb.mime_id= tbm.mime_id_b
+ and pa.mime_id=tbm.mime_id_a
+ and ta.tags_parse_exception=false
+ and tb.tags_parse_exception=false
+ group by mime_id_a, mime_id_b
+ );
+ </sql>
+ <sql>
+ update tags_by_mime tbm set tags_u_b=(
+ select sum(tb.tags_u) as cnt from tags_b tb
+ join tags_a ta on tb.id=ta.id
+ join profiles_a pa on pa.id=ta.id
+ join profiles_b pb on pa.id=pb.id
+ where pb.mime_id= tbm.mime_id_b
+ and pa.mime_id=tbm.mime_id_a
+ and ta.tags_parse_exception=false
+ and tb.tags_parse_exception=false
+ group by mime_id_a, mime_id_b
+ );
+ </sql>
+ <sql>
+ update tags_by_mime tbm set tags_ul_b=(
+ select sum(tb.tags_ul) as cnt from tags_b tb
+ join tags_a ta on ta.id=tb.id
+ join profiles_a pa on pa.id=ta.id
+ join profiles_b pb on pa.id=pb.id
+ where pb.mime_id= tbm.mime_id_b
+ and pa.mime_id=tbm.mime_id_a
+ and ta.tags_parse_exception=false
+ and tb.tags_parse_exception=false
+ group by mime_id_a, mime_id_b
+ );
+ </sql>
+ <sql>drop table if exists tag_exceptions_by_mime</sql>
+ <sql>create table tag_exceptions_by_mime (
+ mime_id_a integer,
+ mime_id_b integer,
+ tag_exceptions_a integer,
+ tag_exceptions_b integer)
+ </sql>
+ <sql>
+ insert into tag_exceptions_by_mime (mime_id_a, mime_id_b,
+ tag_exceptions_a, tag_exceptions_b)
+ select ma.mime_id, mb.mime_id,0,0
+ from profiles_a a
+ join profiles_b b on a.id=b.id
+ join mimes ma on ma.mime_id=a.mime_id
+ join mimes mb on mb.mime_id=b.mime_id
+ group by ma.mime_id, mb.mime_id
+ </sql>
+ <sql>
+ update tag_exceptions_by_mime tebm set tag_exceptions_a=(
+ select count(1) as cnt from tags_a ta
+ join profiles_a pa on pa.id=ta.id
+ join profiles_b pb on pa.id=pb.id
+ where pb.mime_id= tebm.mime_id_b
+ and pa.mime_id=tebm.mime_id_a
+ and ta.tags_parse_exception=true
+ group by mime_id_a, mime_id_b
+ );
+ </sql>
+ <sql>
+ update tag_exceptions_by_mime tebm set tag_exceptions_b=(
+ select count(1) as cnt from tags_b tb
+ join profiles_a pa on pa.id=tb.id
+ join profiles_b pb on pa.id=pb.id
+ where pb.mime_id= tebm.mime_id_b
+ and pa.mime_id=tebm.mime_id_a
+ and tb.tags_parse_exception=true
+ group by mime_id_a, mime_id_b
+ );
+ </sql>
+ <sql>
+ drop table if exists parse_time_compared;
+ </sql>
+ <sql>
+ create table parse_time_compared (
+ mime_id_a integer,
+ mime_id_b integer,
+ total_a bigint,
+ total_b bigint,
+ prcnt_increase double precision
+ );
+ </sql>
+ <sql>
+ insert into parse_time_compared (mime_id_a, mime_id_b,
+ total_a, total_b, prcnt_increase)
+ select ma.mime_id, mb.mime_id,0,0,0.0
+ from profiles_a a
+ join profiles_b b on a.id=b.id
+ join mimes ma on ma.mime_id=a.mime_id
+ join mimes mb on mb.mime_id=b.mime_id
+ group by ma.mime_id, mb.mime_id
+ </sql>
+ <sql>
+ update parse_time_compared ptc set total_a=(
+ select sum(pa.elapsed_time_millis) as total_a from profiles_a pa
+ join profiles_b pb on pa.id=pb.id
+ where pa.mime_id= ptc.mime_id_a
+ and pb.mime_id=ptc.mime_id_b
+ group by mime_id_a, mime_id_b)
+ </sql>
+ <sql>
+ update parse_time_compared ptc set total_b=(
+ select sum(pb.elapsed_time_millis) as total_b from profiles_b pb
+ join profiles_a pa on pa.id=pb.id
+ where pa.mime_id= ptc.mime_id_a
+ and pb.mime_id=ptc.mime_id_b
+ group by mime_id_a, mime_id_b)
+ </sql>
+ <sql>
+ update parse_time_compared ptc set prcnt_increase=(100.0 *
+ cast(total_b as decimal)/cast(total_a as decimal))
+ where total_a > 0;
+ </sql>
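+ <!-- Note: as written, prcnt_increase is total_b expressed as a
+ percentage of total_a (e.g., total_a=1000ms, total_b=1500ms gives
+ 150.0), i.e. a ratio, not the percent increase, which would be
+ 50.0 in that example. -->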
+ </before>
+
+ <!-- MIMES -->
+ <report reportName="All Mimes In A"
+ reportFilename="mimes/all_mimes_A.xlsx"
+ format="xlsx"
+ includeSql="true">
+
+ <sql>
+ select mime_string, count(1) cnt from
+ profiles_a p
+ join mimes m on m.mime_id = p.mime_id
+ group by mime_string
+ order by cnt desc
+ </sql>
+ </report>
+
+ <report reportName="All Mimes In B"
+ reportFilename="mimes/all_mimes_B.xlsx"
+ format="xlsx"
+ includeSql="true">
+
+ <sql>
+ select mime_string, count(1) cnt from
+ profiles_b p
+ join mimes m on m.mime_id = p.mime_id
+ group by mime_string
+ order by cnt desc
+ </sql>
+ </report>
+ <report reportName="Container Mimes In A"
+ reportFilename="mimes/container_mimes_A.xlsx"
+ format="xlsx"
+ includeSql="true">
+
+ <sql>
+ select mime_string, count(1) cnt from
+ profiles_a p
+ join mimes m on m.mime_id = p.mime_id
+ where is_embedded=false
+ group by mime_string
+ order by cnt desc
+ </sql>
+ </report>
+
+ <report reportName="Container Mimes In B"
+ reportFilename="mimes/container_mimes_B.xlsx"
+ format="xlsx"
+ includeSql="true">
+
+ <sql>
+ select mime_string, count(1) cnt from
+ profiles_b p
+ join mimes m on m.mime_id = p.mime_id
+ where is_embedded=false
+ group by mime_string
+ order by cnt desc
+ </sql>
+ </report>
+ <report reportName="Embedded Mimes In A"
+ reportFilename="mimes/embedded_mimes_A.xlsx"
+ format="xlsx"
+ includeSql="true">
+
+ <sql>
+ select mime_string, count(1) cnt from
+ profiles_a p
+ join mimes m on m.mime_id = p.mime_id
+ where is_embedded=true
+ group by mime_string
+ order by cnt desc
+ </sql>
+ </report>
+
+ <report reportName="Embedded Mimes In B"
+ reportFilename="mimes/embedded_mimes_B.xlsx"
+ format="xlsx"
+ includeSql="true">
+
+ <sql>
+ select mime_string, count(1) cnt from
+ profiles_b p
+ join mimes m on m.mime_id = p.mime_id
+ where is_embedded=true
+ group by mime_string
+ order by cnt desc
+ </sql>
+ </report>
+ <report reportName="Mime Differences A -> B"
+ reportFilename="mimes/mime_diffs_A_to_B.xlsx"
+ format="xlsx"
+ includeSql="true">
+
+ <sql>
+ select concat(ma.mime_string, ' -> ', mb.mime_string) as
+ MIME_A_TO_MIME_B, count(1) as COUNT
+ from profiles_a a
+ join profiles_b b on a.id=b.id
+ join mimes ma on ma.mime_id=a.mime_id
+ join mimes mb on mb.mime_id=b.mime_id
+ where a.mime_id <> b.mime_id
+ group by MIME_A_TO_MIME_B
+ order by COUNT DESC
+ </sql>
+ </report>
+
+ <report reportName="Mime Differences A -> B Details"
+ reportFilename="mimes/mime_diffs_A_to_B_details.xlsx"
+ format="xlsx"
+ includeSql="true">
+
+ <sql>
+ select concat(ma.mime_string, ' -> ', mb.mime_string) as
+ MIME_A_TO_MIME_B,
+ file_path,
+ c.length as CONTAINER_LENGTH,
+ a.file_name
+ from profiles_a a
+ join profiles_b b on a.id=b.id
+ join mimes ma on ma.mime_id=a.mime_id
+ join mimes mb on mb.mime_id=b.mime_id
+ join containers c on a.container_id=c.container_id
+ where a.mime_id <> b.mime_id
+ order by MIME_A_TO_MIME_B
+ </sql>
+ </report>
+
+
+ <!-- Exceptions -->
+ <report reportName="AllExceptionsByMimeA"
+ reportFilename="exceptions/exceptions_by_mime_A.xlsx"
+ format="xlsx"
+ includeSql="true">
+
+ <sql>
+ select mime_string, count(1) cnt from
+ exceptions_a e
+ join profiles_a p on p.id=e.id
+ join mimes m on m.mime_id = p.mime_id
+ group by mime_string
+ order by cnt desc
+ </sql>
+ </report>
+ <report reportName="AllExceptionsByMimeB"
+ reportFilename="exceptions/exceptions_by_mime_B.xlsx"
+ format="xlsx"
+ includeSql="true">
+
+ <sql>
+ select mime_string, count(1) cnt from
+ exceptions_b e
+ join profiles_b p on p.id=e.id
+ join mimes m on m.mime_id = p.mime_id
+ group by mime_string
+ order by cnt desc
+ </sql>
+ </report>
+
+ <report reportName="ContainerExceptionsByMimeA"
+ reportFilename="exceptions/container_exceptions_by_mime_A.xlsx"
+ format="xlsx"
+ includeSql="true">
+
+ <sql>
+ select mime_string, count(1) cnt from
+ exceptions_a e
+ join profiles_a p on p.id=e.id
+ join mimes m on m.mime_id = p.mime_id
+ where is_embedded=false
+ and parse_exception_id=0
+ group by mime_string
+ order by cnt desc
+ </sql>
+ </report>
+
+ <report reportName="ContainerExceptionsByMimeB"
+ reportFilename="exceptions/container_exceptions_by_mime_B.xlsx"
+ format="xlsx"
+ includeSql="true">
+
+ <sql>
+ select mime_string, count(1) cnt from
+ exceptions_b e
+ join profiles_b p on p.id=e.id
+ join mimes m on m.mime_id = p.mime_id
+ where is_embedded=false
+ and parse_exception_id=0
+ group by mime_string
+ order by cnt desc
+ </sql>
+ </report>
+ <report reportName="AllExceptionsByMimeByTypeA"
+ reportFilename="exceptions/exceptions_by_mime_by_type_A.xlsx"
+ format="xlsx"
+ includeSql="true">
+
+ <sql>
+ select mime_string as MIME_TYPE,
+ parse_exception_description as EXCEPTION_TYPE, count(1) as COUNT
+ from exceptions_a e
+ join profiles_a p on p.id=e.id
+ join containers c on p.container_id=c.container_id
+ join mimes m on m.mime_id=p.mime_id
+ join ref_parse_exception_types r on
+ r.parse_exception_id=e.parse_exception_id
+ group by m.mime_string, parse_exception_description
+ order by MIME_TYPE, EXCEPTION_TYPE
+ </sql>
+ </report>
+
+ <report reportName="AllExceptionsByMimeByTypeB"
+ reportFilename="exceptions/exceptions_by_mime_by_type_B.xlsx"
+ format="xlsx"
+ includeSql="true">
+
+ <sql>
+ select mime_string as MIME_TYPE,
+ parse_exception_description as EXCEPTION_TYPE, count(1) as COUNT
+ from exceptions_b e
+ join profiles_b p on p.id=e.id
+ join containers c on p.container_id=c.container_id
+ join mimes m on m.mime_id=p.mime_id
+ join ref_parse_exception_types r on
+ r.parse_exception_id=e.parse_exception_id
+ group by m.mime_string, parse_exception_description
+ order by MIME_TYPE, EXCEPTION_TYPE
+ </sql>
+ </report>
+
+ <report reportName="TextLostFromACausedByNewExceptionsInB"
+ reportFilename="exceptions/text_lost_from_A_caused_by_new_exceptions_in_B.xlsx"
+ format="xlsx"
+ includeSql="true">
+
+ <sql>
+ select file_path as FILE_PATH,
+ c.length as CONTAINER_LENGTH,
+ ca.NUM_TOKENS as NUM_TOKENS_A,
+ cb.NUM_TOKENS as NUM_TOKENS_B,
+ ca.NUM_UNIQUE_TOKENS as NUM_UNIQUE_TOKENS_A, cb.NUM_UNIQUE_TOKENS
+ as NUM_UNIQUE_TOKENS_B,
+ ca.common_tokens_lang as COMMON_TOKENS_LANG_A,
+ ca.num_common_tokens as NUM_COMMON_TOKENS_A,
+ cb.common_tokens_lang as COMMON_TOKENS_LANG_B,
+ cb.num_common_tokens as NUM_COMMON_TOKENS_B,
+ ca.top_n_tokens as TOP_N_TOKENS_A, cb.top_n_tokens as TOP_N_TOKENS_B,
+ eb.ORIG_STACK_TRACE as ORIG_STACK_TRACE_B
+ from contents_a ca
+ join profiles_a pa on ca.id = pa.id
+ join containers c on pa.container_id=c.container_id
+ left join contents_b cb on ca.id=cb.id
+ left join exceptions_b eb on ca.id = eb.id
+ left join exceptions_a ea on ca.id = ea.id
+ where eb.orig_stack_trace is not null
+ and ea.orig_stack_trace is null
+ order by ca.num_common_tokens - coalesce(cb.num_common_tokens,0) desc
+ </sql>
+ </report>
+
+ <report reportName="FixedExceptionsInBByMimeType"
+ reportFilename="exceptions/fixed_exceptions_in_B_by_mime.xlsx"
+ format="xlsx"
+ includeSql="true">
+
+ <sql>
+ select
+ ma.mime_string as MIME_TYPE_A,
+ mb.mime_string as MIME_TYPE_B,
+ count(1) as COUNT
+ from exceptions_a ea
+ left join exceptions_b eb on ea.id = eb.id
+ join profiles_a pa on pa.id=ea.id
+ join profiles_b pb on pa.id=pb.id
+ join containers c on pa.container_id=c.container_id
+ join mimes ma on ma.mime_id=pa.mime_id
+ join mimes mb on mb.mime_id=pb.mime_id
+ where eb.id is null
+ and ea.parse_exception_id=0
+ group by mime_type_a, mime_type_b
+ </sql>
+ </report>
+
+ <report reportName="FixedExceptionsInByDetails"
+ reportFilename="exceptions/fixed_exceptions_in_B_details.xlsx"
+ format="xlsx"
+ includeSql="true">
+ <sql>
+ select
+ file_path,
+ c.length as CONTAINER_LENGTH,
+ ma.mime_string as MIME_TYPE_A,
+ mb.mime_string as MIME_TYPE_B,
+ pa.file_name, pa.is_embedded
+ from exceptions_a ea
+ left join exceptions_b eb on ea.id = eb.id
+ join profiles_a pa on pa.id=ea.id
+ join profiles_b pb on pb.id=pa.id --this ensures that files were actually processed in both runs
+ join containers c on pa.container_id=c.container_id
+ join mimes ma on ma.mime_id=pa.mime_id
+ join mimes mb on mb.mime_id=pb.mime_id
+ where eb.id is null
+ and ea.parse_exception_id=0
+ order by mime_type_a, mime_type_b
+ </sql>
+ </report>
+ <report reportName="ContentsOfFixedExceptionsInB"
+ reportFilename="exceptions/contents_of_fixed_exceptions_in_B.xlsx"
+ format="xlsx"
+ includeSql="true">
+
+ <sql>
+ select file_path,
+ c.length as CONTAINER_LENGTH,
+ ma.mime_string as MIME_TYPE_A,
+ mb.mime_string as MIME_TYPE_B,
+ CONTENT_LENGTH,
+ NUM_TOKENS, NUM_UNIQUE_TOKENS,
+ TOP_N_TOKENS, LANG_ID_1,TOKEN_LENGTH_MEAN, TOKEN_LENGTH_STD_DEV
+ from exceptions_a ea
+ left join exceptions_b eb on ea.id = eb.id
+ join profiles_a pa on pa.id=ea.id
+ join profiles_b pb on pa.id=pb.id
+ join contents_b cb on cb.id=ea.id
+ join containers c on pa.container_id=c.container_id
+ join mimes ma on ma.mime_id=pa.mime_id
+ join mimes mb on mb.mime_id=pb.mime_id
+ where eb.id is null
+ and ea.parse_exception_id=0
+ </sql>
+ </report>
+
+ <report reportName="NewExceptionsByMimeType"
+ reportFilename="exceptions/new_exceptions_in_B_by_mime.xlsx"
+ format="xlsx"
+ includeSql="true">
+
+ <sql>
+ select ma.mime_string as MIME_TYPE_A, mb.mime_string as MIME_TYPE_B, count(1) as COUNT
+ from exceptions_b eb
+ left join exceptions_a ea on ea.id = eb.id
+ join profiles_a pa on pa.id=eb.id
+ join profiles_b pb on pb.id=pa.id
+ join containers c on pa.container_id=c.container_id
+ join mimes ma on ma.mime_id=pa.mime_id
+ join mimes mb on mb.mime_id=pb.mime_id
+ where ea.id is null
+ and eb.parse_exception_id=0
+ group by ma.mime_string, mb.mime_string
+ order by COUNT desc
+ </sql>
+ </report>
+
+ <report reportName="NewExceptionsInBByMimeTypeByStackTrace"
+ reportFilename="exceptions/new_exceptions_in_B_by_mime_by_stack_trace.xlsx"
+ format="xlsx"
+ includeSql="true">
+
+ <sql>
+ select
+ ma.MIME_STRING as MIME_TYPE_A,
+ mb.MIME_STRING as MIME_TYPE_B,
+ eb.sort_stack_trace, count(1) as
+ COUNT
+ from exceptions_b eb
+ left join exceptions_a ea on ea.id = eb.id
+ join profiles_a pa on pa.id=eb.id
+ join profiles_b pb on pb.id=eb.id
+ join mimes ma on ma.mime_id=pa.mime_id
+ join mimes mb on mb.mime_id=pb.mime_id
+ where ea.id is null
+ and eb.parse_exception_id=0
+ group by MIME_TYPE_A, MIME_TYPE_B, eb.sort_stack_trace
+ order by MIME_TYPE_A asc, MIME_TYPE_B asc, COUNT desc
+ </sql>
+ </report>
+
+ <report reportName="NewExceptionsInBDetails"
+ reportFilename="exceptions/new_exceptions_in_B_details.xlsx"
+ format="xlsx"
+ includeSql="true">
+
+ <sql>
+ select file_path,
+ c.length as CONTAINER_LENGTH,
+ ma.mime_string as MIME_TYPE_A,
+ mb.mime_string as MIME_TYPE_B,
+ eb.orig_stack_trace, eb.sort_stack_trace
+ from exceptions_b eb
+ left join exceptions_a ea on ea.id = eb.id
+ join profiles_a pa on pa.id=eb.id
+ join profiles_b pb on pb.id=eb.id
+ join containers c on pa.container_id=c.container_id
+ join mimes ma on ma.mime_id=pa.mime_id
+ join mimes mb on mb.mime_id=pb.mime_id
+ where ea.id is null
+ and eb.parse_exception_id=0
+ order by MIME_TYPE_A asc, MIME_TYPE_B asc, eb.ORIG_STACK_TRACE
+ </sql>
+ </report>
+
+ <report reportName="StackTracesByMimeInA"
+ reportFilename="exceptions/stack_traces_by_mime_A.xlsx"
+ format="xlsx"
+ includeSql="true">
+
+ <sql>
+ select MIME_STRING as MIME_TYPE, e.sort_stack_trace, count(1) as
+ COUNT
+ from exceptions_a e
+ join profiles_a p on p.id=e.id
+ join mimes m on m.mime_id=p.mime_id
+ and e.parse_exception_id=0
+ group by MIME_TYPE, e.sort_stack_trace
+ order by MIME_TYPE asc, COUNT desc
+ </sql>
+ </report>
+
+ <report reportName="AllStackTracesInA"
+ reportFilename="exceptions/stack_traces_A.xlsx"
+ format="xlsx"
+ includeSql="true">
+
+ <sql>
+ select file_path,
+ c.length as CONTAINER_LENGTH,
+ mime_string as MIME_TYPE,
+ orig_stack_trace, sort_stack_trace
+ from exceptions_a e
+ join profiles_a p on p.id=e.id
+ join containers c on p.container_id=c.container_id
+ join mimes m on m.mime_id=p.mime_id
+ and e.parse_exception_id=0
+ order by MIME_TYPE asc, sort_stack_trace, orig_stack_trace,
+ CONTAINER_LENGTH asc
+ </sql>
+ </report>
+ <report reportName="AllStackTracesInB"
+ reportFilename="exceptions/stack_traces_B.xlsx"
+ format="xlsx"
+ includeSql="true">
+
+ <sql>
+ select file_path,
+ c.length as CONTAINER_LENGTH,
+ mime_string as MIME_TYPE,
+ orig_stack_trace, sort_stack_trace
+ from exceptions_b e
+ join profiles_b p on p.id=e.id
+ join containers c on p.container_id=c.container_id
+ join mimes m on m.mime_id=p.mime_id
+ and e.parse_exception_id=0
+ order by MIME_TYPE asc, sort_stack_trace, orig_stack_trace,
+ CONTAINER_LENGTH asc
+ </sql>
+ </report>
+
+ <report reportName="StackTracesByMimeInB"
+ reportFilename="exceptions/stack_traces_by_mime_B.xlsx"
+ format="xlsx"
+ includeSql="true">
+
+ <sql>
+ select MIME_STRING as MIME_TYPE, e.sort_stack_trace, count(1) as
+ COUNT
+ from exceptions_b e
+ join profiles_b p on p.id=e.id
+ join mimes m on m.mime_id=p.mime_id
+ and e.parse_exception_id=0
+ group by MIME_TYPE, e.sort_stack_trace
+ order by MIME_TYPE asc, COUNT desc
+ </sql>
+ </report>
+ <report reportName="extractExceptionsA"
+ reportFilename="exceptions/extract_exceptions_a.xlsx"
+ format="xlsx"
+ includeSql="true">
+ <sql>
+ select file_path, extract_exception_description
+ from extract_exceptions_a e
+ join ref_extract_exception_types t
+ on e.extract_exception_id=t.extract_exception_id
+ </sql>
+ </report>
+ <report reportName="extractExceptionsB"
+ reportFilename="exceptions/extract_exceptions_b.xlsx"
+ format="xlsx"
+ includeSql="true">
+ <sql>
+ select file_path, extract_exception_description
+ from extract_exceptions_b e
+ join ref_extract_exception_types t
+ on e.extract_exception_id=t.extract_exception_id
+ </sql>
+ </report>
+ <report reportName="parseExceptionTypesA"
+ reportFilename="exceptions/overall_exception_types_a.xlsx"
+ format="xlsx"
+ includeSql="true">
+ <sql>
+ select parse_exception_description, count(1)
+ from exceptions_a e
+ join ref_parse_exception_types t on
+ t.parse_exception_id=e.parse_exception_id
+ group by t.parse_exception_description
+ </sql>
+ </report>
+ <report reportName="parseExceptionTypesB"
+ reportFilename="exceptions/overall_exception_types_b.xlsx"
+ format="xlsx"
+ includeSql="true">
+ <sql>
+ select parse_exception_description, count(1)
+ from exceptions_b e
+ join ref_parse_exception_types t on
+ t.parse_exception_id=e.parse_exception_id
+ group by t.parse_exception_description
+ </sql>
+ </report>
+
+ <report reportName="contentDiffsWExceptions"
+ reportFilename="content/content_diffs_with_exceptions.xlsx"
+ format="xlsx"
+ includeSql="true">
+
+ <sql>
+ select file_path,
+ c.length as CONTAINER_LENGTH,
+ ma.mime_string as MIME_STRING_A,
+ mb.mime_string as MIME_STRING_B,
+ ca.num_unique_tokens as NUM_UNIQUE_TOKENS_A,
+ cb.num_unique_tokens as NUM_UNIQUE_TOKENS_B,
+ ca.num_tokens as NUM_TOKENS_A,
+ cb.num_tokens as NUM_TOKENS_B,
+ ca.common_tokens_lang as COMMON_TOKENS_LANG_A,
+ ca.num_common_tokens as NUM_COMMON_TOKENS_A,
+ cb.common_tokens_lang as COMMON_TOKENS_LANG_B,
+ cb.num_common_tokens as NUM_COMMON_TOKENS_B,
+ coalesce(cb.num_common_tokens,0)-
+ coalesce(ca.num_common_tokens, 0) as NUM_COMMON_TOKENS_DIFF_IN_B,
+ ca.top_n_tokens as TOP_N_TOKENS_A,
+ cb.top_n_tokens as TOP_N_TOKENS_B,
+ ca.unicode_char_blocks as UNICODE_CHAR_BLOCKS_A,
+ cb.unicode_char_blocks as UNICODE_CHAR_BLOCKS_B,
+ top_10_unique_token_diffs_a,
+ top_10_unique_token_diffs_b,
+ top_10_more_in_a, top_10_more_in_b, dice_coefficient, overlap,
+ ref_ea.parse_exception_description as EXCEPTION_A,
+ ref_eb.parse_exception_description as EXCEPTION_B
+ from content_comparisons cc
+ join contents_a ca on ca.id=cc.id
+ left join contents_b cb on cb.id=cc.id
+ join profiles_a pa on pa.id = cc.id
+ join profiles_b pb on pb.id=cc.id
+ join containers c on c.container_id=pa.container_id
+ join mimes ma on ma.mime_id=pa.mime_id
+ join mimes mb on mb.mime_id=pb.mime_id
+ left join exceptions_a ea on ea.id=cc.id
+ left join exceptions_b eb on eb.id=cc.id
+ left join ref_parse_exception_types ref_ea on ref_ea.parse_exception_id=ea.parse_exception_id
+ left join ref_parse_exception_types ref_eb on ref_eb.parse_exception_id=eb.parse_exception_id
+ where (overlap < 0.95 or abs(ca.NUM_TOKENS-cb.NUM_TOKENS) >30)
+ and (ea.parse_exception_id is null or
+ ea.parse_exception_id <> 2)
+ and (eb.parse_exception_id is null or
+ eb.parse_exception_id <> 2)
+ order by ma.mime_string, overlap asc
+ limit 100000
+ </sql>
+ </report>
+ <report reportName="contentDiffsNoExceptions"
+ reportFilename="content/content_diffs_no_exceptions.xlsx"
+ format="xlsx"
+ includeSql="true">
+
+ <sql>
+ select file_path,
+ c.length as CONTAINER_LENGTH,
+ ma.mime_string as MIME_STRING_A,
+ mb.mime_string as MIME_STRING_B,
+ ca.NUM_UNIQUE_TOKENS as NUM_UNIQUE_TOKENS_A,
+ cb.NUM_UNIQUE_TOKENS as NUM_UNIQUE_TOKENS_B,
+ ca.NUM_TOKENS as NUM_TOKENS_A,
+ cb.NUM_TOKENS as NUM_TOKENS_B,
+ ca.common_tokens_lang as COMMON_TOKENS_LANG_A,
+ ca.num_common_tokens as NUM_COMMON_TOKENS_A,
+ cb.common_tokens_lang as COMMON_TOKENS_LANG_B,
+ cb.num_common_tokens as NUM_COMMON_TOKENS_B,
+ coalesce(cb.num_common_tokens,0)-
+ coalesce(ca.num_common_tokens, 0) as NUM_COMMON_TOKENS_DIFF_IN_B,
+ ca.top_n_tokens as TOP_N_TOKENS_A,
+ cb.top_n_tokens as TOP_N_TOKENS_B,
+ ca.unicode_char_blocks as UNICODE_CHAR_BLOCKS_A,
+ cb.unicode_char_blocks as UNICODE_CHAR_BLOCKS_B,
+ top_10_unique_token_diffs_a,
+ top_10_unique_token_diffs_b,
+ top_10_more_in_a, top_10_more_in_b, dice_coefficient, overlap
+ from content_comparisons cc
+ join contents_a ca on ca.id=cc.id
+ join contents_b cb on cb.id=cc.id
+ join profiles_a pa on pa.id = cc.id
+ join profiles_b pb on pb.id=cc.id
+ join containers c on c.container_id=pa.container_id
+ join mimes ma on ma.mime_id=pa.mime_id
+ join mimes mb on mb.mime_id=pb.mime_id
+ left join exceptions_a ea on ea.id=cc.id
+ left join exceptions_b eb on eb.id=cc.id
+ where (overlap < 0.95 or abs(ca.NUM_TOKENS-cb.NUM_TOKENS) >30)
+ and (ea.parse_exception_id is null)
+ and (eb.parse_exception_id is null)
+ order by ma.mime_string, overlap asc
+ limit 100000
+ </sql>
+ </report>
+
+ <report reportName="CommonTokenComparisonsByMimeType"
+ reportFilename="content/common_token_comparisons_by_mime.xlsx"
+ format="xlsx"
+ includeSql="true">
+
+ <sql>
+ select ma.mime_string as MIME_STRING_A, mb.mime_string as MIME_STRING_B,
+ num_tokens_a, num_tokens_b,
+ num_alphabetic_tokens_a, num_alphabetic_tokens_b,
+ num_common_tokens_a, num_common_tokens_b,
+ coalesce(num_common_tokens_b, 0)-coalesce(num_common_tokens_a, 0) as change_in_common_tokens_b
+ from token_counts_compared tcc
+ join mimes ma on tcc.mime_id_a = ma.mime_id
+ join mimes mb on tcc.mime_id_b = mb.mime_id
+ order by change_in_common_tokens_b desc
+ </sql>
+ </report>
+ <report reportName="PageCountDiffs"
+ reportFilename="content/page_count_diffs.xlsx"
+ format="xlsx"
+ includeSql="true">
+
+ <sql>
+ select file_path,
+ c.length as CONTAINER_LENGTH,
+ ma.mime_string as MIME_STRING_A,
+ mb.mime_string as MIME_STRING_B,
+ pa.num_pages as NUM_PAGES_A,
+ pb.num_pages as NUM_PAGES_B,
+ (pb.num_pages-pa.num_pages) as DIFF_NUM_PAGES_IN_B
+ from profiles_a pa
+ join profiles_b pb on pa.id = pb.id
+ join containers c on pa.container_id=c.container_id
+ join mimes ma on ma.mime_id=pa.mime_id
+ join mimes mb on mb.mime_id=pb.mime_id
+ where pa.num_pages is not null
+ and pb.num_pages is not null
+ and pa.num_pages <> pb.num_pages
+ order by DIFF_NUM_PAGES_IN_B asc
+ limit 10000;
+ </sql>
+ </report>
+
+
+ <report reportName="ExceptionComparisonsByMimeType"
+ reportFilename="exceptions/exceptions_compared_by_mime_type.xlsx"
+ format="xlsx"
+ includeSql="true">
+
+ <sql>
+ select ma.mime_string as mime_string_a, mb.mime_string as mime_string_b,
+ total, exc_cnt_a, exc_cnt_b,
+ exc_prcnt_a, exc_prcnt_b, notes
+ from exceptions_compared e
+ join mimes ma on ma.mime_id=e.mime_id_a
+ join mimes mb on mb.mime_id=e.mime_id_b
+ order by (exc_prcnt_b-exc_prcnt_a) desc, total desc;
+ </sql>
+ </report>
+ <!-- <report reportName="MD5 Duplicate Counts A"
+ reportFilename="md5/md5_duplicate_counts_A.xlsx"
+ format="xlsx"
+ includeSql="true">
+ <sql>
+ select md5, count(1) cnt
+ from profiles_a
+ group by md5
+ having cnt > 2
+ order by cnt desc
+ </sql>
+ </report>
+
+ <report reportName="MD5 Duplicate Counts B"
+ reportFilename="md5/md5_duplicate_counts_B.xlsx"
+ format="xlsx"
+ includeSql="true">
+
+ <sql>
+ select md5, count(1) cnt
+ from profiles_b
+ group by md5
+ having cnt > 2
+ order by cnt desc
+ </sql>
+ </report>
+
+ <report reportName="MD5 Duplicates A"
+ reportFilename="md5/md5_duplicates_A.xlsx"
+ format="xlsx"
+ includeSql="true">
+
+ <sql>
+ select file_path, file_name, is_embedded, content_length, NUM_TOKENS, p.md5
+ from md5_multiples_tmp_a t
+ join profiles_a p on p.md5 = t.md5
+ join containers c on p.container_id = c.container_id
+ join contents_a ca on p.id=ca.id
+ order by t.cnt desc
+ </sql>
+ </report>
+
+ <report reportName="MD5 Duplicates B"
+ reportFilename="md5/md5_duplicates_B.xlsx"
+ format="xlsx"
+ includeSql="true">
+
+ <sql>
+ select file_path, file_name, is_embedded, content_length, NUM_TOKENS, p.md5
+ from md5_multiples_tmp_b t
+ join profiles_b p on p.md5 = t.md5
+ join containers c on p.container_id = c.container_id
+ join contents_b cb on p.id=cb.id
+ order by t.cnt desc
+ </sql>
+ </report>
+ -->
+
+ <report reportName="Attachment Diffs no Exceptions"
+ reportFilename="attachments/attachment_diffs_no_exceptions.xlsx"
+ format="xlsx"
+ includeSql="true">
+
+ <sql>
+ select file_path,
+ c.length as CONTAINER_LENGTH,
+ ma.mime_string as MIME_STRING_A,
+ mb.mime_string as MIME_STRING_B,
+ pa.num_attachments as NUM_ATTACHMENTS_A,
+ pb.num_attachments as NUM_ATTACHMENTS_B,
+ pb.num_attachments-pa.num_attachments as NUM_ATTACHMENTS_DIFF_IN_B
+ from profiles_a pa
+ join profiles_b pb on pa.id= pb.id
+ join containers c on pa.container_id=c.container_id
+ join mimes ma on pa.mime_id=ma.mime_id
+ join mimes mb on pb.mime_id=mb.mime_id
+ left join exceptions_a ea on ea.id=pa.id
+ left join exceptions_b eb on eb.id=pb.id
+ where pa.is_embedded=false and
+ ea.parse_exception_id is null and
+ eb.parse_exception_id is null
+ and pa.num_attachments <> pb.num_attachments
+ order by ma.mime_string, pb.num_attachments-pa.num_attachments
+ limit 100000;
+ </sql>
+ </report>
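+ <!-- The companion report below keeps the rows this one filters out: it drops
+      the null-exception conditions and adds the parse exception description
+      for each run instead. -->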
+
+ <report reportName="Attachment Diffs with exceptions"
+ reportFilename="attachments/attachment_diffs_with_exceptions.xlsx"
+ format="xlsx"
+ includeSql="true">
+
+ <sql>
+ select file_path,
+ c.length as CONTAINER_LENGTH,
+ ma.mime_string as MIME_STRING_A,
+ mb.mime_string as MIME_STRING_B,
+ pa.num_attachments as NUM_ATTACHMENTS_A,
+ pb.num_attachments as NUM_ATTACHMENTS_B,
+ pb.num_attachments-pa.num_attachments as NUM_ATTACHMENTS_DIFF_IN_B,
+ refea.parse_exception_description as PARSE_EXCEPTION_A,
+ refeb.parse_exception_description as PARSE_EXCEPTION_B
+ from profiles_a pa
+ join profiles_b pb on pa.id= pb.id
+ join containers c on pa.container_id=c.container_id
+ join mimes ma on pa.mime_id=ma.mime_id
+ join mimes mb on pb.mime_id=mb.mime_id
+ left join exceptions_a ea on ea.id=pa.id
+ left join exceptions_b eb on eb.id=pb.id
+ left join ref_parse_exception_types refea on ea.parse_exception_id=refea.parse_exception_id
+ left join ref_parse_exception_types refeb on eb.parse_exception_id=refeb.parse_exception_id
+ where pa.is_embedded=false
+ and pa.num_attachments <> pb.num_attachments
+ order by ma.mime_string, pb.num_attachments-pa.num_attachments
+ limit 100000;
+ </sql>
+ </report>
+
+ <report reportName="Files missing in B by Mime"
+ reportFilename="attachments/all_files_missing_in_B_by_mime.xlsx"
+ format="xlsx"
+ includeSql="true">
+
+ <sql>
+ select mime_string, count(1) as cnt
+ from profiles_a pa
+ left join profiles_b pb on pa.id=pb.id
+ join mimes m on pa.mime_id=m.mime_id
+ where pb.id is null
+ group by mime_string
+ order by cnt desc
+ </sql>
+ </report>
+ <report reportName="Container files missing in B by Mime"
+ reportFilename="attachments/container_files_missing_in_B_by_mime.xlsx"
+ format="xlsx"
+ includeSql="true">
+
+ <sql>
+ select mime_string, count(1) as cnt
+ from profiles_a pa
+ left join profiles_b pb on pa.id=pb.id
+ join mimes m on pa.mime_id=m.mime_id
+ where pb.id is null and pa.is_embedded=false
+ group by mime_string
+ order by cnt desc
+ </sql>
+ </report>
+ <report reportName="Embedded files missing in B by Mime"
+ reportFilename="attachments/embedded_files_missing_in_B_by_mime.xlsx"
+ format="xlsx"
+ includeSql="true">
+
+ <sql>
+ select mime_string, count(1) as cnt
+ from profiles_a pa
+ left join profiles_b pb on pa.id=pb.id
+ join mimes m on pa.mime_id=m.mime_id
+ where pb.id is null and pa.is_embedded=true
+ group by mime_string
+ order by cnt desc
+ </sql>
+ </report>
+ <report reportName="All files missing in A by Mime"
+ reportFilename="attachments/all_files_missing_in_A_by_mime.xlsx"
+ format="xlsx"
+ includeSql="true">
+
+ <sql>
+ select mime_string, count(1) as cnt
+ from profiles_b pb
+ left join profiles_a pa on pb.id=pa.id
+ join mimes m on pb.mime_id=m.mime_id
+ where pa.id is null
+ group by mime_string
+ order by cnt desc
+ </sql>
+ </report>
+ <report reportName="Container files missing in A by Mime"
+ reportFilename="attachments/container_files_missing_in_A_by_mime.xlsx"
+ format="xlsx"
+ includeSql="true">
+
+ <sql>
+ select mime_string, count(1) as cnt
+ from profiles_b pb
+ left join profiles_a pa on pb.id=pa.id
+ join mimes m on pb.mime_id=m.mime_id
+ where pa.id is null and pb.is_embedded=false
+ group by mime_string
+ order by cnt desc
+ </sql>
+ </report>
+ <report reportName="Embedded files missing in A by Mime"
+ reportFilename="attachments/embedded_files_missing_in_A_by_mime.xlsx"
+ format="xlsx"
+ includeSql="true">
+
+ <sql>
+ select mime_string, count(1) as cnt
+ from profiles_b pb
+ left join profiles_a pa on pb.id=pa.id
+ join mimes m on pb.mime_id=m.mime_id
+ where pa.id is null and pb.is_embedded=true
+ group by mime_string
+ order by cnt desc
+ </sql>
+ </report>
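+ <!-- The six reports above are symmetric: files present in one run but missing
+      from the other, in both directions, each broken out as all files,
+      container files only, and embedded files only. -->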
+ <!-- metadata values -->
+ <report reportName="Metadata Value Diffs"
+ reportFilename="metadata/metadata_value_count_diffs.xlsx"
+ format="xlsx"
+ includeSql="true">
+
+ <sql>
+ select file_path,
+ ma.mime_string as mime_string_a,
+ mb.mime_string as mime_string_b,
+ pa.num_metadata_values as num_metadata_values_a,
+ pb.num_metadata_values as num_metadata_values_b,
+ ea.parse_exception_id as parse_ex_id_a,
+ eb.parse_exception_id as parse_ex_id_b
+ from profiles_a pa
+ join profiles_b pb on pa.id= pb.id
+ join containers c on pa.container_id=c.container_id
+ join mimes ma on pa.mime_id=ma.mime_id
+ join mimes mb on pb.mime_id=mb.mime_id
+ left join exceptions_a ea on ea.id=pa.id
+ left join exceptions_b eb on eb.id=pb.id
+ where
+ ea.parse_exception_id is null and
+ eb.parse_exception_id is null
+ and pa.num_metadata_values <> pb.num_metadata_values
+ order by ma.mime_string,
+ pb.num_metadata_values-pa.num_metadata_values
+ limit 100000
+ </sql>
+ </report>
+ <report reportName="Tag Count Diffs By Mime"
+ reportFilename="tags/tag_count_diffs_by_mime.xlsx"
+ format="xlsx"
+ includeSql="true">
+ <sql>
+ select ma.mime_string as mime_string_a,
+ mb.mime_string as mime_string_b,
+ tags_a_a,
+ tags_a_b,
+ tags_b_a,
+ tags_b_b,
+ tags_div_a,
+ tags_div_b,
+ tags_i_a,
+ tags_i_b,
+ tags_li_a,
+ tags_li_b,
+ tags_ol_a,
+ tags_ol_b,
+ tags_p_a,
+ tags_p_b,
+ tags_table_a,
+ tags_table_b,
+ tags_td_a,
+ tags_td_b,
+ tags_title_a,
+ tags_title_b,
+ tags_tr_a,
+ tags_tr_b,
+ tags_u_a,
+ tags_u_b,
+ tags_ul_a,
+ tags_ul_b
+ from
+ tags_by_mime tbm
+ join mimes ma on tbm.mime_id_a=ma.mime_id
+ join mimes mb on tbm.mime_id_b=mb.mime_id
+ limit 100000
+ </sql>
+
+ </report>
+ <report reportName="Tag Exceptions By Mime"
+ reportFilename="tags/tag_exceptions_by_mime.xlsx"
+ format="xlsx"
+ includeSql="true">
+ <sql>
+ select ma.mime_string as mime_string_a,
+ mb.mime_string as mime_string_b,
+ tag_exceptions_a,
+ tag_exceptions_b,
+ (tag_exceptions_b-tag_exceptions_a) as diff_tag_exceptions_in_b
+ from tag_exceptions_by_mime tebm
+ join mimes ma on tebm.mime_id_a=ma.mime_id
+ join mimes mb on tebm.mime_id_b=mb.mime_id
+ order by diff_tag_exceptions_in_b desc
+ </sql>
+ </report>
+ <report reportName="Tag Exceptions Details A"
+ reportFilename="tags/tag_exceptions_details_a.xlsx"
+ format="xlsx"
+ includeSql="true">
+ <sql>
+ select c.file_path, pa.file_name, mime_string, is_embedded from
+ tags_a ta
+ join profiles_a pa on ta.id=pa.id
+ join containers c on pa.container_id=c.container_id
+ join mimes m on pa.mime_id=m.mime_id
+ where ta.tags_parse_exception=true
+ order by m.mime_string
+ limit 20000
+ </sql>
+ </report>
+ <report reportName="Tag Exceptions Details B"
+ reportFilename="tags/tag_exceptions_details_b.xlsx"
+ format="xlsx"
+ includeSql="true">
+ <sql>
+ select c.file_path, pb.file_name, mime_string, is_embedded from
+ tags_b tb
+ join profiles_b pb on tb.id=pb.id
+ join containers c on pb.container_id=c.container_id
+ join mimes m on pb.mime_id=m.mime_id
+ where tb.tags_parse_exception=true
+ order by m.mime_string
+ limit 20000
+ </sql>
+ </report>
+
+ <report reportName="Parse Time (Millis) Compared"
+ reportFilename="parse_times/parse_time_millis_by_mime_compared.xlsx"
+ format="xlsx"
+ includeSql="true">
+ <sql>
+ select ma.mime_string as MIME_STRING_A,
+ mb.mime_string as MIME_STRING_B,
+ total_a as TOTAL_MILLIS_A, total_b as TOTAL_MILLIS_B,
+ prcnt_increase as PERCENT_INCREASE
+ from parse_time_compared ptc
+ join mimes ma on ptc.mime_id_a=ma.mime_id
+ join mimes mb on ptc.mime_id_b=mb.mime_id
+ where TOTAL_A > 1000 AND TOTAL_B > 1000 -- only show comparisons if > a second
+ order by prcnt_increase desc
+ </sql>
+ </report>
+ <report reportName="Parse Time (Millis) Details"
+ reportFilename="parse_times/parse_time_millis_details.xlsx"
+ format="xlsx"
+ includeSql="true">
+ <sql>
+ select file_path, c.length as CONTAINER_LENGTH,
+ ma.mime_string as MIME_STRING_A,
+ mb.mime_string as MIME_STRING_B,
+ pa.elapsed_time_millis as TOTAL_MILLIS_A,
+ pb.elapsed_time_millis as TOTAL_MILLIS_B,
+ (pb.elapsed_time_millis-pa.elapsed_time_millis) as DIFF_MILLIS
+ from profiles_a pa
+ join profiles_b pb on pa.id=pb.id
+ join mimes ma on ma.mime_id=pa.mime_id
+ join mimes mb on mb.mime_id=pb.mime_id
+ join containers c on pa.container_id=c.container_id
+ order by DIFF_MILLIS desc
+ limit 20000;
+ </sql>
+ </report>
+ <after>
+ <sql>drop table if exists md5_multiples_tmp_a</sql>
+ <sql>drop table if exists md5_multiples_tmp_b</sql>
+ </after>
+</reports>
\ No newline at end of file
diff --git a/tika-eval/tika-eval-app/src/main/resources/comparison-reports.xml b/tika-eval/tika-eval-app/src/main/resources/comparison-reports.xml
new file mode 100644
index 0000000..e23ec5e
--- /dev/null
+++ b/tika-eval/tika-eval-app/src/main/resources/comparison-reports.xml
@@ -0,0 +1,1753 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no" ?>
+
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+-->
+
+<reports>
+
+
+ <before>
+
+ <sql>drop table if exists md5_multiples_tmp_a</sql>
+ <sql>create table md5_multiples_tmp_a (MD5 char(32), cnt int)
+ as
+ select md5, count(1) cnt
+ from profiles_a
+ where md5 is not null
+ group by md5
+ having cnt > 1
+ order by cnt desc
+ </sql>
+
+ <sql>drop table if exists md5_multiples_tmp_b</sql>
+ <sql>create table md5_multiples_tmp_b (MD5 char(32), cnt int)
+ as
+ select md5, count(1) cnt
+ from profiles_b
+ where md5 is not null
+ group by md5
+ having cnt > 1
+ order by cnt desc
+ </sql>
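+ <!-- The two md5_multiples_tmp tables collect digests that occur more than once
+      in each run; they exist to back MD5 duplicate reports and are dropped
+      again once the reports have run. -->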
+ <!-- build mime indexes -->
+
+ <sql>create index if not exists pa_m_idx
+ on profiles_a (mime_id);
+ </sql>
+
+ <sql>
+ create index if not exists pb_m_idx
+ on profiles_b (mime_id);
+ </sql>
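+ <!-- The mime_id indexes above matter here: nearly every update and report
+      below correlates profiles_a/profiles_b rows by mime_id. -->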
+
+ <!-- build exceptions comparison table -->
+ <sql>drop table if exists exceptions_compared</sql>
+ <sql>
+ create table exceptions_compared (
+ mime_id_a integer,
+ mime_id_b integer,
+ total integer,
+ exc_cnt_a integer,
+ exc_cnt_b integer,
+ exc_prcnt_a float,
+ exc_prcnt_b float,
+ notes varchar(12)
+ );
+ </sql>
+ <sql>
+ insert into exceptions_compared (
+ select ma.mime_id, mb.mime_id, count(1) as total, 0, 0, 0.0, 0.0, ''
+ from profiles_a pa
+ join profiles_b pb on pa.id=pb.id
+ join mimes ma on pa.mime_id = ma.mime_id
+ join mimes mb on pb.mime_id = mb.mime_id
+ group by ma.mime_id, mb.mime_id
+ order by total desc );
+ </sql>
+
+ <sql>
+ update exceptions_compared ec set
+ exc_cnt_a = (
+ select count(1) as cnt
+ from exceptions_a ea
+ join profiles_a pa on ea.id=pa.id
+ join profiles_b pb on pb.id=pa.id
+ join mimes ma on pa.mime_id=ma.mime_id
+ join mimes mb on pb.mime_id=mb.mime_id
+ where ma.mime_id= ec.mime_id_a and mb.mime_id=ec.mime_id_b
+ group by ma.mime_id, mb.mime_id);
+ </sql>
+ <sql>
+ update exceptions_compared ec set
+ exc_cnt_b = (
+ select count(1) as cnt
+ from exceptions_b eb
+ join profiles_b pb on eb.id=pb.id
+ join profiles_a pa on pa.id=pb.id
+ join mimes ma on pa.mime_id=ma.mime_id
+ join mimes mb on pb.mime_id=mb.mime_id
+ where ma.mime_id= ec.mime_id_a and mb.mime_id=ec.mime_id_b
+ group by mb.mime_id, ma.mime_id);
+ </sql>
+ <sql>
+ update exceptions_compared
+ set exc_prcnt_a = cast(exc_cnt_a as decimal)/cast(total as decimal)
+ where total > 0;
+ </sql>
+ <sql>
+ update exceptions_compared
+ set exc_prcnt_b = cast(exc_cnt_b as decimal)/cast(total as decimal)
+ where total > 0;
+ </sql>
+
+ <sql>
+ update exceptions_compared
+ set notes = 'YAY!'
+ where total > 100 and (exc_prcnt_a-exc_prcnt_b) > 0.10;
+ </sql>
+ <sql>
+ update exceptions_compared
+ set notes = 'YIKES!'
+ where total > 100 and (exc_prcnt_b-exc_prcnt_a) > 0.10;
+ </sql>
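+ <!-- exceptions_compared is built in three passes: seed one row per
+      (mime_id_a, mime_id_b) pair with zeroed counts, fill exc_cnt_a/exc_cnt_b
+      with correlated subqueries, then derive the percentages and tag swings of
+      more than ten points ('YAY!'/'YIKES!') on mimes with over 100 files.
+      A quick sanity check on the finished table might look like this (a sketch,
+      not one of the shipped reports):
+          select total, exc_prcnt_a, exc_prcnt_b, notes
+          from exceptions_compared
+          where notes <> ''
+          order by total desc;
+ -->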
+
+ <!-- build tmp common words table -->
+ <sql>drop table if exists token_counts_compared</sql>
+ <sql>
+ create table token_counts_compared
+ (mime_id_a integer,
+ mime_id_b integer,
+ num_tokens_a long default 0,
+ num_tokens_b long default 0,
+ num_alphabetic_tokens_a long default 0,
+ num_alphabetic_tokens_b long default 0,
+ num_common_tokens_a long default 0,
+ num_common_tokens_b long default 0
+ );
+ </sql>
+ <sql>
+ insert into token_counts_compared (mime_id_a, mime_id_b)
+ select ma.mime_id, mb.mime_id
+ from profiles_a a
+ join profiles_b b on a.id=b.id
+ join mimes ma on ma.mime_id=a.mime_id
+ join mimes mb on mb.mime_id=b.mime_id
+ group by ma.mime_id, mb.mime_id
+
+ </sql>
+
+ <sql>
+ update token_counts_compared tcc set num_tokens_a=(
+ select sum(num_tokens) as cnt from profiles_a pa
+ join profiles_b pb on pa.id=pb.id
+ join contents_a c on c.id = pa.id
+ where pb.mime_id= tcc.mime_id_b
+ and pa.mime_id=tcc.mime_id_a
+ group by mime_id_a, mime_id_b
+ );
+ </sql>
+
+ <sql>
+ update token_counts_compared tcc set num_tokens_b=(
+ select sum(num_tokens) as cnt from profiles_b pb
+ join profiles_a pa on pa.id=pb.id
+ join contents_b c on c.id = pb.id
+ where pb.mime_id= tcc.mime_id_b
+ and pa.mime_id=tcc.mime_id_a
+ group by mime_id_a, mime_id_b
+ );
+ </sql>
+
+ <sql>
+ update token_counts_compared tcc set num_alphabetic_tokens_a=(
+ select sum(num_alphabetic_tokens) as cnt from profiles_a pa
+ join profiles_b pb on pa.id=pb.id
+ join contents_a c on c.id = pa.id
+ where pb.mime_id= tcc.mime_id_b
+ and pa.mime_id=tcc.mime_id_a
+ group by mime_id_a, mime_id_b
+ );
+ </sql>
+
+ <sql>
+ update token_counts_compared tcc set num_alphabetic_tokens_b=(
+ select sum(num_alphabetic_tokens) as cnt from profiles_b pb
+ join profiles_a pa on pb.id=pa.id
+ join contents_b c on c.id = pb.id
+ where pb.mime_id= tcc.mime_id_b
+ and pa.mime_id=tcc.mime_id_a
+ group by mime_id_a, mime_id_b
+ );
+ </sql>
+
+ <sql>
+ update token_counts_compared tcc set num_common_tokens_a=(
+ select sum(num_common_tokens) as cnt from profiles_a pa
+ join profiles_b pb on pa.id=pb.id
+ join contents_a c on c.id = pa.id
+ where pb.mime_id= tcc.mime_id_b
+ and pa.mime_id=tcc.mime_id_a
+ group by mime_id_a, mime_id_b
+ );
+ </sql>
+
+ <sql>
+ update token_counts_compared tcc set num_common_tokens_b=(
+ select sum(num_common_tokens) as cnt from profiles_b pb
+ join profiles_a pa on pa.id=pb.id
+ join contents_b c on c.id = pb.id
+ where pb.mime_id= tcc.mime_id_b
+ and pa.mime_id=tcc.mime_id_a
+ group by mime_id_a, mime_id_b
+ );
+ </sql>
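+ <!-- token_counts_compared follows the same recipe: seed the (mime_id_a,
+      mime_id_b) pairs, then run one correlated update per column to sum the
+      token, alphabetic-token, and common-token counts for each run. -->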
+
+ <sql>drop table if exists tags_by_mime</sql>
+ <sql>create table tags_by_mime (
+ mime_id_a integer,
+ mime_id_b integer,
+ tags_a_a integer,
+ tags_b_a integer,
+ tags_div_a integer,
+ tags_i_a integer,
+ tags_img_a integer,
+ tags_li_a integer,
+ tags_ol_a integer,
+ tags_p_a integer,
+ tags_table_a integer,
+ tags_td_a integer,
+ tags_title_a integer,
+ tags_tr_a integer,
+ tags_u_a integer,
+ tags_ul_a integer,
+ tags_a_b integer,
+ tags_b_b integer,
+ tags_div_b integer,
+ tags_i_b integer,
+ tags_img_b integer,
+ tags_li_b integer,
+ tags_ol_b integer,
+ tags_p_b integer,
+ tags_table_b integer,
+ tags_td_b integer,
+ tags_title_b integer,
+ tags_tr_b integer,
+ tags_u_b integer,
+ tags_ul_b integer
+ );
+ </sql>
+ <sql>
+ insert into tags_by_mime (mime_id_a, mime_id_b)
+ select ma.mime_id, mb.mime_id
+ from profiles_a a
+ join profiles_b b on a.id=b.id
+ join mimes ma on ma.mime_id=a.mime_id
+ join mimes mb on mb.mime_id=b.mime_id
+ group by ma.mime_id, mb.mime_id
+ </sql>
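+ <!-- Each tags_* column below is filled by the same update template,
+      instantiated once per HTML tag and per run. Rows where either run hit a
+      tag parse exception are excluded so the counts stay comparable. -->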
+ <sql>
+ update tags_by_mime tbm set tags_a_a=(
+ select sum(ta.tags_a) as cnt from tags_a ta
+ join tags_b tb on ta.id=tb.id
+ join profiles_a pa on pa.id=ta.id
+ join profiles_b pb on pa.id=pb.id
+ where pb.mime_id= tbm.mime_id_b
+ and pa.mime_id=tbm.mime_id_a
+ and ta.tags_parse_exception=false
+ and tb.tags_parse_exception=false
+ group by mime_id_a, mime_id_b
+ );
+ </sql>
+ <sql>
+ update tags_by_mime tbm set tags_b_a=(
+ select sum(ta.tags_b) as cnt from tags_a ta
+ join tags_b tb on ta.id=tb.id
+ join profiles_a pa on pa.id=ta.id
+ join profiles_b pb on pa.id=pb.id
+ where pb.mime_id= tbm.mime_id_b
+ and pa.mime_id=tbm.mime_id_a
+ and ta.tags_parse_exception=false
+ and tb.tags_parse_exception=false
+ group by mime_id_a, mime_id_b
+ );
+ </sql>
+ <sql>
+ update tags_by_mime tbm set tags_div_a=(
+ select sum(ta.tags_div) as cnt from tags_a ta
+ join tags_b tb on ta.id=tb.id
+ join profiles_a pa on pa.id=ta.id
+ join profiles_b pb on pa.id=pb.id
+ where pb.mime_id= tbm.mime_id_b
+ and pa.mime_id=tbm.mime_id_a
+ and ta.tags_parse_exception=false
+ and tb.tags_parse_exception=false
+ group by mime_id_a, mime_id_b
+ );
+ </sql>
+ <sql>
+ update tags_by_mime tbm set tags_i_a=(
+ select sum(ta.tags_i) as cnt from tags_a ta
+ join tags_b tb on ta.id=tb.id
+ join profiles_a pa on pa.id=ta.id
+ join profiles_b pb on pa.id=pb.id
+ where pb.mime_id= tbm.mime_id_b
+ and pa.mime_id=tbm.mime_id_a
+ and ta.tags_parse_exception=false
+ and tb.tags_parse_exception=false
+ group by mime_id_a, mime_id_b
+ );
+ </sql>
+ <sql>
+ update tags_by_mime tbm set tags_img_a=(
+ select sum(ta.tags_img) as cnt from tags_a ta
+ join tags_b tb on ta.id=tb.id
+ join profiles_a pa on pa.id=ta.id
+ join profiles_b pb on pa.id=pb.id
+ where pb.mime_id= tbm.mime_id_b
+ and pa.mime_id=tbm.mime_id_a
+ and ta.tags_parse_exception=false
+ and tb.tags_parse_exception=false
+ group by mime_id_a, mime_id_b
+ );
+ </sql>
+ <sql>
+ update tags_by_mime tbm set tags_li_a=(
+ select sum(ta.tags_li) as cnt from tags_a ta
+ join tags_b tb on ta.id=tb.id
+ join profiles_a pa on pa.id=ta.id
+ join profiles_b pb on pa.id=pb.id
+ where pb.mime_id= tbm.mime_id_b
+ and pa.mime_id=tbm.mime_id_a
+ and ta.tags_parse_exception=false
+ and tb.tags_parse_exception=false
+ group by mime_id_a, mime_id_b
+ );
+ </sql>
+ <sql>
+ update tags_by_mime tbm set tags_ol_a=(
+ select sum(ta.tags_ol) as cnt from tags_a ta
+ join tags_b tb on ta.id=tb.id
+ join profiles_a pa on pa.id=ta.id
+ join profiles_b pb on pa.id=pb.id
+ where pb.mime_id= tbm.mime_id_b
+ and pa.mime_id=tbm.mime_id_a
+ and ta.tags_parse_exception=false
+ and tb.tags_parse_exception=false
+ group by mime_id_a, mime_id_b
+ );
+ </sql>
+ <sql>
+ update tags_by_mime tbm set tags_p_a=(
+ select sum(ta.tags_p) as cnt from tags_a ta
+ join tags_b tb on ta.id=tb.id
+ join profiles_a pa on pa.id=ta.id
+ join profiles_b pb on pa.id=pb.id
+ where pb.mime_id= tbm.mime_id_b
+ and pa.mime_id=tbm.mime_id_a
+ and ta.tags_parse_exception=false
+ and tb.tags_parse_exception=false
+ group by mime_id_a, mime_id_b
+ );
+ </sql>
+ <sql>
+ update tags_by_mime tbm set tags_table_a=(
+ select sum(ta.tags_table) as cnt from tags_a ta
+ join tags_b tb on ta.id=tb.id
+ join profiles_a pa on pa.id=ta.id
+ join profiles_b pb on pa.id=pb.id
+ where pb.mime_id= tbm.mime_id_b
+ and pa.mime_id=tbm.mime_id_a
+ and ta.tags_parse_exception=false
+ and tb.tags_parse_exception=false
+ group by mime_id_a, mime_id_b
+ );
+ </sql>
+ <sql>
+ update tags_by_mime tbm set tags_td_a=(
+ select sum(ta.tags_td) as cnt from tags_a ta
+ join tags_b tb on ta.id=tb.id
+ join profiles_a pa on pa.id=ta.id
+ join profiles_b pb on pa.id=pb.id
+ where pb.mime_id= tbm.mime_id_b
+ and pa.mime_id=tbm.mime_id_a
+ and ta.tags_parse_exception=false
+ and tb.tags_parse_exception=false
+ group by mime_id_a, mime_id_b
+ );
+ </sql>
+ <sql>
+ update tags_by_mime tbm set tags_title_a=(
+ select sum(ta.tags_title) as cnt from tags_a ta
+ join tags_b tb on ta.id=tb.id
+ join profiles_a pa on pa.id=ta.id
+ join profiles_b pb on pa.id=pb.id
+ where pb.mime_id= tbm.mime_id_b
+ and pa.mime_id=tbm.mime_id_a
+ and ta.tags_parse_exception=false
+ and tb.tags_parse_exception=false
+ group by mime_id_a, mime_id_b
+ );
+ </sql>
+ <sql>
+ update tags_by_mime tbm set tags_tr_a=(
+ select sum(ta.tags_tr) as cnt from tags_a ta
+ join tags_b tb on ta.id=tb.id
+ join profiles_a pa on pa.id=ta.id
+ join profiles_b pb on pa.id=pb.id
+ where pb.mime_id= tbm.mime_id_b
+ and pa.mime_id=tbm.mime_id_a
+ and ta.tags_parse_exception=false
+ and tb.tags_parse_exception=false
+ group by mime_id_a, mime_id_b
+ );
+ </sql>
+ <sql>
+ update tags_by_mime tbm set tags_u_a=(
+ select sum(ta.tags_u) as cnt from tags_a ta
+ join tags_b tb on ta.id=tb.id
+ join profiles_a pa on pa.id=ta.id
+ join profiles_b pb on pa.id=pb.id
+ where pb.mime_id= tbm.mime_id_b
+ and pa.mime_id=tbm.mime_id_a
+ and ta.tags_parse_exception=false
+ and tb.tags_parse_exception=false
+ group by mime_id_a, mime_id_b
+ );
+ </sql>
+ <sql>
+ update tags_by_mime tbm set tags_ul_a=(
+ select sum(ta.tags_ul) as cnt from tags_a ta
+ join tags_b tb on ta.id=tb.id
+ join profiles_a pa on pa.id=ta.id
+ join profiles_b pb on pa.id=pb.id
+ where pb.mime_id= tbm.mime_id_b
+ and pa.mime_id=tbm.mime_id_a
+ and ta.tags_parse_exception=false
+ and tb.tags_parse_exception=false
+ group by mime_id_a, mime_id_b
+ );
+ </sql>
+ <!-- now update tags_b counts -->
+ <sql>
+ update tags_by_mime tbm set tags_a_b=(
+ select sum(tb.tags_a) as cnt from tags_b tb
+ join tags_a ta on ta.id=tb.id
+ join profiles_a pa on pa.id=ta.id
+ join profiles_b pb on pa.id=pb.id
+ where pb.mime_id= tbm.mime_id_b
+ and pa.mime_id=tbm.mime_id_a
+ and ta.tags_parse_exception=false
+ and tb.tags_parse_exception=false
+ group by mime_id_a, mime_id_b
+ );
+ </sql>
+ <sql>
+ update tags_by_mime tbm set tags_b_b=(
+ select sum(tb.tags_b) as cnt from tags_b tb
+ join tags_a ta on ta.id=tb.id
+ join profiles_a pa on pa.id=ta.id
+ join profiles_b pb on pa.id=pb.id
+ where pb.mime_id= tbm.mime_id_b
+ and pa.mime_id=tbm.mime_id_a
+ and ta.tags_parse_exception=false
+ and tb.tags_parse_exception=false
+ group by mime_id_a, mime_id_b
+ );
+ </sql>
+ <sql>
+ update tags_by_mime tbm set tags_div_b=(
+ select sum(tb.tags_div) as cnt from tags_b tb
+ join tags_a ta on tb.id=ta.id
+ join profiles_a pa on pa.id=ta.id
+ join profiles_b pb on pa.id=pb.id
+ where pb.mime_id= tbm.mime_id_b
+ and pa.mime_id=tbm.mime_id_a
+ and ta.tags_parse_exception=false
+ and tb.tags_parse_exception=false
+ group by mime_id_a, mime_id_b
+ );
+ </sql>
+ <sql>
+ update tags_by_mime tbm set tags_i_b=(
+ select sum(tb.tags_i) as cnt from tags_b tb
+ join tags_a ta on tb.id=ta.id
+ join profiles_a pa on pa.id=ta.id
+ join profiles_b pb on pa.id=pb.id
+ where pb.mime_id= tbm.mime_id_b
+ and pa.mime_id=tbm.mime_id_a
+ and ta.tags_parse_exception=false
+ and tb.tags_parse_exception=false
+ group by mime_id_a, mime_id_b
+ );
+ </sql>
+ <sql>
+ update tags_by_mime tbm set tags_img_b=(
+ select sum(tb.tags_img) as cnt from tags_b tb
+ join tags_a ta on tb.id=ta.id
+ join profiles_a pa on pa.id=ta.id
+ join profiles_b pb on pa.id=pb.id
+ where pb.mime_id= tbm.mime_id_b
+ and pa.mime_id=tbm.mime_id_a
+ and ta.tags_parse_exception=false
+ and tb.tags_parse_exception=false
+ group by mime_id_a, mime_id_b
+ );
+ </sql>
+ <sql>
+ update tags_by_mime tbm set tags_li_b=(
+ select sum(tb.tags_li) as cnt from tags_b tb
+ join tags_a ta on ta.id=tb.id
+ join profiles_a pa on pa.id=ta.id
+ join profiles_b pb on pa.id=pb.id
+ where pb.mime_id= tbm.mime_id_b
+ and pa.mime_id=tbm.mime_id_a
+ and ta.tags_parse_exception=false
+ and tb.tags_parse_exception=false
+ group by mime_id_a, mime_id_b
+ );
+ </sql>
+ <sql>
+ update tags_by_mime tbm set tags_ol_b=(
+ select sum(tb.tags_ol) as cnt from tags_b tb
+ join tags_a ta on tb.id=ta.id
+ join profiles_a pa on pa.id=ta.id
+ join profiles_b pb on pa.id=pb.id
+ where pb.mime_id= tbm.mime_id_b
+ and pa.mime_id=tbm.mime_id_a
+ and ta.tags_parse_exception=false
+ and tb.tags_parse_exception=false
+ group by mime_id_a, mime_id_b
+ );
+ </sql>
+ <sql>
+ update tags_by_mime tbm set tags_p_b=(
+ select sum(tb.tags_p) as cnt from tags_b tb
+ join tags_a ta on tb.id=ta.id
+ join profiles_a pa on pa.id=ta.id
+ join profiles_b pb on pa.id=pb.id
+ where pb.mime_id= tbm.mime_id_b
+ and pa.mime_id=tbm.mime_id_a
+ and ta.tags_parse_exception=false
+ and tb.tags_parse_exception=false
+ group by mime_id_a, mime_id_b
+ );
+ </sql>
+ <sql>
+ update tags_by_mime tbm set tags_table_b=(
+ select sum(tb.tags_table) as cnt from tags_b tb
+ join tags_a ta on tb.id=ta.id
+ join profiles_a pa on pa.id=ta.id
+ join profiles_b pb on pa.id=pb.id
+ where pb.mime_id= tbm.mime_id_b
+ and pa.mime_id=tbm.mime_id_a
+ and ta.tags_parse_exception=false
+ and tb.tags_parse_exception=false
+ group by mime_id_a, mime_id_b
+ );
+ </sql>
+ <sql>
+ update tags_by_mime tbm set tags_td_b=(
+ select sum(tb.tags_td) as cnt from tags_b tb
+ join tags_a ta on tb.id=ta.id
+ join profiles_a pa on pa.id=ta.id
+ join profiles_b pb on pa.id=pb.id
+ where pb.mime_id= tbm.mime_id_b
+ and pa.mime_id=tbm.mime_id_a
+ and ta.tags_parse_exception=false
+ and tb.tags_parse_exception=false
+ group by mime_id_a, mime_id_b
+ );
+ </sql>
+ <sql>
+ update tags_by_mime tbm set tags_title_b=(
+ select sum(tb.tags_title) as cnt from tags_b tb
+ join tags_a ta on tb.id=ta.id
+ join profiles_a pa on pa.id=ta.id
+ join profiles_b pb on pa.id=pb.id
+ where pb.mime_id= tbm.mime_id_b
+ and pa.mime_id=tbm.mime_id_a
+ and ta.tags_parse_exception=false
+ and tb.tags_parse_exception=false
+ group by mime_id_a, mime_id_b
+ );
+ </sql>
+ <sql>
+ update tags_by_mime tbm set tags_tr_b=(
+ select sum(tb.tags_tr) as cnt from tags_b tb
+ join tags_a ta on tb.id=ta.id
+ join profiles_a pa on pa.id=ta.id
+ join profiles_b pb on pa.id=pb.id
+ where pb.mime_id= tbm.mime_id_b
+ and pa.mime_id=tbm.mime_id_a
+ and ta.tags_parse_exception=false
+ and tb.tags_parse_exception=false
+ group by mime_id_a, mime_id_b
+ );
+ </sql>
+ <sql>
+ update tags_by_mime tbm set tags_u_b=(
+ select sum(tb.tags_u) as cnt from tags_b tb
+ join tags_a ta on tb.id=ta.id
+ join profiles_a pa on pa.id=ta.id
+ join profiles_b pb on pa.id=pb.id
+ where pb.mime_id= tbm.mime_id_b
+ and pa.mime_id=tbm.mime_id_a
+ and ta.tags_parse_exception=false
+ and tb.tags_parse_exception=false
+ group by mime_id_a, mime_id_b
+ );
+ </sql>
+ <sql>
+ update tags_by_mime tbm set tags_ul_b=(
+ select sum(tb.tags_ul) as cnt from tags_b tb
+ join tags_a ta on ta.id=tb.id
+ join profiles_a pa on pa.id=ta.id
+ join profiles_b pb on pa.id=pb.id
+ where pb.mime_id= tbm.mime_id_b
+ and pa.mime_id=tbm.mime_id_a
+ and ta.tags_parse_exception=false
+ and tb.tags_parse_exception=false
+ group by mime_id_a, mime_id_b
+ );
+ </sql>
+ <sql>drop table if exists tag_exceptions_by_mime</sql>
+ <sql>create table tag_exceptions_by_mime (
+ mime_id_a integer,
+ mime_id_b integer,
+ tag_exceptions_a integer,
+ tag_exceptions_b integer)
+ </sql>
+ <sql>
+ insert into tag_exceptions_by_mime (mime_id_a, mime_id_b,
+ tag_exceptions_a, tag_exceptions_b)
+ select ma.mime_id, mb.mime_id,0,0
+ from profiles_a a
+ join profiles_b b on a.id=b.id
+ join mimes ma on ma.mime_id=a.mime_id
+ join mimes mb on mb.mime_id=b.mime_id
+ group by ma.mime_id, mb.mime_id
+ </sql>
+ <sql>
+ update tag_exceptions_by_mime tebm set tag_exceptions_a=(
+ select count(1) as cnt from tags_a ta
+ join profiles_a pa on pa.id=ta.id
+ join profiles_b pb on pa.id=pb.id
+ where pb.mime_id= tebm.mime_id_b
+ and pa.mime_id=tebm.mime_id_a
+ and ta.tags_parse_exception=true
+ group by mime_id_a, mime_id_b
+ );
+ </sql>
+ <sql>
+ update tag_exceptions_by_mime tebm set tag_exceptions_b=(
+ select count(1) as cnt from tags_b tb
+ join profiles_a pa on pa.id=tb.id
+ join profiles_b pb on pa.id=pb.id
+ where pb.mime_id= tebm.mime_id_b
+ and pa.mime_id=tebm.mime_id_a
+ and tb.tags_parse_exception=true
+ group by mime_id_a, mime_id_b
+ );
+ </sql>
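+ <!-- tag_exceptions_by_mime is the exception-side counterpart: per mime pair,
+      how many files failed tag parsing in each run. -->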
+ <sql>
+ drop table if exists parse_time_compared;
+ </sql>
+ <sql>
+ create table parse_time_compared (
+ mime_id_a integer,
+ mime_id_b integer,
+ total_a bigint,
+ total_b bigint,
+ prcnt_increase double
+ );
+ </sql>
+ <sql>
+ insert into parse_time_compared (mime_id_a, mime_id_b,
+ total_a, total_b, prcnt_increase)
+ select ma.mime_id, mb.mime_id,0,0,0.0
+ from profiles_a a
+ join profiles_b b on a.id=b.id
+ join mimes ma on ma.mime_id=a.mime_id
+ join mimes mb on mb.mime_id=b.mime_id
+ group by ma.mime_id, mb.mime_id
+ </sql>
+ <sql>
+ update parse_time_compared ptc set total_a=(
+ select sum(pa.elapsed_time_millis) as total_a from profiles_a pa
+ join profiles_b pb on pa.id=pb.id
+ where pa.mime_id= ptc.mime_id_a
+ and pb.mime_id=ptc.mime_id_b
+ group by mime_id_a, mime_id_b)
+ </sql>
+ <sql>
+ update parse_time_compared ptc set total_b=(
+ select sum(pb.elapsed_time_millis) as total_b from profiles_b pb
+ join profiles_a pa on pa.id=pb.id
+ where pa.mime_id= ptc.mime_id_a
+ and pb.mime_id=ptc.mime_id_b
+ group by mime_id_a, mime_id_b)
+ </sql>
+ <sql>
+ update parse_time_compared ptc set prcnt_increase=(100.0 *
+ (cast(total_b as decimal)-cast(total_a as decimal))/cast(total_a as decimal))
+ where total_a > 0;
+ </sql>
+ </before>
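+ <!-- Everything above builds the derived comparison tables; the reports below
+      only read from them. -->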
+
+ <!-- MIMES -->
+ <report reportName="All Mimes In A"
+ reportFilename="mimes/all_mimes_A.xlsx"
+ format="xlsx"
+ includeSql="true">
+
+ <sql>
+ select mime_string, count(1) cnt from
+ profiles_a p
+ join mimes m on m.mime_id = p.mime_id
+ group by mime_string
+ order by cnt desc
+ </sql>
+ </report>
+
+ <report reportName="All Mimes In B"
+ reportFilename="mimes/all_mimes_B.xlsx"
+ format="xlsx"
+ includeSql="true">
+
+ <sql>
+ select mime_string, count(1) cnt from
+ profiles_b p
+ join mimes m on m.mime_id = p.mime_id
+ group by mime_string
+ order by cnt desc
+ </sql>
+ </report>
+ <report reportName="Container Mimes In A"
+ reportFilename="mimes/container_mimes_A.xlsx"
+ format="xlsx"
+ includeSql="true">
+
+ <sql>
+ select mime_string, count(1) cnt from
+ profiles_a p
+ join mimes m on m.mime_id = p.mime_id
+ where is_embedded=false
+ group by mime_string
+ order by cnt desc
+ </sql>
+ </report>
+
+ <report reportName="Container Mimes In B"
+ reportFilename="mimes/container_mimes_B.xlsx"
+ format="xlsx"
+ includeSql="true">
+
+ <sql>
+ select mime_string, count(1) cnt from
+ profiles_b p
+ join mimes m on m.mime_id = p.mime_id
+ where is_embedded=false
+ group by mime_string
+ order by cnt desc
+ </sql>
+ </report>
+ <report reportName="Embedded Mimes In A"
+ reportFilename="mimes/embedded_mimes_A.xlsx"
+ format="xlsx"
+ includeSql="true">
+
+ <sql>
+ select mime_string, count(1) cnt from
+ profiles_a p
+ join mimes m on m.mime_id = p.mime_id
+ where is_embedded=true
+ group by mime_string
+ order by cnt desc
+ </sql>
+ </report>
+
+ <report reportName="Embedded Mimes In B"
+ reportFilename="mimes/embedded_mimes_B.xlsx"
+ format="xlsx"
+ includeSql="true">
+
+ <sql>
+ select mime_string, count(1) cnt from
+ profiles_b p
+ join mimes m on m.mime_id = p.mime_id
+ where is_embedded=true
+ group by mime_string
+ order by cnt desc
+ </sql>
+ </report>
+ <report reportName="Mime Differences A -> B"
+ reportFilename="mimes/mime_diffs_A_to_B.xlsx"
+ format="xlsx"
+ includeSql="true">
+
+ <sql>
+ select concat(ma.mime_string, ' -> ', mb.mime_string) as
+ MIME_A_TO_MIME_B, count(1) as COUNT
+ from profiles_a a
+ join profiles_b b on a.id=b.id
+ join mimes ma on ma.mime_id=a.mime_id
+ join mimes mb on mb.mime_id=b.mime_id
+ where a.mime_id <> b.mime_id
+ group by MIME_A_TO_MIME_B
+ order by COUNT DESC
+ </sql>
+ </report>
+
+ <report reportName="Mime Differences A -> B Details"
+ reportFilename="mimes/mime_diffs_A_to_B_details.xlsx"
+ format="xlsx"
+ includeSql="true">
+
+ <sql>
+ select concat(ma.mime_string, ' -> ', mb.mime_string) as
+ MIME_A_TO_MIME_B,
+ file_path,
+ c.length as CONTAINER_LENGTH,
+ a.file_name
+ from profiles_a a
+ join profiles_b b on a.id=b.id
+ join mimes ma on ma.mime_id=a.mime_id
+ join mimes mb on mb.mime_id=b.mime_id
+ join containers c on a.container_id=c.container_id
+ where a.mime_id <> b.mime_id
+ order by MIME_A_TO_MIME_B
+ </sql>
+ </report>
+
+
+ <!-- Exceptions -->
+ <report reportName="AllExceptionsByMimeA"
+ reportFilename="exceptions/exceptions_by_mime_A.xlsx"
+ format="xlsx"
+ includeSql="true">
+
+ <sql>
+ select mime_string, count(1) cnt from
+ exceptions_a e
+ join profiles_a p on p.id=e.id
+ join mimes m on m.mime_id = p.mime_id
+ group by mime_string
+ order by cnt desc
+ </sql>
+ </report>
+ <report reportName="AllExceptionsByMimeB"
+ reportFilename="exceptions/exceptions_by_mime_B.xlsx"
+ format="xlsx"
+ includeSql="true">
+
+ <sql>
+ select mime_string, count(1) cnt from
+ exceptions_b e
+ join profiles_b p on p.id=e.id
+ join mimes m on m.mime_id = p.mime_id
+ group by mime_string
+ order by cnt desc
+ </sql>
+ </report>
+
+ <report reportName="ContainerExceptionsByMimeA"
+ reportFilename="exceptions/container_exceptions_by_mime_A.xlsx"
+ format="xlsx"
+ includeSql="true">
+
+ <sql>
+ select mime_string, count(1) cnt from
+ exceptions_a e
+ join profiles_a p on p.id=e.id
+ join mimes m on m.mime_id = p.mime_id
+ where is_embedded=false
+ and parse_exception_id=0
+ group by mime_string
+ order by cnt desc
+ </sql>
+ </report>
+
+ <report reportName="ContainerExceptionsByMimeB"
+ reportFilename="exceptions/container_exceptions_by_mime_B.xlsx"
+ format="xlsx"
+ includeSql="true">
+
+ <sql>
+ select mime_string, count(1) cnt from
+ exceptions_b e
+ join profiles_b p on p.id=e.id
+ join mimes m on m.mime_id = p.mime_id
+ where is_embedded=false
+ and parse_exception_id=0
+ group by mime_string
+ order by cnt desc
+ </sql>
+ </report>
+ <report reportName="AllExceptionsByMimeByTypeA"
+ reportFilename="exceptions/exceptions_by_mime_by_type_A.xlsx"
+ format="xlsx"
+ includeSql="true">
+
+ <sql>
+ select mime_string as MIME_TYPE,
+ parse_exception_description as EXCEPTION_TYPE, count(1) as COUNT
+ from exceptions_a e
+ join profiles_a p on p.id=e.id
+ join containers c on p.container_id=c.container_id
+ join mimes m on m.mime_id=p.mime_id
+ join ref_parse_exception_types r on
+ r.parse_exception_id=e.parse_exception_id
+ group by p.mime_id, parse_exception_description
+ order by MIME_TYPE, EXCEPTION_TYPE
+ </sql>
+ </report>
+
+ <report reportName="AllExceptionsByMimeByTypeB"
+ reportFilename="exceptions/exceptions_by_mime_by_type_B.xlsx"
+ format="xlsx"
+ includeSql="true">
+
+ <sql>
+ select mime_string as MIME_TYPE,
+ parse_exception_description as EXCEPTION_TYPE, count(1) as COUNT
+ from exceptions_b e
+ join profiles_b p on p.id=e.id
+ join containers c on p.container_id=c.container_id
+ join mimes m on m.mime_id=p.mime_id
+ join ref_parse_exception_types r on
+ r.parse_exception_id=e.parse_exception_id
+ group by p.mime_id, parse_exception_description
+ order by MIME_TYPE, EXCEPTION_TYPE
+ </sql>
+ </report>
+
+ <report reportName="TextLostFromACausedByNewExceptionsInB"
+ reportFilename="exceptions/text_lost_from_A_caused_by_new_exceptions_in_B.xlsx"
+ format="xlsx"
+ includeSql="true">
+
+ <sql>
+ select file_path as FILE_PATH,
+ c.length as CONTAINER_LENGTH,
+ ca.NUM_TOKENS as NUM_TOKENS_A,
+ cb.NUM_TOKENS as NUM_TOKENS_B,
+ ca.NUM_UNIQUE_TOKENS as NUM_UNIQUE_TOKENS_A,
+ cb.NUM_UNIQUE_TOKENS as NUM_UNIQUE_TOKENS_B,
+ ca.common_tokens_lang as COMMON_TOKENS_LANG_A,
+ ca.num_common_tokens as NUM_COMMON_TOKENS_A,
+ cb.common_tokens_lang as COMMON_TOKENS_LANG_B,
+ cb.num_common_tokens as NUM_COMMON_TOKENS_B,
+ ca.top_n_tokens as TOP_N_TOKENS_A, cb.top_n_tokens as TOP_N_TOKENS_B,
+ eb.ORIG_STACK_TRACE as ORIG_STACK_TRACE_B
+ from contents_a ca
+ join profiles_a pa on ca.id = pa.id
+ join containers c on pa.container_id=c.container_id
+ left join contents_b cb on ca.id=cb.id
+ left join exceptions_b eb on ca.id = eb.id
+ left join exceptions_a ea on ca.id = ea.id
+ where eb.orig_stack_trace is not null
+ and ea.orig_stack_trace is null
+ order by ca.num_common_tokens - ifnull(cb.num_common_tokens,0) desc
+ </sql>
+ </report>
+
+ <report reportName="FixedExceptionsInBByMimeType"
+ reportFilename="exceptions/fixed_exceptions_in_B_by_mime.xlsx"
+ format="xlsx"
+ includeSql="true">
+
+ <sql>
+ select
+ ma.mime_string as MIME_TYPE_A,
+ mb.mime_string as MIME_TYPE_B,
+ count(1) as COUNT
+ from exceptions_a ea
+ left join exceptions_b eb on ea.id = eb.id
+ join profiles_a pa on pa.id=ea.id
+ join profiles_b pb on pa.id=pb.id
+ join containers c on pa.container_id=c.container_id
+ join mimes ma on ma.mime_id=pa.mime_id
+ join mimes mb on mb.mime_id=pb.mime_id
+ where eb.id is null
+ and ea.parse_exception_id=0
+ group by mime_type_a, mime_type_b
+ </sql>
+ </report>
+
+ <report reportName="FixedExceptionsInByDetails"
+ reportFilename="exceptions/fixed_exceptions_in_B_details.xlsx"
+ format="xlsx"
+ includeSql="true">
+ <sql>
+ select
+ file_path,
+ c.length as CONTAINER_LENGTH,
+ ma.mime_string as MIME_TYPE_A,
+ mb.mime_string as MIME_TYPE_B,
+ pa.file_name, pa.is_embedded
+ from exceptions_a ea
+ left join exceptions_b eb on ea.id = eb.id
+ join profiles_a pa on pa.id=ea.id
+ join profiles_b pb on pb.id=pa.id -- this ensures that files were actually processed in both runs
+ join containers c on pa.container_id=c.container_id
+ join mimes ma on ma.mime_id=pa.mime_id
+ join mimes mb on mb.mime_id=pb.mime_id
+ where eb.id is null
+ and ea.parse_exception_id=0
+ order by mime_type_a, mime_type_b
+ </sql>
+ </report>
+ <report reportName="ContentsOfFixedExceptionsInB"
+ reportFilename="exceptions/contents_of_fixed_exceptions_in_B.xlsx"
+ format="xlsx"
+ includeSql="true">
+
+ <sql>
+ select file_path,
+ c.length as CONTAINER_LENGTH,
+ ma.mime_string as MIME_TYPE_A,
+ mb.mime_string as MIME_TYPE_B,
+ CONTENT_LENGTH,
+ NUM_TOKENS, NUM_UNIQUE_TOKENS,
+ TOP_N_TOKENS, LANG_ID_1,TOKEN_LENGTH_MEAN, TOKEN_LENGTH_STD_DEV
+ from exceptions_a ea
+ left join exceptions_b eb on ea.id = eb.id
+ join profiles_a pa on pa.id=ea.id
+ join profiles_b pb on pa.id=pb.id
+ join contents_b cb on cb.id=ea.id
+ join containers c on pa.container_id=c.container_id
+ join mimes ma on ma.mime_id=pa.mime_id
+ join mimes mb on mb.mime_id=pb.mime_id
+ where eb.id is null
+ and ea.parse_exception_id=0
+ </sql>
+ </report>
+
+ <report reportName="NewExceptionsByMimeType"
+ reportFilename="exceptions/new_exceptions_in_B_by_mime.xlsx"
+ format="xlsx"
+ includeSql="true">
+
+ <sql>
+ select ma.mime_string as MIME_TYPE_A, mb.mime_string as MIME_TYPE_B, count(1) as COUNT
+ from exceptions_b eb
+ left join exceptions_a ea on ea.id = eb.id
+ join profiles_a pa on pa.id=eb.id
+ join profiles_b pb on pb.id=pa.id
+ join containers c on pa.container_id=c.container_id
+ join mimes ma on ma.mime_id=pa.mime_id
+ join mimes mb on mb.mime_id=pb.mime_id
+ where ea.id is null
+ and eb.parse_exception_id=0
+ group by ma.mime_string, mb.mime_string
+ order by COUNT desc
+ </sql>
+ </report>
+
+ <report reportName="NewExceptionsInBByMimeTypeByStackTrace"
+ reportFilename="exceptions/new_exceptions_in_B_by_mime_by_stack_trace.xlsx"
+ format="xlsx"
+ includeSql="true">
... 15475 lines suppressed ...