Posted to commits@tika.apache.org by ta...@apache.org on 2020/11/30 19:45:18 UTC
[tika] branch main updated: TIKA-3240 -- break tika-eval into tika-eval-core and tika-eval-app for Tika 2.0.0
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 219338f TIKA-3240 -- break tika-eval into tika-eval-core and tika-eval-app for Tika 2.0.0
219338f is described below
commit 219338f0925ef55a4748906e14a757a05b94ce35
Author: tallison <ta...@apache.org>
AuthorDate: Mon Nov 30 14:44:46 2020 -0500
TIKA-3240 -- break tika-eval into tika-eval-core and tika-eval-app for Tika 2.0.0
---
tika-eval/pom.xml | 218 +--
.../org/apache/tika/eval/AbstractProfiler.java | 825 ---------
.../java/org/apache/tika/eval/EvalFilePaths.java | 118 --
.../java/org/apache/tika/eval/ExtractComparer.java | 538 ------
.../java/org/apache/tika/eval/ExtractProfiler.java | 281 ----
.../java/org/apache/tika/eval/FileProfiler.java | 195 ---
.../java/org/apache/tika/eval/TikaEvalCLI.java | 326 ----
.../org/apache/tika/eval/XMLErrorLogUpdater.java | 223 ---
.../apache/tika/eval/batch/DBConsumersManager.java | 103 --
.../tika/eval/batch/EvalConsumerBuilder.java | 240 ---
.../tika/eval/batch/EvalConsumersBuilder.java | 131 --
.../tika/eval/batch/ExtractComparerBuilder.java | 165 --
.../tika/eval/batch/ExtractProfilerBuilder.java | 122 --
.../tika/eval/batch/FileProfilerBuilder.java | 97 --
.../org/apache/tika/eval/db/AbstractDBBuffer.java | 77 -
.../main/java/org/apache/tika/eval/db/ColInfo.java | 127 --
.../main/java/org/apache/tika/eval/db/Cols.java | 117 --
.../java/org/apache/tika/eval/db/DBBuffer.java | 54 -
.../main/java/org/apache/tika/eval/db/H2Util.java | 89 -
.../java/org/apache/tika/eval/db/JDBCUtil.java | 316 ----
.../java/org/apache/tika/eval/db/MimeBuffer.java | 148 --
.../java/org/apache/tika/eval/db/TableInfo.java | 73 -
.../java/org/apache/tika/eval/io/DBWriter.java | 162 --
.../org/apache/tika/eval/io/ExtractReader.java | 243 ---
.../tika/eval/io/ExtractReaderException.java | 48 -
.../java/org/apache/tika/eval/io/IDBWriter.java | 31 -
.../org/apache/tika/eval/io/XMLLogMsgHandler.java | 26 -
.../java/org/apache/tika/eval/io/XMLLogReader.java | 115 --
.../java/org/apache/tika/eval/reports/Report.java | 198 ---
.../apache/tika/eval/reports/ResultsReporter.java | 313 ----
.../tika/eval/reports/XLSXHREFFormatter.java | 80 -
.../apache/tika/eval/reports/XLSXNumFormatter.java | 54 -
.../tika/eval/reports/XSLXCellFormatter.java | 30 -
.../eval/tools/BatchTopCommonTokenCounter.java | 54 -
.../tika/eval/tools/CommonTokenOverlapCounter.java | 69 -
.../org/apache/tika/eval/tools/LeipzigHelper.java | 57 -
.../org/apache/tika/eval/tools/LeipzigSampler.java | 76 -
.../eval/tools/SlowCompositeReaderWrapper.java | 327 ----
.../tika/eval/tools/TopCommonTokenCounter.java | 401 -----
.../org/apache/tika/eval/tools/TrainTestSplit.java | 114 --
.../src/main/resources/comparison-reports-pg.xml | 1753 --------------------
.../src/main/resources/comparison-reports.xml | 1753 --------------------
tika-eval/src/main/resources/db.properties | 23 -
tika-eval/src/main/resources/profile-reports.xml | 329 ----
.../main/resources/tika-eval-comparison-config.xml | 102 --
.../resources/tika-eval-file-profiler-config.xml | 74 -
.../main/resources/tika-eval-profiler-config.xml | 96 --
.../test/java/org/apache/tika/MockDBWriter.java | 82 -
.../org/apache/tika/eval/AnalyzerManagerTest.java | 100 --
.../org/apache/tika/eval/ComparerBatchTest.java | 399 -----
.../org/apache/tika/eval/ProfilerBatchTest.java | 235 ---
.../org/apache/tika/eval/SimpleComparerTest.java | 442 -----
.../java/org/apache/tika/eval/TikaEvalCLITest.java | 294 ----
.../apache/tika/eval/db/AbstractBufferTest.java | 160 --
.../org/apache/tika/eval/io/ExtractReaderTest.java | 86 -
.../tika/eval/io/FatalExceptionReaderTest.java | 33 -
.../tika/eval/reports/ResultsReporterTest.java | 60 -
.../tika/tools/TopCommonTokenCounterTest.java | 96 --
tika-eval/src/test/resources/common_tokens/en | 28 -
tika-eval/src/test/resources/common_tokens/es | 30 -
tika-eval/src/test/resources/common_tokens/zh-cn | 28 -
tika-eval/src/test/resources/common_tokens/zh-tw | 28 -
.../single-file-profiler-crawl-extract-config.xml | 76 -
.../single-file-profiler-crawl-input-config.xml | 77 -
.../test-dirs/batch-logs/batch-process-fatal.xml | 59 -
.../resources/test-dirs/extractsA/file1.pdf.json | 5 -
.../test-dirs/extractsA/file10_permahang.txt.json | 0
.../test-dirs/extractsA/file11_oom.txt.json | 0
.../test-dirs/extractsA/file12_es.txt.json | 4 -
.../extractsA/file13_attachANotB.doc.json | 11 -
.../extractsA/file14_diffAttachOrder.json | 19 -
.../resources/test-dirs/extractsA/file15_tags.json | 41 -
.../test-dirs/extractsA/file16_badTags.json | 41 -
.../test-dirs/extractsA/file17_tagsOutOfOrder.json | 41 -
.../test-dirs/extractsA/file2_attachANotB.doc.json | 10 -
.../test-dirs/extractsA/file3_attachBNotA.doc.json | 4 -
.../test-dirs/extractsA/file4_emptyB.pdf.json | 4 -
.../test-dirs/extractsA/file5_emptyA.pdf.json | 0
.../test-dirs/extractsA/file6_accessEx.pdf.json | 1 -
.../test-dirs/extractsA/file7_badJson.pdf.json | 4 -
.../test-dirs/extractsA/file8_IOEx.pdf.json | 1 -
.../resources/test-dirs/extractsB/file1.pdf.json | 2 -
.../test-dirs/extractsB/file11_oom.txt.json | 0
.../test-dirs/extractsB/file12_es.txt.json | 4 -
.../test-dirs/extractsB/file13_attachANotB.doc.txt | 1 -
.../extractsB/file14_diffAttachOrder.json | 19 -
.../resources/test-dirs/extractsB/file15_tags.html | 31 -
.../test-dirs/extractsB/file16_badTags.html | 31 -
.../test-dirs/extractsB/file2_attachANotB.doc.json | 4 -
.../test-dirs/extractsB/file3_attachBNotA.doc.json | 10 -
.../test-dirs/extractsB/file4_emptyB.pdf.json | 0
.../test-dirs/extractsB/file5_emptyA.pdf.json | 4 -
.../test-dirs/extractsB/file6_accessEx.pdf.json | 1 -
.../test-dirs/extractsB/file7_badJson.pdf.json | 0
.../test-dirs/extractsB/file8_IOEx.pdf.json | 1 -
.../test/resources/test-dirs/raw_input/file1.pdf | 13 -
.../resources/test-dirs/raw_input/file11_oom.txt | 2 -
.../test-dirs/raw_input/file2_attachANotB.doc | 13 -
.../test-dirs/raw_input/file3_attachBNotA.doc | 13 -
.../resources/test-dirs/raw_input/file4_emptyB.pdf | 13 -
.../resources/test-dirs/raw_input/file5_emptyA.pdf | 13 -
.../test-dirs/raw_input/file6_accessEx.pdf | 13 -
.../test-dirs/raw_input/file7_badJson.pdf | 13 -
.../resources/test-dirs/raw_input/file8_IOEx.pdf | 13 -
.../test-dirs/raw_input/file9_noextract.txt | 1 -
tika-eval/{ => tika-eval-app}/pom.xml | 121 +-
tika-eval/{ => tika-eval-core}/pom.xml | 109 +-
.../tika/eval/core}/langid/LanguageIDWrapper.java | 4 +-
.../core}/metadata/TikaEvalMetadataFilter.java | 16 +-
.../textstats/BasicTokenCountStatsCalculator.java | 4 +-
.../eval/core}/textstats/BytesRefCalculator.java | 2 +-
.../tika/eval/core}/textstats/CommonTokens.java | 12 +-
.../core}/textstats/CommonTokensBhattacharyya.java | 8 +-
.../eval/core}/textstats/CommonTokensCosine.java | 8 +-
.../core}/textstats/CommonTokensHellinger.java | 8 +-
.../core}/textstats/CommonTokensKLDNormed.java | 8 +-
.../core}/textstats/CommonTokensKLDivergence.java | 8 +-
.../textstats/CompositeTextStatsCalculator.java | 8 +-
.../core}/textstats/ContentLengthCalculator.java | 2 +-
.../textstats/LanguageAwareTokenCountStats.java | 4 +-
.../core}/textstats/StringStatsCalculator.java | 2 +-
.../eval/core}/textstats/TextProfileSignature.java | 5 +-
.../eval/core}/textstats/TextSha256Signature.java | 2 +-
.../eval/core}/textstats/TextStatsCalculator.java | 2 +-
.../core}/textstats/TokenCountPriorityQueue.java | 4 +-
.../core}/textstats/TokenCountStatsCalculator.java | 4 +-
.../tika/eval/core}/textstats/TokenEntropy.java | 4 +-
.../tika/eval/core}/textstats/TokenLengths.java | 4 +-
.../tika/eval/core}/textstats/TopNTokens.java | 6 +-
.../eval/core}/textstats/UnicodeBlockCounter.java | 5 +-
.../core}/tokens/AlphaIdeographFilterFactory.java | 2 +-
.../eval/core}/tokens/AnalyzerDeserializer.java | 2 +-
.../tika/eval/core}/tokens/AnalyzerManager.java | 3 +-
.../tokens/CJKBigramAwareLengthFilterFactory.java | 2 +-
.../eval/core}/tokens/CommonTokenCountManager.java | 4 +-
.../tika/eval/core}/tokens/CommonTokenResult.java | 2 +-
.../tika/eval/core}/tokens/ContrastStatistics.java | 2 +-
.../apache/tika/eval/core}/tokens/LangModel.java | 2 +-
.../tika/eval/core}/tokens/TokenContraster.java | 3 +-
.../eval/core}/tokens/TokenCountPriorityQueue.java | 2 +-
.../tika/eval/core}/tokens/TokenCounter.java | 17 +-
.../apache/tika/eval/core}/tokens/TokenCounts.java | 2 +-
.../tika/eval/core}/tokens/TokenIntPair.java | 2 +-
.../tika/eval/core}/tokens/TokenStatistics.java | 2 +-
.../tokens/URLEmailNormalizingFilterFactory.java | 2 +-
.../tika/eval/core}/util/ContentTagParser.java | 2 +-
.../apache/tika/eval/core}/util/ContentTags.java | 2 +-
.../tika/eval/core}/util/EvalExceptionUtils.java | 2 +-
....apache.lucene.analysis.util.TokenFilterFactory | 6 +-
.../org.apache.tika.metadata.filter.MetadataFilter | 2 +-
.../src/main/resources/common_tokens/afr | 0
.../src/main/resources/common_tokens/amh | 0
.../src/main/resources/common_tokens/ara | 0
.../src/main/resources/common_tokens/asm | 0
.../src/main/resources/common_tokens/ast | 0
.../src/main/resources/common_tokens/aze | 0
.../src/main/resources/common_tokens/azj | 0
.../src/main/resources/common_tokens/bak | 0
.../src/main/resources/common_tokens/ban | 0
.../src/main/resources/common_tokens/bel | 0
.../src/main/resources/common_tokens/ben | 0
.../src/main/resources/common_tokens/bos | 0
.../src/main/resources/common_tokens/bre | 0
.../src/main/resources/common_tokens/bul | 0
.../src/main/resources/common_tokens/cat | 0
.../src/main/resources/common_tokens/ceb | 0
.../src/main/resources/common_tokens/ces | 0
.../src/main/resources/common_tokens/che | 0
.../src/main/resources/common_tokens/ckb | 0
.../src/main/resources/common_tokens/cmn | 0
.../src/main/resources/common_tokens/cym | 0
.../src/main/resources/common_tokens/dan | 0
.../src/main/resources/common_tokens/deu | 0
.../src/main/resources/common_tokens/div | 0
.../src/main/resources/common_tokens/ekk | 0
.../src/main/resources/common_tokens/ell | 0
.../src/main/resources/common_tokens/eng | 0
.../src/main/resources/common_tokens/epo | 0
.../src/main/resources/common_tokens/est | 0
.../src/main/resources/common_tokens/eus | 0
.../src/main/resources/common_tokens/fao | 0
.../src/main/resources/common_tokens/fas | 0
.../src/main/resources/common_tokens/fin | 0
.../src/main/resources/common_tokens/fra | 0
.../src/main/resources/common_tokens/fry | 0
.../src/main/resources/common_tokens/gle | 0
.../src/main/resources/common_tokens/glg | 0
.../src/main/resources/common_tokens/gsw | 0
.../src/main/resources/common_tokens/guj | 0
.../src/main/resources/common_tokens/hat | 0
.../src/main/resources/common_tokens/heb | 0
.../src/main/resources/common_tokens/hin | 0
.../src/main/resources/common_tokens/hrv | 0
.../src/main/resources/common_tokens/hun | 0
.../src/main/resources/common_tokens/hye | 0
.../src/main/resources/common_tokens/ind | 0
.../src/main/resources/common_tokens/isl | 0
.../src/main/resources/common_tokens/ita | 0
.../src/main/resources/common_tokens/jav | 0
.../src/main/resources/common_tokens/jpn | 0
.../src/main/resources/common_tokens/kan | 0
.../src/main/resources/common_tokens/kat | 0
.../src/main/resources/common_tokens/kaz | 0
.../src/main/resources/common_tokens/kin | 0
.../src/main/resources/common_tokens/kir | 0
.../src/main/resources/common_tokens/kor | 0
.../src/main/resources/common_tokens/kur | 0
.../src/main/resources/common_tokens/lat | 0
.../src/main/resources/common_tokens/lav | 0
.../src/main/resources/common_tokens/lim | 0
.../src/main/resources/common_tokens/lit | 0
.../src/main/resources/common_tokens/ltz | 0
.../src/main/resources/common_tokens/lug | 0
.../src/main/resources/common_tokens/lvs | 0
.../src/main/resources/common_tokens/mal | 0
.../src/main/resources/common_tokens/mar | 0
.../src/main/resources/common_tokens/mhr | 0
.../src/main/resources/common_tokens/min | 0
.../src/main/resources/common_tokens/mkd | 0
.../src/main/resources/common_tokens/mlg | 0
.../src/main/resources/common_tokens/mlt | 0
.../src/main/resources/common_tokens/mon | 0
.../src/main/resources/common_tokens/mri | 0
.../src/main/resources/common_tokens/msa | 0
.../src/main/resources/common_tokens/nan | 0
.../src/main/resources/common_tokens/nds | 0
.../src/main/resources/common_tokens/nep | 0
.../src/main/resources/common_tokens/nld | 0
.../src/main/resources/common_tokens/nno | 0
.../src/main/resources/common_tokens/nob | 0
.../src/main/resources/common_tokens/oci | 0
.../src/main/resources/common_tokens/ori | 0
.../src/main/resources/common_tokens/pan | 0
.../src/main/resources/common_tokens/pes | 0
.../src/main/resources/common_tokens/plt | 0
.../src/main/resources/common_tokens/pnb | 0
.../src/main/resources/common_tokens/pol | 0
.../src/main/resources/common_tokens/por | 0
.../src/main/resources/common_tokens/pus | 0
.../src/main/resources/common_tokens/ron | 0
.../src/main/resources/common_tokens/rus | 0
.../src/main/resources/common_tokens/san | 0
.../src/main/resources/common_tokens/sin | 0
.../src/main/resources/common_tokens/slk | 0
.../src/main/resources/common_tokens/slv | 0
.../src/main/resources/common_tokens/snd | 0
.../src/main/resources/common_tokens/som | 0
.../src/main/resources/common_tokens/spa | 0
.../src/main/resources/common_tokens/sqi | 0
.../src/main/resources/common_tokens/srp | 0
.../src/main/resources/common_tokens/sun | 0
.../src/main/resources/common_tokens/swa | 0
.../src/main/resources/common_tokens/swe | 0
.../src/main/resources/common_tokens/tam | 0
.../src/main/resources/common_tokens/tat | 0
.../src/main/resources/common_tokens/tel | 0
.../src/main/resources/common_tokens/tgk | 0
.../src/main/resources/common_tokens/tgl | 0
.../src/main/resources/common_tokens/tha | 0
.../src/main/resources/common_tokens/tuk | 0
.../src/main/resources/common_tokens/tur | 0
.../src/main/resources/common_tokens/uig | 0
.../src/main/resources/common_tokens/ukr | 0
.../src/main/resources/common_tokens/urd | 0
.../src/main/resources/common_tokens/uzb | 0
.../src/main/resources/common_tokens/vie | 0
.../src/main/resources/common_tokens/vol | 0
.../src/main/resources/common_tokens/war | 0
.../src/main/resources/common_tokens/xho | 0
.../src/main/resources/common_tokens/yid | 0
.../src/main/resources/common_tokens/zul | 0
.../src/main/resources/log4j.properties | 0
.../src/main/resources/lucene-analyzers.json | 0
.../src/main/resources/lucene-char-mapping.txt | 0
.../apache/tika/eval/core}/langid/LangIdTest.java | 4 +-
.../core}/metadata/TikaEvalMetadataFilterTest.java | 2 +-
.../tika/eval/core}/textstats/TextStatsTest.java | 6 +-
.../tika/eval/core}/tokens/LuceneTokenCounter.java | 2 +-
.../tika/eval/core}/tokens/TokenCounterTest.java | 3 +-
.../tika/eval/core}/util/LanguageIdTest.java | 4 +-
.../apache/tika/eval/core}/util/MimeUtilTest.java | 2 +-
tika-example/pom.xml | 8 +-
.../apache/tika/example/TextStatsFromTikaEval.java | 9 +-
283 files changed, 152 insertions(+), 14136 deletions(-)
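
The diffstat above gives the shape of the split: the reusable analysis classes
(langid, textstats, tokens, util) move from org.apache.tika.eval.* into
org.apache.tika.eval.core.*, while the CLI, batch, db, io, reports, and tools
code shown only as deleted here is presumably re-homed under tika-eval-app
(only the core renames are visible in this diffstat). A minimal sketch of what
that means for downstream code, with the new package names inferred from the
rename entries above (treat the exact imports as illustrative):

    // Tika 1.x (monolithic tika-eval):
    // import org.apache.tika.eval.langid.LanguageIDWrapper;
    // import org.apache.tika.eval.textstats.CompositeTextStatsCalculator;

    // Tika 2.0.0 (tika-eval-core), per the renames in the diffstat:
    import org.apache.tika.eval.core.langid.LanguageIDWrapper;
    import org.apache.tika.eval.core.textstats.CompositeTextStatsCalculator;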
diff --git a/tika-eval/pom.xml b/tika-eval/pom.xml
index a16c9dd..172424e 100644
--- a/tika-eval/pom.xml
+++ b/tika-eval/pom.xml
@@ -22,7 +22,6 @@
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
-
<parent>
<groupId>org.apache.tika</groupId>
<artifactId>tika-parent</artifactId>
@@ -34,219 +33,12 @@
<name>Apache Tika eval</name>
<url>http://tika.apache.org/</url>
- <dependencies>
- <dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>tika-core</artifactId>
- <version>${project.version}</version>
- </dependency>
- <dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>tika-batch</artifactId>
- <version>${project.version}</version>
- </dependency>
- <dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>tika-serialization</artifactId>
- <version>${project.version}</version>
- </dependency>
- <dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>tika-langdetect-opennlp</artifactId>
- <version>${project.version}</version>
- </dependency>
- <dependency>
- <groupId>org.apache.commons</groupId>
- <artifactId>commons-math3</artifactId>
- <version>${commons.math3.version}</version>
- </dependency>
-
- <!-- check if exclusion from oss-index check
- is necessary on next upgrade -->
- <dependency>
- <groupId>com.h2database</groupId>
- <artifactId>h2</artifactId>
- <version>${h2.version}</version>
- </dependency>
- <dependency>
- <groupId>commons-cli</groupId>
- <artifactId>commons-cli</artifactId>
- <version>${commons.cli.version}</version>
- </dependency>
-
- <dependency>
- <groupId>org.apache.lucene</groupId>
- <artifactId>lucene-core</artifactId>
- <version>${lucene.version}</version>
- </dependency>
- <dependency>
- <groupId>org.apache.lucene</groupId>
- <artifactId>lucene-analyzers-common</artifactId>
- <version>${lucene.version}</version>
- </dependency>
- <dependency>
- <groupId>org.apache.lucene</groupId>
- <artifactId>lucene-analyzers-icu</artifactId>
- <version>${lucene.version}</version>
- </dependency>
- <dependency>
- <groupId>org.apache.lucene</groupId>
- <artifactId>lucene-memory</artifactId>
- <version>${lucene.version}</version>
- </dependency>
- <dependency>
- <groupId>org.apache.commons</groupId>
- <artifactId>commons-lang3</artifactId>
- <version>${commons.lang3.version}</version>
- </dependency>
-
- <dependency>
- <groupId>org.apache.poi</groupId>
- <artifactId>poi</artifactId>
- <version>${poi.version}</version>
- </dependency>
- <dependency>
- <groupId>org.apache.poi</groupId>
- <artifactId>poi-ooxml</artifactId>
- <version>${poi.version}</version>
- <exclusions>
- <exclusion>
- <groupId>org.apache.commons</groupId>
- <artifactId>commons-compress</artifactId>
- </exclusion>
- </exclusions>
- </dependency>
- <dependency>
- <groupId>org.apache.poi</groupId>
- <artifactId>poi-ooxml-schemas</artifactId>
- <version>${poi.version}</version>
- </dependency>
- <dependency>
- <groupId>org.apache.poi</groupId>
- <artifactId>poi-scratchpad</artifactId>
- <version>${poi.version}</version>
- </dependency>
- <dependency>
- <groupId>org.ccil.cowan.tagsoup</groupId>
- <artifactId>tagsoup</artifactId>
- <version>1.2.1</version>
- </dependency>
- <dependency>
- <groupId>org.apache.tika</groupId>
- <artifactId>tika-batch</artifactId>
- <version>${project.version}</version>
- <type>test-jar</type>
- <scope>test</scope>
- </dependency>
- <dependency>
- <groupId>org.apache.tika</groupId>
- <artifactId>tika-core</artifactId>
- <version>${project.version}</version>
- <type>test-jar</type>
- <scope>test</scope>
- </dependency>
+ <packaging>pom</packaging>
+ <modules>
+ <module>tika-eval-core</module>
+ <module>tika-eval-app</module>
+ </modules>
- <dependency>
- <groupId>junit</groupId>
- <artifactId>junit</artifactId>
- <scope>test</scope>
- </dependency>
- </dependencies>
- <build>
- <plugins>
- <plugin>
- <artifactId>maven-shade-plugin</artifactId>
- <version>${maven.shade.version}</version>
- <executions>
- <execution>
- <phase>package</phase>
- <goals>
- <goal>shade</goal>
- </goals>
- <configuration>
- <createDependencyReducedPom>
- false
- </createDependencyReducedPom>
- <!-- <filters> -->
- <transformers>
- <transformer
- implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
- <mainClass>org.apache.tika.eval.TikaEvalCLI</mainClass>
- </transformer>
- <transformer
- implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer"/>
- </transformers>
- </configuration>
- </execution>
- </executions>
- </plugin>
- <plugin>
- <groupId>org.apache.felix</groupId>
- <artifactId>maven-bundle-plugin</artifactId>
- <version>${maven.bundle.version}</version>
- <extensions>true</extensions>
- <configuration>
- <instructions>
- <Bundle-DocURL>${project.url}</Bundle-DocURL>
- <Bundle-Activator>
- org.apache.tika.config.TikaActivator
- </Bundle-Activator>
- <Bundle-ActivationPolicy>lazy</Bundle-ActivationPolicy>
- </instructions>
- </configuration>
- </plugin>
- <plugin>
- <groupId>org.apache.rat</groupId>
- <artifactId>apache-rat-plugin</artifactId>
- <version>${rat.version}</version>
- <configuration>
- <excludes>
- <exclude>src/main/resources/common_tokens/*</exclude>
- <exclude>src/main/resources/*.json</exclude>
- <exclude>src/test/resources/common_tokens/*</exclude>
- <exclude>src/test/resources/test-dirs/**</exclude>
- </excludes>
- </configuration>
- </plugin>
- <plugin>
- <groupId>org.apache.maven.plugins</groupId>
- <artifactId>maven-jar-plugin</artifactId>
- <configuration>
- <archive>
- <manifestEntries>
- <Automatic-Module-Name>org.apache.tika.eval</Automatic-Module-Name>
- </manifestEntries>
- </archive>
- </configuration>
- <executions>
- <execution>
- <goals>
- <goal>test-jar</goal>
- </goals>
- </execution>
- </executions>
- </plugin>
- <plugin>
- <artifactId>maven-failsafe-plugin</artifactId>
- <version>${maven.failsafe.version}</version>
- <configuration>
- <additionalClasspathElements>
- <additionalClasspathElement>
- ${project.build.directory}/${project.build.finalName}.jar
- </additionalClasspathElement>
- </additionalClasspathElements>
- </configuration>
- <executions>
- <execution>
- <goals>
- <goal>integration-test</goal>
- <goal>verify</goal>
- </goals>
- </execution>
- </executions>
- </plugin>
- </plugins>
- </build>
</project>
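
With its dependencies and build plugins removed, tika-eval is now just a pom
aggregator over the two new modules. A hedged sketch of the consumer-side
change (coordinates assumed from the commit message, not part of this diff):
projects that depended on the single tika-eval artifact would instead pick the
library half,

    <!-- assumed coordinates; version per the Tika 2.0.0 target -->
    <dependency>
      <groupId>org.apache.tika</groupId>
      <artifactId>tika-eval-core</artifactId>
      <version>2.0.0</version>
    </dependency>

while the shaded command-line jar (the maven-shade-plugin setup deleted above,
with mainClass org.apache.tika.eval.TikaEvalCLI) travels with tika-eval-app.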
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java b/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java
deleted file mode 100644
index 432543d..0000000
--- a/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java
+++ /dev/null
@@ -1,825 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.tika.eval;
-
-
-import java.io.IOException;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.nio.file.Paths;
-import java.sql.Types;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.Comparator;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Locale;
-import java.util.Map;
-import java.util.concurrent.ArrayBlockingQueue;
-import java.util.concurrent.atomic.AtomicInteger;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
-import org.apache.commons.io.FilenameUtils;
-import org.apache.commons.lang3.mutable.MutableInt;
-import org.apache.commons.lang3.tuple.Pair;
-import org.apache.commons.math3.stat.descriptive.SummaryStatistics;
-import org.apache.tika.batch.FileResource;
-import org.apache.tika.batch.FileResourceConsumer;
-import org.apache.tika.batch.fs.FSProperties;
-import org.apache.tika.config.TikaConfig;
-import org.apache.tika.eval.db.ColInfo;
-import org.apache.tika.eval.db.Cols;
-import org.apache.tika.eval.db.TableInfo;
-import org.apache.tika.eval.io.ExtractReaderException;
-import org.apache.tika.eval.io.IDBWriter;
-import org.apache.tika.eval.langid.LanguageIDWrapper;
-import org.apache.tika.eval.textstats.BasicTokenCountStatsCalculator;
-import org.apache.tika.eval.textstats.CommonTokens;
-import org.apache.tika.eval.textstats.CompositeTextStatsCalculator;
-import org.apache.tika.eval.textstats.ContentLengthCalculator;
-import org.apache.tika.eval.textstats.TextStatsCalculator;
-import org.apache.tika.eval.textstats.TokenEntropy;
-import org.apache.tika.eval.textstats.TokenLengths;
-import org.apache.tika.eval.textstats.TopNTokens;
-import org.apache.tika.eval.textstats.UnicodeBlockCounter;
-import org.apache.tika.eval.tokens.AnalyzerManager;
-import org.apache.tika.eval.tokens.CommonTokenCountManager;
-import org.apache.tika.eval.tokens.CommonTokenResult;
-import org.apache.tika.eval.tokens.TokenCounts;
-import org.apache.tika.eval.tokens.TokenIntPair;
-import org.apache.tika.eval.util.ContentTagParser;
-import org.apache.tika.eval.util.ContentTags;
-import org.apache.tika.eval.util.EvalExceptionUtils;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.language.detect.LanguageResult;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.PagedText;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler;
-import org.apache.tika.sax.RecursiveParserWrapperHandler;
-import org.apache.tika.sax.ToXMLContentHandler;
-import org.apache.tika.utils.ExceptionUtils;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-import org.xml.sax.SAXException;
-
-public abstract class AbstractProfiler extends FileResourceConsumer {
-
- private static final Logger LOG = LoggerFactory.getLogger(AbstractProfiler.class);
-
- private static final String[] EXTRACT_EXTENSIONS = {
- ".json",
- ".txt",
- ""
- };
-
- private static final String[] COMPRESSION_EXTENSIONS = {
- "",
- ".bz2",
- ".gzip",
- ".zip",
- };
- static final long NON_EXISTENT_FILE_LENGTH = -1l;
-
- public static TableInfo REF_EXTRACT_EXCEPTION_TYPES = new TableInfo("ref_extract_exception_types",
- new ColInfo(Cols.EXTRACT_EXCEPTION_ID, Types.INTEGER),
- new ColInfo(Cols.EXTRACT_EXCEPTION_DESCRIPTION, Types.VARCHAR, 128)
- );
-
-
- public static TableInfo REF_PARSE_ERROR_TYPES = new TableInfo("ref_parse_error_types",
- new ColInfo(Cols.PARSE_ERROR_ID, Types.INTEGER),
- new ColInfo(Cols.PARSE_ERROR_DESCRIPTION, Types.VARCHAR, 128)
- );
-
- public static TableInfo REF_PARSE_EXCEPTION_TYPES = new TableInfo("ref_parse_exception_types",
- new ColInfo(Cols.PARSE_EXCEPTION_ID, Types.INTEGER),
- new ColInfo(Cols.PARSE_EXCEPTION_DESCRIPTION, Types.VARCHAR, 128)
- );
-
- public static final String TRUE = Boolean.toString(true);
- public static final String FALSE = Boolean.toString(false);
- private static final String ZERO = "0";
-
-
- protected static final AtomicInteger ID = new AtomicInteger();
-
- private static final String UNKNOWN_EXTENSION = "unk";
- //make this configurable
- private static final String DIGEST_KEY = "X-TIKA:digest:MD5";
-
- private static final Map<String, Cols> UC_TAGS_OF_INTEREST = initTags();
-
- private static Map<String, Cols> initTags() {
- //simplify this mess
- Map<String, Cols> tmp = new HashMap<>();
- tmp.put("A", Cols.TAGS_A);
- tmp.put("B", Cols.TAGS_B);
- tmp.put("DIV", Cols.TAGS_DIV);
- tmp.put("I", Cols.TAGS_I);
- tmp.put("IMG", Cols.TAGS_IMG);
- tmp.put("LI", Cols.TAGS_LI);
- tmp.put("OL", Cols.TAGS_OL);
- tmp.put("P", Cols.TAGS_P);
- tmp.put("TABLE", Cols.TAGS_TABLE);
- tmp.put("TD", Cols.TAGS_TD);
- tmp.put("TITLE", Cols.TAGS_TITLE);
- tmp.put("TR", Cols.TAGS_TR);
- tmp.put("U", Cols.TAGS_U);
- tmp.put("UL", Cols.TAGS_UL);
- return Collections.unmodifiableMap(tmp);
- }
-
- private static CommonTokenCountManager COMMON_TOKEN_COUNT_MANAGER;
-
- private String lastExtractExtension = null;
-
- AnalyzerManager analyzerManager;
-
-
- public enum EXCEPTION_TYPE {
- RUNTIME,
- ENCRYPTION,
- ACCESS_PERMISSION,
- UNSUPPORTED_VERSION,
- }
-
- /**
- * If information was gathered from the log file about
- * a parse error
- */
- public enum PARSE_ERROR_TYPE {
- OOM,
- TIMEOUT
- }
-
- public static TableInfo MIME_TABLE = new TableInfo("mimes",
- new ColInfo(Cols.MIME_ID, Types.INTEGER, "PRIMARY KEY"),
- new ColInfo(Cols.MIME_STRING, Types.VARCHAR, 256),
- new ColInfo(Cols.FILE_EXTENSION, Types.VARCHAR, 12)
- );
-
- private static Pattern FILE_NAME_CLEANER = Pattern.compile("\\.(json|txt)(\\.(bz2|gz|zip))?$");
-
-
- final static int FILE_PATH_MAX_LEN = 1024;//max len for varchar for file_path
- int maxContentLength = 10000000;
- int maxContentLengthForLangId = 50000;
- int maxTokens = 200000;
-
-
-
- private final static Pattern ACCESS_PERMISSION_EXCEPTION =
- Pattern.compile("org\\.apache\\.tika\\.exception\\.AccessPermissionException");
- private final static Pattern ENCRYPTION_EXCEPTION =
- Pattern.compile("org\\.apache\\.tika.exception\\.EncryptedDocumentException");
-
- private static LanguageIDWrapper LANG_ID = new LanguageIDWrapper();
-
- //TODO: allow configuration
- //private TikaConfig config = TikaConfig.getDefaultConfig();
- CompositeTextStatsCalculator compositeTextStatsCalculator;
- protected IDBWriter writer;
-
- /**
- * @param p path to the common_tokens directory. If this is null, try to load from classPath
- * @param defaultLangCode this is the language code to use if a common_words list doesn't exist for the
- * detected language; can be <code>null</code>
- * @throws IOException
- */
- public static void loadCommonTokens(Path p, String defaultLangCode) throws IOException {
- COMMON_TOKEN_COUNT_MANAGER = new CommonTokenCountManager(p, defaultLangCode);
- }
-
- public AbstractProfiler(ArrayBlockingQueue<FileResource> fileQueue,
- IDBWriter writer) {
- super(fileQueue);
- this.writer = writer;
- LanguageIDWrapper.setMaxTextLength(maxContentLengthForLangId);
- this.compositeTextStatsCalculator = initAnalyzersAndTokenCounter(maxTokens, LANG_ID);
- }
-
- private CompositeTextStatsCalculator initAnalyzersAndTokenCounter(int maxTokens, LanguageIDWrapper langIder) {
- analyzerManager = AnalyzerManager.newInstance(maxTokens);
- List<TextStatsCalculator> calculators = new ArrayList<>();
- calculators.add(new CommonTokens(COMMON_TOKEN_COUNT_MANAGER));
- calculators.add(new TokenEntropy());
- calculators.add(new TokenLengths());
- calculators.add(new TopNTokens(10));
- calculators.add(new BasicTokenCountStatsCalculator());
- calculators.add(new ContentLengthCalculator());
- calculators.add(new UnicodeBlockCounter(maxContentLengthForLangId));
-
- return new CompositeTextStatsCalculator(calculators, analyzerManager.getGeneralAnalyzer(), langIder);
- }
-
- /**
- * Truncate the content string if greater than this length to this length
- *
- * @param maxContentLength
- */
- public void setMaxContentLength(int maxContentLength) {
- this.maxContentLength = maxContentLength;
- }
-
- /**
- * Truncate content string if greater than this length to this length for lang id
- *
- * @param maxContentLengthForLangId
- */
- public void setMaxContentLengthForLangId(int maxContentLengthForLangId) {
- this.maxContentLengthForLangId = maxContentLengthForLangId;
- LanguageIDWrapper.setMaxTextLength(maxContentLengthForLangId);
- }
-
- /**
- * Add a LimitTokenCountFilterFactory if > -1
- *
- * @param maxTokens
- */
- public void setMaxTokens(int maxTokens) {
- this.maxTokens = maxTokens;
- initAnalyzersAndTokenCounter(maxTokens, new LanguageIDWrapper());
- }
-
-
- protected void writeExtractException(TableInfo extractExceptionTable, String containerId,
- String filePath, ExtractReaderException.TYPE type) throws IOException {
- Map<Cols, String> data = new HashMap<>();
- data.put(Cols.CONTAINER_ID, containerId);
- data.put(Cols.FILE_PATH, filePath);
- data.put(Cols.EXTRACT_EXCEPTION_ID, Integer.toString(type.ordinal()));
- writer.writeRow(extractExceptionTable, data);
-
- }
-
- protected void writeProfileData(EvalFilePaths fps, int i,
- ContentTags contentTags, Metadata m,
- String fileId, String containerId,
- List<Integer> numAttachments, TableInfo profileTable) {
-
- Map<Cols, String> data = new HashMap<>();
- data.put(Cols.ID, fileId);
- data.put(Cols.CONTAINER_ID, containerId);
- data.put(Cols.MD5, m.get(DIGEST_KEY));
-
- if (i < numAttachments.size()) {
- data.put(Cols.NUM_ATTACHMENTS, Integer.toString(numAttachments.get(i)));
- }
- data.put(Cols.ELAPSED_TIME_MILLIS, getTime(m));
- data.put(Cols.NUM_METADATA_VALUES,
- Integer.toString(countMetadataValues(m)));
-
- Integer nPages = m.getInt(PagedText.N_PAGES);
- if (nPages != null) {
- data.put(Cols.NUM_PAGES, Integer.toString(nPages));
- }
-
- //if the outer wrapper document
- if (i == 0) {
- data.put(Cols.IS_EMBEDDED, FALSE);
- data.put(Cols.FILE_NAME, fps.getRelativeSourceFilePath().getFileName().toString());
- } else {
- data.put(Cols.IS_EMBEDDED, TRUE);
- data.put(Cols.FILE_NAME, getFileName(m.get(AbstractRecursiveParserWrapperHandler.EMBEDDED_RESOURCE_PATH)));
- }
- String ext = FilenameUtils.getExtension(data.get(Cols.FILE_NAME));
- ext = (ext == null) ? "" : ext.toLowerCase(Locale.US);
- data.put(Cols.FILE_EXTENSION, ext);
- long srcFileLen = getSourceFileLength(m);
- if (srcFileLen > NON_EXISTENT_FILE_LENGTH) {
- data.put(Cols.LENGTH, Long.toString(srcFileLen));
- } else {
- data.put(Cols.LENGTH, "");
- }
- int numMetadataValues = countMetadataValues(m);
- data.put(Cols.NUM_METADATA_VALUES,
- Integer.toString(numMetadataValues));
-
- data.put(Cols.ELAPSED_TIME_MILLIS,
- getTime(m));
-
- String content = contentTags.getContent();
- if (content == null || content.trim().length() == 0) {
- data.put(Cols.HAS_CONTENT, FALSE);
- } else {
- data.put(Cols.HAS_CONTENT, TRUE);
- }
- getFileTypes(m, data);
- try {
- writer.writeRow(profileTable, data);
- } catch (IOException e) {
- throw new RuntimeException(e);
- }
- }
-
- private static String getFileName(String path) {
- if (path == null) {
- return "";
- }
- //filenameUtils checks for a null byte in the path.
- //it will throw an IllegalArgumentException if there is a null byte.
- //given that we're recording names and not using them on a file path
- //we should ignore this.
- try {
- return FilenameUtils.getName(path);
- } catch (IllegalArgumentException e) {
- LOG.warn("{} in {}", e.getMessage(), path);
- }
- path = path.replaceAll("\u0000", " ");
- try {
- return FilenameUtils.getName(path);
- } catch (IllegalArgumentException e) {
- LOG.warn("Again: {} in {}", e.getMessage(), path);
- }
- //give up
- return "";
- }
-
- protected void writeExceptionData(String fileId, Metadata m, TableInfo exceptionTable) {
- Map<Cols, String> data = new HashMap<>();
- getExceptionStrings(m, data);
- if (data.keySet().size() > 0) {
- try {
- data.put(Cols.ID, fileId);
- writer.writeRow(exceptionTable, data);
- } catch (IOException e) {
- throw new RuntimeException(e);
- }
- }
- }
-
- protected Map<Class, Object> calcTextStats(ContentTags contentTags) {
-/* if (contentTags == ContentTags.EMPTY_CONTENT_TAGS) {
- return Collections.EMPTY_MAP;
- }*/
- Map<Cols, String> data = new HashMap<>();
- String content = truncateContent(contentTags, maxContentLength, data);
- if (content == null || content.trim().length() == 0) {
- content = "";
- }
- return compositeTextStatsCalculator.calculate(content);
- }
-
- /**
- * Checks to see if metadata is null or content is empty (null or only whitespace).
- * If any of these, then this does no processing, and the fileId is not
- * entered into the content table.
- *
- * @param fileId
- * @param textStats
- * @param contentsTable
- */
- protected void writeContentData(String fileId, Map<Class, Object> textStats, TableInfo contentsTable) throws IOException {
- Map<Cols, String> data = new HashMap<>();
- data.put(Cols.ID, fileId);
- if (textStats.containsKey(ContentLengthCalculator.class)) {
- int length = (int)textStats.get(ContentLengthCalculator.class);
- if (length == 0) {
- return;
- }
- data.put(Cols.CONTENT_LENGTH, Integer.toString(length));
- }
- langid(textStats, data);
-
- writeTokenCounts(textStats, data);
- CommonTokenResult commonTokenResult = (CommonTokenResult) textStats.get(CommonTokens.class);
- if (commonTokenResult != null) {
- data.put(Cols.COMMON_TOKENS_LANG, commonTokenResult.getLangCode());
- data.put(Cols.NUM_UNIQUE_COMMON_TOKENS, Integer.toString(commonTokenResult.getUniqueCommonTokens()));
- data.put(Cols.NUM_COMMON_TOKENS, Integer.toString(commonTokenResult.getCommonTokens()));
- data.put(Cols.NUM_UNIQUE_ALPHABETIC_TOKENS,
- Integer.toString(commonTokenResult.getUniqueAlphabeticTokens()));
- data.put(Cols.NUM_ALPHABETIC_TOKENS,
- Integer.toString(commonTokenResult.getAlphabeticTokens()));
- }
- TokenCounts tokenCounts = (TokenCounts) textStats.get(BasicTokenCountStatsCalculator.class);
- if (tokenCounts != null) {
-
- data.put(Cols.NUM_UNIQUE_TOKENS,
- Integer.toString(tokenCounts.getTotalUniqueTokens()));
- data.put(Cols.NUM_TOKENS,
- Integer.toString(tokenCounts.getTotalTokens()));
- }
- if (textStats.get(TokenEntropy.class) != null) {
- data.put(Cols.TOKEN_ENTROPY_RATE,
- Double.toString((Double) textStats.get(TokenEntropy.class)));
- }
-
- SummaryStatistics summStats = (SummaryStatistics) textStats.get(TokenLengths.class);
- if (summStats != null) {
- data.put(Cols.TOKEN_LENGTH_SUM,
- Integer.toString((int) summStats.getSum()));
-
- data.put(Cols.TOKEN_LENGTH_MEAN,
- Double.toString(summStats.getMean()));
-
- data.put(Cols.TOKEN_LENGTH_STD_DEV,
- Double.toString(summStats.getStandardDeviation()));
- }
- unicodeBlocks(textStats, data);
- try {
- writer.writeRow(contentsTable, data);
- } catch (IOException e) {
- throw new RuntimeException(e);
- }
- }
-
- void writeTagData(String fileId, ContentTags contentTags, TableInfo tagsTable) {
- Map<String, Integer> tags = contentTags.getTags();
- if (tags.size() == 0 && contentTags.getParseException() == false) {
- return;
- }
- Map<Cols, String> data = new HashMap<>();
- data.put(Cols.ID, fileId);
-
- for (Map.Entry<String, Cols> e : UC_TAGS_OF_INTEREST.entrySet()) {
- Integer count = tags.get(e.getKey());
- if (count == null) {
- data.put(e.getValue(), ZERO);
- } else {
- data.put(e.getValue(), Integer.toString(count));
- }
- }
-
- if (contentTags.getParseException()) {
- data.put(Cols.TAGS_PARSE_EXCEPTION, TRUE);
- } else {
- data.put(Cols.TAGS_PARSE_EXCEPTION, FALSE);
- }
- try {
- writer.writeRow(tagsTable, data);
- } catch (IOException e) {
- throw new RuntimeException(e);
- }
- }
-
-
- String getTime(Metadata m) {
- String elapsed = "-1";
-
- String v = m.get(AbstractRecursiveParserWrapperHandler.PARSE_TIME_MILLIS);
- if (v != null) {
- return v;
- }
- return elapsed;
- }
-
- int countMetadataValues(Metadata m) {
- if (m == null) {
- return 0;
- }
- int i = 0;
- for (String n : m.names()) {
- i += m.getValues(n).length;
- }
- return i;
- }
-
- void getExceptionStrings(Metadata metadata, Map<Cols, String> data) {
-
- String fullTrace = metadata.get(RecursiveParserWrapperHandler.CONTAINER_EXCEPTION);
-
- if (fullTrace == null) {
- fullTrace = metadata.get(AbstractRecursiveParserWrapperHandler.EMBEDDED_EXCEPTION);
- }
-
- if (fullTrace != null) {
- //check for "expected" exceptions...exceptions
- //that can't be fixed.
- //Do not store trace for "expected" exceptions
-
- Matcher matcher = ACCESS_PERMISSION_EXCEPTION.matcher(fullTrace);
- if (matcher.find()) {
- data.put(Cols.PARSE_EXCEPTION_ID,
- Integer.toString(EXCEPTION_TYPE.ACCESS_PERMISSION.ordinal()));
- return;
- }
- matcher = ENCRYPTION_EXCEPTION.matcher(fullTrace);
- if (matcher.find()) {
- data.put(Cols.PARSE_EXCEPTION_ID,
- Integer.toString(EXCEPTION_TYPE.ENCRYPTION.ordinal()));
- return;
- }
-
- data.put(Cols.PARSE_EXCEPTION_ID,
- Integer.toString(EXCEPTION_TYPE.RUNTIME.ordinal()));
-
- data.put(Cols.ORIG_STACK_TRACE, fullTrace);
- //TikaExceptions can have object ids, as in the "@2b1ea6ee" in:
- //org.apache.tika.exception.TikaException: TIKA-198: Illegal
- //IOException from org.apache.tika.parser.microsoft.OfficeParser@2b1ea6ee
- //For reporting purposes, let's snip off the object id so that we can more
- //easily count exceptions.
- String sortTrace = EvalExceptionUtils.normalize(fullTrace);
- data.put(Cols.SORT_STACK_TRACE, sortTrace);
- }
- }
-
- /**
- * Get the content and record in the data {@link Cols#CONTENT_TRUNCATED_AT_MAX_LEN} whether the string was truncated
- *
- * @param contentTags
- * @param maxLength
- * @param data
- * @return
- */
- protected static String truncateContent(ContentTags contentTags, int maxLength, Map<Cols, String> data) {
- data.put(Cols.CONTENT_TRUNCATED_AT_MAX_LEN, "FALSE");
- if (contentTags == null) {
- return "";
- }
- String c = contentTags.getContent();
- if (maxLength > -1 && c.length() > maxLength) {
- c = c.substring(0, maxLength);
- data.put(Cols.CONTENT_TRUNCATED_AT_MAX_LEN, "TRUE");
- }
- return c;
-
- }
-
- protected static ContentTags getContent(EvalFilePaths evalFilePaths, Metadata metadata) {
- if (metadata == null) {
- return ContentTags.EMPTY_CONTENT_TAGS;
- }
- return parseContentAndTags(evalFilePaths, metadata);
- }
-
- void unicodeBlocks(Map<Class, Object> tokenStats, Map<Cols, String> data) {
-
- Map<String, MutableInt> blocks = (Map<String, MutableInt>) tokenStats.get(UnicodeBlockCounter.class);
- List<Pair<String, Integer>> pairs = new ArrayList<>();
- for (Map.Entry<String, MutableInt> e : blocks.entrySet()) {
- pairs.add(Pair.of(e.getKey(), e.getValue().intValue()));
- }
- pairs.sort((o1, o2) -> o2.getValue().compareTo(o1.getValue()));
- StringBuilder sb = new StringBuilder();
-
- for (int i = 0; i < 20 && i < pairs.size(); i++) {
- if (i > 0) {
- sb.append(" | ");
- }
- sb.append(pairs.get(i).getKey() + ": " + pairs.get(i).getValue());
- }
- data.put(Cols.UNICODE_CHAR_BLOCKS, sb.toString());
- }
-
- void langid(Map<Class, Object> stats, Map<Cols, String> data) {
- List<LanguageResult> probabilities = (List<LanguageResult>) stats.get(LanguageIDWrapper.class);
-
- if (probabilities.size() > 0) {
- data.put(Cols.LANG_ID_1, probabilities.get(0).getLanguage());
- data.put(Cols.LANG_ID_PROB_1,
- Double.toString(probabilities.get(0).getRawScore()));
- }
- if (probabilities.size() > 1) {
- data.put(Cols.LANG_ID_2, probabilities.get(1).getLanguage());
- data.put(Cols.LANG_ID_PROB_2,
- Double.toString(probabilities.get(1).getRawScore()));
- }
- }
-
- void getFileTypes(Metadata metadata, Map<Cols, String> output) {
- if (metadata == null) {
- return;
- }
- String type = metadata.get(Metadata.CONTENT_TYPE);
- if (type == null) {
- return;
- }
- int mimeId = writer.getMimeId(type);
- output.put(Cols.MIME_ID, Integer.toString(mimeId));
- }
-
- void writeTokenCounts(Map<Class, Object> textStats, Map<Cols, String> data) {
- TokenIntPair[] tokenIntPairs = (TokenIntPair[]) textStats.get(TopNTokens.class);
- int i = 0;
- StringBuilder sb = new StringBuilder();
- for (TokenIntPair t : tokenIntPairs) {
- if (i++ > 0) {
- sb.append(" | ");
- }
- sb.append(t.getToken() + ": " + t.getValue());
- }
-
- data.put(Cols.TOP_N_TOKENS, sb.toString());
- }
-
-
- public void closeWriter() throws IOException {
- writer.close();
- }
-
-
- /**
- * @param metadata
- * @param extracts
- * @return evalfilepaths for files if crawling an extract directory
- */
- protected EvalFilePaths getPathsFromExtractCrawl(Metadata metadata,
- Path extracts) {
- String relExtractFilePath = metadata.get(FSProperties.FS_REL_PATH);
- Matcher m = FILE_NAME_CLEANER.matcher(relExtractFilePath);
- Path relativeSourceFilePath = Paths.get(m.replaceAll(""));
- //just try slapping the relextractfilepath on the extractdir
- Path extractFile = extracts.resolve(relExtractFilePath);
- if (!Files.isRegularFile(extractFile)) {
- //if that doesn't work, try to find the right extract file.
- //This is necessary if crawling extractsA and trying to find a file in
- //extractsB that is not in the same format: json vs txt or compressed
- extractFile = findFile(extracts, relativeSourceFilePath);
- }
- return new EvalFilePaths(relativeSourceFilePath, extractFile);
- }
-
- //call this if the crawler is crawling through the src directory
- protected EvalFilePaths getPathsFromSrcCrawl(Metadata metadata, Path srcDir,
- Path extracts) {
- Path relativeSourceFilePath = Paths.get(metadata.get(FSProperties.FS_REL_PATH));
- Path extractFile = findFile(extracts, relativeSourceFilePath);
- Path inputFile = srcDir.resolve(relativeSourceFilePath);
- long srcLen = -1l;
- //try to get the length of the source file in case there was an error
- //in both extracts
- try {
- srcLen = Files.size(inputFile);
- } catch (IOException e) {
- LOG.warn("Couldn't get length for: {}", inputFile.toAbsolutePath());
- }
- return new EvalFilePaths(relativeSourceFilePath, extractFile, srcLen);
- }
-
- /**
- * @param extractRootDir
- * @param relativeSourceFilePath
- * @return extractFile or null if couldn't find one.
- */
- private Path findFile(Path extractRootDir, Path relativeSourceFilePath) {
- String relSrcFilePathString = relativeSourceFilePath.toString();
- if (lastExtractExtension != null) {
- Path candidate = extractRootDir.resolve(relSrcFilePathString + lastExtractExtension);
- if (Files.isRegularFile(candidate)) {
- return candidate;
- }
- }
- for (String ext : EXTRACT_EXTENSIONS) {
- for (String compress : COMPRESSION_EXTENSIONS) {
- Path candidate = extractRootDir.resolve(relSrcFilePathString + ext + compress);
- if (Files.isRegularFile(candidate)) {
- lastExtractExtension = ext + compress;
- return candidate;
- }
- }
- }
- return null;
- }
-
- protected long getSourceFileLength(EvalFilePaths fps, List<Metadata> metadataList) {
- if (fps.getSourceFileLength() > NON_EXISTENT_FILE_LENGTH) {
- return fps.getSourceFileLength();
- }
- return getSourceFileLength(metadataList);
- }
-
- long getSourceFileLength(List<Metadata> metadataList) {
- if (metadataList == null || metadataList.size() < 1) {
- return NON_EXISTENT_FILE_LENGTH;
- }
- return getSourceFileLength(metadataList.get(0));
- }
-
- long getSourceFileLength(Metadata m) {
- String lenString = m.get(Metadata.CONTENT_LENGTH);
- if (lenString == null) {
- return NON_EXISTENT_FILE_LENGTH;
- }
- try {
- return Long.parseLong(lenString);
- } catch (NumberFormatException e) {
- //swallow
- }
- return NON_EXISTENT_FILE_LENGTH;
- }
-
- protected long getFileLength(Path p) {
- if (p != null && Files.isRegularFile(p)) {
- try {
- return Files.size(p);
- } catch (IOException e) {
- //swallow
- }
- }
- return NON_EXISTENT_FILE_LENGTH;
- }
-
- /**
- * @param list
- * @return empty list if input list is empty or null
- */
- static List<Integer> countAttachments(List<Metadata> list) {
- List<Integer> ret = new ArrayList<>();
- if (list == null || list.size() == 0) {
- return ret;
- }
- //container document attachment count = list.size()-1
- ret.add(list.size() - 1);
-
- Map<String, Integer> counts = new HashMap<>();
- for (int i = 1; i < list.size(); i++) {
- String path = list.get(i).get(AbstractRecursiveParserWrapperHandler.EMBEDDED_RESOURCE_PATH);
- if (path == null) {
- //shouldn't ever happen
- continue;
- }
- String[] parts = path.split("/");
- StringBuilder parent = new StringBuilder();
- for (int end = 1; end < parts.length - 1; end++) {
- parent.setLength(0);
- join("/", parent, parts, 1, end);
- String parentPath = parent.toString();
- Integer count = counts.get(parentPath);
- if (count == null) {
- count = 1;
- } else {
- count++;
- }
- counts.put(parentPath, count);
- }
- }
-
- for (int i = 1; i < list.size(); i++) {
- Integer count = counts.get(list.get(i).get(AbstractRecursiveParserWrapperHandler.EMBEDDED_RESOURCE_PATH));
- if (count == null) {
- count = 0;
- }
- ret.add(i, count);
- }
- return ret;
-
-
- }
-
- private static void join(String delimiter, StringBuilder sb, String[] parts, int start, int end) {
- for (int i = start; i <= end; i++) {
- sb.append(delimiter);
- sb.append(parts[i]);
- }
- }
-
- private static ContentTags parseContentAndTags(EvalFilePaths evalFilePaths, Metadata metadata) {
- String s = metadata.get(RecursiveParserWrapperHandler.TIKA_CONTENT);
- if (s == null || s.length() == 0) {
- return ContentTags.EMPTY_CONTENT_TAGS;
- }
-
- String handlerClass = metadata.get(RecursiveParserWrapperHandler.TIKA_CONTENT_HANDLER);
- if (evalFilePaths.getExtractFile().getFileName().toString().toLowerCase(Locale.ENGLISH).endsWith(".html")) {
- try {
- return ContentTagParser.parseHTML(s, UC_TAGS_OF_INTEREST.keySet());
- } catch (IOException | SAXException e) {
- LOG.warn("Problem parsing html in {}; backing off to treat string as text",
- evalFilePaths.getExtractFile().toAbsolutePath().toString(), e);
-
- return new ContentTags(s, true);
- }
- } else if (
- evalFilePaths.getExtractFile().getFileName().toString().toLowerCase(Locale.ENGLISH).endsWith(".xhtml") ||
- (handlerClass != null && handlerClass.equals(ToXMLContentHandler.class.getSimpleName()))) {
- try {
- return ContentTagParser.parseXML(s, UC_TAGS_OF_INTEREST.keySet());
- } catch (TikaException | IOException | SAXException e) {
- LOG.warn("Problem parsing xhtml in {}; backing off to html parser",
- evalFilePaths.getExtractFile().toAbsolutePath().toString(), e);
- try {
- ContentTags contentTags = ContentTagParser.parseHTML(s, UC_TAGS_OF_INTEREST.keySet());
- contentTags.setParseException(true);
- return contentTags;
- } catch (IOException | SAXException e2) {
- LOG.warn("Problem parsing html in {}; backing off to treat string as text",
- evalFilePaths.getExtractFile().toAbsolutePath().toString(), e2);
- }
- return new ContentTags(s, true);
- }
- }
- return new ContentTags(s);
- }
-
-
-}
-
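
Most of AbstractProfiler is app-side plumbing (db tables, exception
bucketing), but the text-stats pipeline it wired up is the part that survives
in tika-eval-core. A minimal, self-contained sketch of that API, with the
constructors and the Map<Class, Object> result keyed by calculator class taken
from the deleted code above, and the org.apache.tika.eval.core.* package names
inferred from the rename entries in the diffstat:

    import java.util.ArrayList;
    import java.util.List;
    import java.util.Map;

    import org.apache.tika.eval.core.langid.LanguageIDWrapper;
    import org.apache.tika.eval.core.textstats.BasicTokenCountStatsCalculator;
    import org.apache.tika.eval.core.textstats.CompositeTextStatsCalculator;
    import org.apache.tika.eval.core.textstats.TextStatsCalculator;
    import org.apache.tika.eval.core.textstats.TokenEntropy;
    import org.apache.tika.eval.core.tokens.AnalyzerManager;

    public class TextStatsSketch {
        public static void main(String[] args) throws Exception {
            // Same wiring as initAnalyzersAndTokenCounter() above,
            // trimmed to two calculators.
            AnalyzerManager analyzerManager = AnalyzerManager.newInstance(200000);
            List<TextStatsCalculator> calculators = new ArrayList<>();
            calculators.add(new BasicTokenCountStatsCalculator());
            calculators.add(new TokenEntropy());

            CompositeTextStatsCalculator calc = new CompositeTextStatsCalculator(
                    calculators, analyzerManager.getGeneralAnalyzer(),
                    new LanguageIDWrapper());

            Map<Class, Object> stats =
                    calc.calculate("the quick brown fox jumps over the lazy dog");
            // Each calculator keys its result by its own class, as in
            // writeContentData() above.
            System.out.println("token entropy rate: " + stats.get(TokenEntropy.class));
        }
    }

The tika-example module's TextStatsFromTikaEval, also touched in this commit,
exercises the same calculator API.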
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/EvalFilePaths.java b/tika-eval/src/main/java/org/apache/tika/eval/EvalFilePaths.java
deleted file mode 100644
index a760b86..0000000
--- a/tika-eval/src/main/java/org/apache/tika/eval/EvalFilePaths.java
+++ /dev/null
@@ -1,118 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.tika.eval;
-
-import static org.apache.tika.eval.AbstractProfiler.NON_EXISTENT_FILE_LENGTH;
-
-import java.io.IOException;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.util.Objects;
-
-/**
- * Simple struct to keep track of relative path of source file (
- * original binary file, e.g. /subdir/document1.doc)
- * and the extract file (e.g. /subdir/document1.doc.json).
- */
-class EvalFilePaths {
-
- private final Path relativeSourceFilePath;
- private final Path extractFile;
-
- private long sourceFileLength = NON_EXISTENT_FILE_LENGTH;
- private long extractFileLength = NON_EXISTENT_FILE_LENGTH;
-
-
- public EvalFilePaths(Path relativeSourceFilePath, Path extractFile, long srcFileLen) {
- this(relativeSourceFilePath, extractFile);
- this.sourceFileLength = srcFileLen;
- }
-
- public EvalFilePaths(Path relativeSourceFilePath, Path extractFile) {
- if (extractFile != null && Files.isRegularFile(extractFile)) {
- try {
- extractFileLength = Files.size(extractFile);
- } catch (IOException e) {
- //swallow ?
- }
- }
- this.relativeSourceFilePath = relativeSourceFilePath;
- this.extractFile = extractFile;
- }
-
- public Path getRelativeSourceFilePath() {
- return relativeSourceFilePath;
- }
-
- //this path may or may not exist and it could be null!
- public Path getExtractFile() {
- return extractFile;
- }
-
- //if it doesn't exist, it'll be -1l.
- public long getSourceFileLength() {
- return sourceFileLength;
- }
-
- public long getExtractFileLength() {
- return extractFileLength;
- }
-
- @Override
- public boolean equals(Object o) {
- if (this == o) {
- return true;
- }
- if (o == null || getClass() != o.getClass()) {
- return false;
- }
-
- EvalFilePaths that = (EvalFilePaths) o;
-
- if (sourceFileLength != that.sourceFileLength) {
- return false;
- }
- if (extractFileLength != that.extractFileLength) {
- return false;
- }
- if (!Objects.equals(relativeSourceFilePath, that.relativeSourceFilePath)) {
- return false;
- }
- return Objects.equals(extractFile, that.extractFile);
-
- }
-
- @Override
- public int hashCode() {
- int result = relativeSourceFilePath != null ? relativeSourceFilePath.hashCode() : 0;
- result = 31 * result + (extractFile != null ? extractFile.hashCode() : 0);
- result = 31 * result + (int) (sourceFileLength ^ (sourceFileLength >>> 32));
- result = 31 * result + (int) (extractFileLength ^ (extractFileLength >>> 32));
- return result;
- }
-
- @Override
- public String toString() {
- return "EvalFilePaths{" +
- "relativeSourceFilePath=" + relativeSourceFilePath +
- ", extractFile=" + extractFile +
- ", sourceFileLength=" + sourceFileLength +
- ", extractFileLength=" + extractFileLength +
- '}';
- }
-}
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/ExtractComparer.java b/tika-eval/src/main/java/org/apache/tika/eval/ExtractComparer.java
deleted file mode 100644
index 79df621..0000000
--- a/tika-eval/src/main/java/org/apache/tika/eval/ExtractComparer.java
+++ /dev/null
@@ -1,538 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.eval;
-
-import java.io.IOException;
-import java.nio.file.Path;
-import java.sql.Types;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
-import java.util.concurrent.ArrayBlockingQueue;
-
-import org.apache.commons.cli.HelpFormatter;
-import org.apache.commons.cli.Option;
-import org.apache.commons.cli.Options;
-import org.apache.commons.io.FilenameUtils;
-import org.apache.tika.batch.FileResource;
-import org.apache.tika.batch.fs.FSProperties;
-import org.apache.tika.config.TikaConfig;
-import org.apache.tika.eval.db.ColInfo;
-import org.apache.tika.eval.db.Cols;
-import org.apache.tika.eval.db.TableInfo;
-import org.apache.tika.eval.io.ExtractReader;
-import org.apache.tika.eval.io.ExtractReaderException;
-import org.apache.tika.eval.io.IDBWriter;
-import org.apache.tika.eval.textstats.BasicTokenCountStatsCalculator;
-import org.apache.tika.eval.tokens.ContrastStatistics;
-import org.apache.tika.eval.tokens.TokenContraster;
-import org.apache.tika.eval.tokens.TokenCounts;
-import org.apache.tika.eval.tokens.TokenIntPair;
-import org.apache.tika.eval.util.ContentTags;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler;
-
-public class ExtractComparer extends AbstractProfiler {
-
- static Options OPTIONS;
- static {
- Option extractsA = new Option("extractsA", true, "directory for extractsA files");
- extractsA.setRequired(true);
-
- Option extractsB = new Option("extractsB", true, "directory for extractsB files");
- extractsB.setRequired(true);
-
- Option inputDir = new Option("inputDir", true,
- "optional: directory of original binary input files if it exists " +
- "or can be the same as -extractsA or -extractsB. If not specified, -inputDir=-extractsA");
-
-
- OPTIONS = new Options()
- .addOption(extractsA)
- .addOption(extractsB)
- .addOption(inputDir)
- .addOption("bc", "optional: tika-batch config file")
- .addOption("numConsumers", true, "optional: number of consumer threads")
- .addOption(new Option("alterExtract", true,
- "for json-formatted extract files, " +
- "process full metadata list ('as_is'=default), " +
- "take just the first/container document ('first_only'), " +
- "concatenate all content into the first metadata item ('concatenate_content')"))
- .addOption("minExtractLength", true, "minimum extract length to process (in bytes)")
- .addOption("maxExtractLength", true, "maximum extract length to process (in bytes)")
- .addOption("db", true, "db file to which to write results")
- .addOption("jdbc", true, "EXPERT: full jdbc connection string. Must specify this or -db <h2db>")
- .addOption("jdbcDriver", true, "EXPERT: jdbc driver, or specify via -Djdbc.driver")
- .addOption("tablePrefixA", true, "EXPERT: optional prefix for table names for A")
- .addOption("tablePrefixB", true, "EXPERT: optional prefix for table names for B")
- .addOption("drop", false, "drop tables if they exist")
- .addOption("maxFilesToAdd", true, "maximum number of files to add to the crawler")
- .addOption("maxTokens", true, "maximum tokens to process, default=200000")
- .addOption("maxContentLength", true, "truncate content beyond this length for calculating 'contents' stats, default=1000000")
- .addOption("maxContentLengthForLangId", true, "truncate content beyond this length for language id, default=50000")
- .addOption("defaultLangCode", true, "which language to use for common words if no 'common words' file exists for the langid result")
- ;
- }
-
- private static final String DIGEST_KEY_PREFIX = TikaCoreProperties.TIKA_META_PREFIX+
- "digest"+TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER;
-
- public static void USAGE() {
- HelpFormatter helpFormatter = new HelpFormatter();
- helpFormatter.printHelp(
- 80,
- "java -jar tika-eval-x.y.jar Compare -extractsA extractsA -extractsB extractsB -db mydb",
- "Tool: Compare",
- ExtractComparer.OPTIONS,
- "Note: for the default h2 db, do not include the .mv.db at the end of the db name.");
- }
-
- private final static String FIELD_A = "fa";
- private final static String FIELD_B = "fb";
-
- public static TableInfo REF_PAIR_NAMES = new TableInfo("pair_names",
- new ColInfo(Cols.DIR_NAME_A, Types.VARCHAR, 128),
- new ColInfo(Cols.DIR_NAME_B, Types.VARCHAR, 128)
- );
-
- public static TableInfo COMPARISON_CONTAINERS = new TableInfo("containers",
- new ColInfo(Cols.CONTAINER_ID, Types.INTEGER, "PRIMARY KEY"),
- new ColInfo(Cols.FILE_PATH, Types.VARCHAR, FILE_PATH_MAX_LEN),
- new ColInfo(Cols.FILE_EXTENSION, Types.VARCHAR, 12),
- new ColInfo(Cols.LENGTH, Types.BIGINT),
- new ColInfo(Cols.EXTRACT_FILE_LENGTH_A, Types.BIGINT),
- new ColInfo(Cols.EXTRACT_FILE_LENGTH_B, Types.BIGINT)
- );
-
- public static TableInfo CONTENT_COMPARISONS = new TableInfo("content_comparisons",
- new ColInfo(Cols.ID, Types.INTEGER, "PRIMARY KEY"),
- new ColInfo(Cols.TOP_10_UNIQUE_TOKEN_DIFFS_A, Types.VARCHAR, 1024),
- new ColInfo(Cols.TOP_10_UNIQUE_TOKEN_DIFFS_B, Types.VARCHAR, 1024),
- new ColInfo(Cols.TOP_10_MORE_IN_A, Types.VARCHAR, 1024),
- new ColInfo(Cols.TOP_10_MORE_IN_B, Types.VARCHAR, 1024),
- new ColInfo(Cols.DICE_COEFFICIENT, Types.FLOAT),
- new ColInfo(Cols.OVERLAP, Types.FLOAT)
- );
-
- public static TableInfo PROFILES_A = new TableInfo( "profiles_a",
- ExtractProfiler.PROFILE_TABLE.getColInfos());
-
- public static TableInfo PROFILES_B = new TableInfo( "profiles_b",
- ExtractProfiler.PROFILE_TABLE.getColInfos());
-
- public static TableInfo EMBEDDED_FILE_PATH_TABLE_A = new TableInfo( "emb_path_a",
- ExtractProfiler.EMBEDDED_FILE_PATH_TABLE.getColInfos());
-
- public static TableInfo EMBEDDED_FILE_PATH_TABLE_B = new TableInfo( "emb_path_b",
- ExtractProfiler.EMBEDDED_FILE_PATH_TABLE.getColInfos());
-
-
- public static TableInfo CONTENTS_TABLE_A = new TableInfo( "contents_a",
- ExtractProfiler.CONTENTS_TABLE.getColInfos());
-
- public static TableInfo CONTENTS_TABLE_B = new TableInfo( "contents_b",
- ExtractProfiler.CONTENTS_TABLE.getColInfos());
-
- public static TableInfo TAGS_TABLE_A = new TableInfo( "tags_a",
- ExtractProfiler.TAGS_TABLE.getColInfos());
-
- public static TableInfo TAGS_TABLE_B = new TableInfo( "tags_b",
- ExtractProfiler.TAGS_TABLE.getColInfos());
-
-
- public static TableInfo EXCEPTION_TABLE_A = new TableInfo ("exceptions_a",
- ExtractProfiler.EXCEPTION_TABLE.getColInfos());
-
- public static TableInfo EXCEPTION_TABLE_B = new TableInfo ("exceptions_b",
- ExtractProfiler.EXCEPTION_TABLE.getColInfos());
-
- public static TableInfo EXTRACT_EXCEPTION_TABLE_A = new TableInfo("extract_exceptions_a",
- ExtractProfiler.EXTRACT_EXCEPTION_TABLE.getColInfos());
- public static TableInfo EXTRACT_EXCEPTION_TABLE_B = new TableInfo("extract_exceptions_b",
- ExtractProfiler.EXTRACT_EXCEPTION_TABLE.getColInfos());
-
-
- //need to parameterize?
- private final TikaConfig config = TikaConfig.getDefaultConfig();
-
- private final Path inputDir;
- private final Path extractsA;
- private final Path extractsB;
-
- private final TokenContraster tokenContraster = new TokenContraster();
- private final ExtractReader extractReader;
-
- public ExtractComparer(ArrayBlockingQueue<FileResource> queue,
- Path inputDir, Path extractsA, Path extractsB, ExtractReader extractReader,
- IDBWriter writer) {
- super(queue, writer);
- this.inputDir = inputDir;
- this.extractsA = extractsA;
- this.extractsB = extractsB;
- this.extractReader = extractReader;
- }
-
- @Override
- public boolean processFileResource(FileResource fileResource) {
- Metadata metadata = fileResource.getMetadata();
- EvalFilePaths fpsA = null;
- EvalFilePaths fpsB = null;
-
- if (inputDir != null && (inputDir.equals(extractsA) ||
- inputDir.equals(extractsB))) {
- //crawling an extract dir
- fpsA = getPathsFromExtractCrawl(metadata, extractsA);
- fpsB = getPathsFromExtractCrawl(metadata, extractsB);
-
- } else {
- fpsA = getPathsFromSrcCrawl(metadata, inputDir, extractsA);
- fpsB = getPathsFromSrcCrawl(metadata, inputDir, extractsB);
- }
-
- try {
- compareFiles(fpsA, fpsB);
- } catch (Throwable e) {
- //this should be cataclysmic...
- throw new RuntimeException("Exception while working on: " +
- metadata.get(FSProperties.FS_REL_PATH), e);
- }
- return true;
- }
-
- //protected for testing, should find better way so that this can be private!
- protected void compareFiles(EvalFilePaths fpsA,
- EvalFilePaths fpsB) throws IOException {
-
- ExtractReaderException.TYPE extractExceptionA = null;
- ExtractReaderException.TYPE extractExceptionB = null;
-
-        List<Metadata> metadataListA = null;
-        //extractExceptionA is always null at this point; no need to guard the load
-        try {
-            metadataListA = extractReader.loadExtract(fpsA.getExtractFile());
-        } catch (ExtractReaderException e) {
-            extractExceptionA = e.getType();
-        }
-
- List<Metadata> metadataListB = null;
- try {
- metadataListB = extractReader.loadExtract(fpsB.getExtractFile());
- } catch (ExtractReaderException e) {
- extractExceptionB = e.getType();
- }
-
- //array indices for those metadata items handled in B
- Set<Integer> handledB = new HashSet<>();
- String containerID = Integer.toString(ID.getAndIncrement());
- //container table
- Map<Cols, String> contData = new HashMap<>();
- contData.put(Cols.CONTAINER_ID, containerID);
- contData.put(Cols.FILE_PATH, fpsA.getRelativeSourceFilePath().toString());
- long srcFileLength = getSourceFileLength(metadataListA, metadataListB);
- contData.put(Cols.LENGTH,
- srcFileLength > NON_EXISTENT_FILE_LENGTH ?
- Long.toString(srcFileLength) : "");
- contData.put(Cols.FILE_EXTENSION,
- FilenameUtils.getExtension(fpsA.getRelativeSourceFilePath().getFileName().toString()));
-
- long extractFileLengthA = getFileLength(fpsA.getExtractFile());
- contData.put(Cols.EXTRACT_FILE_LENGTH_A, extractFileLengthA > NON_EXISTENT_FILE_LENGTH ?
- Long.toString(extractFileLengthA) : "");
-
- long extractFileLengthB = getFileLength(fpsB.getExtractFile());
- contData.put(Cols.EXTRACT_FILE_LENGTH_B, extractFileLengthB > NON_EXISTENT_FILE_LENGTH ?
- Long.toString(extractFileLengthB) : "");
-
- writer.writeRow(COMPARISON_CONTAINERS, contData);
-
- if (extractExceptionA != null) {
- writeExtractException(EXTRACT_EXCEPTION_TABLE_A, containerID, fpsA.getRelativeSourceFilePath().toString(),
- extractExceptionA);
- }
- if (extractExceptionB != null) {
- writeExtractException(EXTRACT_EXCEPTION_TABLE_B, containerID, fpsB.getRelativeSourceFilePath().toString(),
- extractExceptionB);
- }
-
- if (metadataListA == null && metadataListB == null) {
- return;
- }
- List<Integer> numAttachmentsA = countAttachments(metadataListA);
- List<Integer> numAttachmentsB = countAttachments(metadataListB);
-
- String sharedDigestKey = findSharedDigestKey(metadataListA, metadataListB);
- Map<Class, Object> tokenStatsA = null;
- Map<Class, Object> tokenStatsB = null;
- //now get that metadata
- if (metadataListA != null) {
- for (int i = 0; i < metadataListA.size(); i++) {
- //the first file should have the same id as the container id
- String fileId = (i == 0) ? containerID : Integer.toString(ID.getAndIncrement());
- Metadata metadataA = metadataListA.get(i);
- ContentTags contentTagsA = getContent(fpsA, metadataA);
- ContentTags contentTagsB = ContentTags.EMPTY_CONTENT_TAGS;
- Metadata metadataB = null;
-
- //TODO: shouldn't be fileA!!!!
- writeTagData(fileId, contentTagsA, TAGS_TABLE_A);
-
- writeProfileData(fpsA, i, contentTagsA, metadataA, fileId, containerID, numAttachmentsA, PROFILES_A);
- writeExceptionData(fileId, metadataA, EXCEPTION_TABLE_A);
- int matchIndex = getMatch(i, sharedDigestKey,
- handledB, metadataListA, metadataListB);
-
- if (matchIndex > -1 && ! handledB.contains(matchIndex)) {
- metadataB = metadataListB.get(matchIndex);
- handledB.add(matchIndex);
- }
- if (metadataB != null) {
- contentTagsB = getContent(fpsB, metadataB);
- writeTagData(fileId, contentTagsB, TAGS_TABLE_B);
- writeProfileData(fpsB, i, contentTagsB, metadataB, fileId, containerID, numAttachmentsB, PROFILES_B);
- writeExceptionData(fileId, metadataB, EXCEPTION_TABLE_B);
- }
- writeEmbeddedFilePathData(i, fileId, metadataA, metadataB);
- //write content
- try {
- tokenStatsA = calcTextStats(contentTagsA);
- writeContentData(fileId, tokenStatsA, CONTENTS_TABLE_A);
- tokenStatsB = calcTextStats(contentTagsB);
- if (metadataB != null) {
- writeContentData(fileId, tokenStatsB, CONTENTS_TABLE_B);
- }
- } catch (IOException e) {
- throw new RuntimeException(e);
- }
- if (metadataB != null) {
- TokenCounts tokenCountsA = (TokenCounts) tokenStatsA.get(BasicTokenCountStatsCalculator.class);
- TokenCounts tokenCountsB = (TokenCounts) tokenStatsB.get(BasicTokenCountStatsCalculator.class);
-                    //arbitrary decision: only run the comparisons if there are > 10 tokens total;
-                    //we may want to bump that threshold higher
- if (tokenCountsA.getTotalTokens()
- + tokenCountsB.getTotalTokens() > 10) {
- Map<Cols, String> data = new HashMap<>();
- data.put(Cols.ID, fileId);
-
- ContrastStatistics contrastStatistics =
- tokenContraster.calculateContrastStatistics(
- tokenCountsA,
- tokenCountsB);
-
- writeContrasts(data, contrastStatistics);
- writer.writeRow(CONTENT_COMPARISONS, data);
- }
- }
- }
- }
- //now try to get any Metadata objects in B
- //that haven't yet been handled.
- if (metadataListB != null) {
- for (int i = 0; i < metadataListB.size(); i++) {
- if (handledB.contains(i)) {
- continue;
- }
- Metadata metadataB = metadataListB.get(i);
- ContentTags contentTagsB = getContent(fpsB, metadataB);
- //the first file should have the same id as the container id
- String fileId = (i == 0) ? containerID : Integer.toString(ID.getAndIncrement());
- writeTagData(fileId, contentTagsB, TAGS_TABLE_B);
- writeProfileData(fpsB, i, contentTagsB, metadataB, fileId, containerID, numAttachmentsB, PROFILES_B);
- writeEmbeddedFilePathData(i, fileId, null, metadataB);
- writeExceptionData(fileId, metadataB, EXCEPTION_TABLE_B);
-
- //write content
- try {
- tokenStatsB = calcTextStats(contentTagsB);
- writeContentData(fileId, tokenStatsB, CONTENTS_TABLE_B);
- } catch (IOException e) {
- throw new RuntimeException(e);
- }
- }
- }
- }
-
-    /**
-     * Checks only the first item in each list.
-     * @param metadataListA metadata list from extract A; may be null
-     * @param metadataListB metadata list from extract B; may be null
-     * @return the first digest key shared by both first items, or null if there is none
-     */
- private String findSharedDigestKey(List<Metadata> metadataListA, List<Metadata> metadataListB) {
- if (metadataListB == null || metadataListB.size() == 0) {
- return null;
- }
- Set<String> digestA = new HashSet<>();
- if (metadataListA != null) {
- for (String n : metadataListA.get(0).names()) {
- if (n.startsWith(DIGEST_KEY_PREFIX)) {
- digestA.add(n);
- }
- }
- }
- Metadata bMain = metadataListB.get(0);
- for (String n : bMain.names()) {
- if (digestA.contains(n)) {
- return n;
- }
- }
- return null;
- }
-
- private void writeEmbeddedFilePathData(int i, String fileId, Metadata mA, Metadata mB) {
- //container file, don't write anything
- if (i == 0) {
- return;
- }
- String pathA = null;
- String pathB = null;
- if (mA != null) {
- pathA = mA.get(AbstractRecursiveParserWrapperHandler.EMBEDDED_RESOURCE_PATH);
- }
- if (mB != null) {
- pathB = mB.get(AbstractRecursiveParserWrapperHandler.EMBEDDED_RESOURCE_PATH);
- }
- if (pathA != null) {
- Map<Cols, String> d = new HashMap<>();
- d.put(Cols.ID, fileId);
- d.put(Cols.EMBEDDED_FILE_PATH, pathA);
- try {
- writer.writeRow(EMBEDDED_FILE_PATH_TABLE_A, d);
- } catch (IOException e) {
- throw new RuntimeException(e);
- }
- }
- if (pathB != null &&
- (pathA == null || ! pathA.equals(pathB))) {
- Map<Cols, String> d = new HashMap<>();
- d.put(Cols.ID, fileId);
- d.put(Cols.EMBEDDED_FILE_PATH, pathB);
- try {
- writer.writeRow(EMBEDDED_FILE_PATH_TABLE_B, d);
- } catch (IOException e) {
- throw new RuntimeException(e);
- }
- }
- }
-
- private long getSourceFileLength(List<Metadata> metadataListA, List<Metadata> metadataListB) {
- long len = getSourceFileLength(metadataListA);
- if (len > NON_EXISTENT_FILE_LENGTH) {
- return len;
- }
- return getSourceFileLength(metadataListB);
- }
-
-
-    /**
-     * Tries to find the matching metadata in metadataListB: first by a shared
-     * digest, then by AbstractRecursiveParserWrapperHandler.EMBEDDED_RESOURCE_PATH,
-     * and finally by index if both lists are the same size.
-     *
-     * @param aIndex index of the item in metadataListA to match
-     * @param metadataListA metadata list from extract A
-     * @param metadataListB metadata list from extract B; may be null
-     * @return index of the match in metadataListB, or -1 if none is found
-     */
- private int getMatch(int aIndex, String sharedDigestKey, Set<Integer> handledB,
- List<Metadata> metadataListA,
- List<Metadata> metadataListB) {
- //TODO: could make this more robust
- if (metadataListB == null || metadataListB.size() == 0) {
- return -1;
- }
- //assume first is always the container file
- if (aIndex == 0) {
- return 0;
- }
-
- if (sharedDigestKey != null) {
- //first try to find matching digests
- //this does not elegantly handle multiple matching digests
- return findMatchingDigests(sharedDigestKey, handledB,
- metadataListA.get(aIndex), metadataListB);
- }
-
- //assume same embedded resource path. Not always true!
- Metadata thisMetadata = metadataListA.get(aIndex);
- String embeddedPath = thisMetadata.get(AbstractRecursiveParserWrapperHandler.EMBEDDED_RESOURCE_PATH);
- if (embeddedPath != null) {
- for (int j = 0; j < metadataListB.size(); j++) {
- String thatEmbeddedPath = metadataListB.get(j).get(
- AbstractRecursiveParserWrapperHandler.EMBEDDED_RESOURCE_PATH);
- if (embeddedPath.equals(thatEmbeddedPath)) {
- return j;
- }
- }
- }
-
- //last resort, if lists are same size, guess the same index
- if (metadataListA.size() == metadataListB.size()) {
-            //assume no rearrangements if lists are the same size
- return aIndex;
- }
- return -1;
- }
-
- private int findMatchingDigests(String sharedDigestKey,
- Set<Integer> handledB,
- Metadata metadata, List<Metadata> metadataListB) {
- String digestA = metadata.get(sharedDigestKey);
- if (digestA == null) {
- return -1;
- }
-
- for (int i = 0; i < metadataListB.size(); i++) {
- if (handledB.contains(i)) {
- continue;
- }
- Metadata mB = metadataListB.get(i);
- String digestB = mB.get(sharedDigestKey);
- if (digestA.equalsIgnoreCase(digestB)) {
- return i;
- }
- }
- return -1;
- }
-
- private void writeContrasts(Map<Cols, String> data, ContrastStatistics contrastStatistics) {
- writeContrastString(data, Cols.TOP_10_MORE_IN_A, contrastStatistics.getTopNMoreA());
- writeContrastString(data, Cols.TOP_10_MORE_IN_B, contrastStatistics.getTopNMoreB());
- writeContrastString(data, Cols.TOP_10_UNIQUE_TOKEN_DIFFS_A, contrastStatistics.getTopNUniqueA());
- writeContrastString(data, Cols.TOP_10_UNIQUE_TOKEN_DIFFS_B, contrastStatistics.getTopNUniqueB());
- data.put(Cols.OVERLAP, Double.toString(contrastStatistics.getOverlap()));
- data.put(Cols.DICE_COEFFICIENT, Double.toString(contrastStatistics.getDiceCoefficient()));
-
- }
-
- private void writeContrastString(Map<Cols, String> data, Cols col, TokenIntPair[] tokenIntPairs) {
-
- int i = 0;
- StringBuilder sb = new StringBuilder();
- for (TokenIntPair p : tokenIntPairs) {
- if (i++ > 0) {
- sb.append(" | ");
- }
- sb.append(p.getToken()).append(": ").append(p.getValue());
- }
- data.put(col, sb.toString());
- }
-}
\ No newline at end of file
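Note: the deleted ExtractComparer was normally driven through tika-batch, but its constructor (above) can also be called directly. A minimal sketch, assuming an ExtractReader and IDBWriter have already been configured elsewhere; the queue size and paths are illustrative only:

    import java.nio.file.Path;
    import java.nio.file.Paths;
    import java.util.concurrent.ArrayBlockingQueue;

    import org.apache.tika.batch.FileResource;
    import org.apache.tika.eval.ExtractComparer;
    import org.apache.tika.eval.io.ExtractReader;
    import org.apache.tika.eval.io.IDBWriter;

    public class CompareSketch {
        //mirrors the constructor signature of the deleted class
        static ExtractComparer build(ExtractReader reader, IDBWriter writer) {
            ArrayBlockingQueue<FileResource> queue = new ArrayBlockingQueue<>(1000);
            Path extractsA = Paths.get("extractsA");
            Path extractsB = Paths.get("extractsB");
            //passing extractsA as inputDir triggers the "crawling an extract dir" branch
            return new ExtractComparer(queue, extractsA, extractsA, extractsB, reader, writer);
        }
    }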
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/ExtractProfiler.java b/tika-eval/src/main/java/org/apache/tika/eval/ExtractProfiler.java
deleted file mode 100644
index 5fad576..0000000
--- a/tika-eval/src/main/java/org/apache/tika/eval/ExtractProfiler.java
+++ /dev/null
@@ -1,281 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.eval;
-
-import java.io.IOException;
-import java.nio.file.Path;
-import java.sql.Types;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-import java.util.concurrent.ArrayBlockingQueue;
-
-import org.apache.commons.cli.HelpFormatter;
-import org.apache.commons.cli.Option;
-import org.apache.commons.cli.Options;
-import org.apache.tika.batch.FileResource;
-import org.apache.tika.eval.db.ColInfo;
-import org.apache.tika.eval.db.Cols;
-import org.apache.tika.eval.db.TableInfo;
-import org.apache.tika.eval.io.ExtractReader;
-import org.apache.tika.eval.io.ExtractReaderException;
-import org.apache.tika.eval.io.IDBWriter;
-import org.apache.tika.eval.util.ContentTags;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler;
-
-public class ExtractProfiler extends AbstractProfiler {
-
- static Options OPTIONS;
- static {
- //By the time this commandline is parsed, there should be both an extracts and an inputDir
- Option extracts = new Option("extracts", true, "directory for extract files");
- extracts.setRequired(true);
-
- Option inputDir = new Option("inputDir", true,
- "optional: directory for original binary input documents."+
- " If not specified, -extracts is crawled as is.");
-
- OPTIONS = new Options()
- .addOption(extracts)
- .addOption(inputDir)
- .addOption("bc", "optional: tika-batch config file")
- .addOption("numConsumers", true, "optional: number of consumer threads")
- .addOption(new Option("alterExtract", true,
- "for json-formatted extract files, " +
- "process full metadata list ('as_is'=default), " +
- "take just the first/container document ('first_only'), " +
- "concatenate all content into the first metadata item ('concatenate_content')"))
- .addOption("minExtractLength", true, "minimum extract length to process (in bytes)")
- .addOption("maxExtractLength", true, "maximum extract length to process (in bytes)")
- .addOption("db", true, "db file to which to write results")
- .addOption("jdbc", true, "EXPERT: full jdbc connection string. Must specify this or -db <h2db>")
- .addOption("jdbcDriver", true, "EXPERT: jdbc driver, or specify via -Djdbc.driver")
- .addOption("tablePrefix", true, "EXPERT: optional prefix for table names")
- .addOption("drop", false, "drop tables if they exist")
- .addOption("maxFilesToAdd", true, "maximum number of files to add to the crawler")
- .addOption("maxTokens", true, "maximum tokens to process, default=200000")
- .addOption("maxContentLength", true, "truncate content beyond this length for calculating 'contents' stats, default=1000000")
- .addOption("maxContentLengthForLangId", true, "truncate content beyond this length for language id, default=50000")
- .addOption("defaultLangCode", true, "which language to use for common words if no 'common words' file exists for the langid result")
-
- ;
-
- }
-
- public static void USAGE() {
- HelpFormatter helpFormatter = new HelpFormatter();
- helpFormatter.printHelp(
- 80,
- "java -jar tika-eval-x.y.jar Profile -extracts extracts -db mydb [-inputDir input]",
- "Tool: Profile",
- ExtractProfiler.OPTIONS,
- "Note: for the default h2 db, do not include the .mv.db at the end of the db name.");
- }
-
- private final static String FIELD = "f";
-
- public static TableInfo EXTRACT_EXCEPTION_TABLE = new TableInfo("extract_exceptions",
- new ColInfo(Cols.CONTAINER_ID, Types.INTEGER),
- new ColInfo(Cols.FILE_PATH, Types.VARCHAR, FILE_PATH_MAX_LEN),
- new ColInfo(Cols.EXTRACT_EXCEPTION_ID, Types.INTEGER),
- new ColInfo(Cols.PARSE_ERROR_ID, Types.INTEGER)
- );
-
- public static TableInfo EXCEPTION_TABLE = new TableInfo("parse_exceptions",
- new ColInfo(Cols.ID, Types.INTEGER, "PRIMARY KEY"),
- new ColInfo(Cols.ORIG_STACK_TRACE, Types.VARCHAR, 8192),
- new ColInfo(Cols.SORT_STACK_TRACE, Types.VARCHAR, 8192),
- new ColInfo(Cols.PARSE_EXCEPTION_ID, Types.INTEGER)
- );
-
-
- public static TableInfo CONTAINER_TABLE = new TableInfo("containers",
- new ColInfo(Cols.CONTAINER_ID, Types.INTEGER, "PRIMARY KEY"),
- new ColInfo(Cols.FILE_PATH, Types.VARCHAR, FILE_PATH_MAX_LEN),
- new ColInfo(Cols.LENGTH, Types.BIGINT),
- new ColInfo(Cols.EXTRACT_FILE_LENGTH, Types.BIGINT)
- );
-
- public static TableInfo PROFILE_TABLE = new TableInfo("profiles",
- new ColInfo(Cols.ID, Types.INTEGER, "PRIMARY KEY"),
- new ColInfo(Cols.CONTAINER_ID, Types.INTEGER),
- new ColInfo(Cols.FILE_NAME, Types.VARCHAR, 256),
- new ColInfo(Cols.MD5, Types.CHAR, 32),
- new ColInfo(Cols.LENGTH, Types.BIGINT),
- new ColInfo(Cols.IS_EMBEDDED, Types.BOOLEAN),
- new ColInfo(Cols.FILE_EXTENSION, Types.VARCHAR, 12),
- new ColInfo(Cols.MIME_ID, Types.INTEGER),
- new ColInfo(Cols.ELAPSED_TIME_MILLIS, Types.INTEGER),
- new ColInfo(Cols.NUM_ATTACHMENTS, Types.INTEGER),
- new ColInfo(Cols.NUM_METADATA_VALUES, Types.INTEGER),
- new ColInfo(Cols.NUM_PAGES, Types.INTEGER),
- new ColInfo(Cols.HAS_CONTENT, Types.BOOLEAN)
- );
-
- public static TableInfo EMBEDDED_FILE_PATH_TABLE = new TableInfo("emb_file_names",
- new ColInfo(Cols.ID, Types.INTEGER, "PRIMARY KEY"),
- new ColInfo(Cols.EMBEDDED_FILE_PATH, Types.VARCHAR, 1024)
- );
-
- public static TableInfo CONTENTS_TABLE = new TableInfo("contents",
- new ColInfo(Cols.ID, Types.INTEGER, "PRIMARY KEY"),
- new ColInfo(Cols.CONTENT_LENGTH, Types.INTEGER),
- new ColInfo(Cols.NUM_UNIQUE_TOKENS, Types.INTEGER),
- new ColInfo(Cols.NUM_TOKENS, Types.INTEGER),
- new ColInfo(Cols.COMMON_TOKENS_LANG, Types.VARCHAR, 12),
- new ColInfo(Cols.NUM_UNIQUE_COMMON_TOKENS, Types.INTEGER),
- new ColInfo(Cols.NUM_COMMON_TOKENS, Types.INTEGER),
- new ColInfo(Cols.NUM_UNIQUE_ALPHABETIC_TOKENS, Types.INTEGER),
- new ColInfo(Cols.NUM_ALPHABETIC_TOKENS, Types.INTEGER),
- new ColInfo(Cols.TOP_N_TOKENS, Types.VARCHAR, 1024),
- new ColInfo(Cols.LANG_ID_1, Types.VARCHAR, 12),
- new ColInfo(Cols.LANG_ID_PROB_1, Types.FLOAT),
- new ColInfo(Cols.LANG_ID_2, Types.VARCHAR, 12),
- new ColInfo(Cols.LANG_ID_PROB_2, Types.FLOAT),
- new ColInfo(Cols.UNICODE_CHAR_BLOCKS, Types.VARCHAR, 1024),
- new ColInfo(Cols.TOKEN_ENTROPY_RATE, Types.FLOAT),
- new ColInfo(Cols.TOKEN_LENGTH_SUM, Types.INTEGER),
- new ColInfo(Cols.TOKEN_LENGTH_MEAN, Types.FLOAT),
- new ColInfo(Cols.TOKEN_LENGTH_STD_DEV, Types.FLOAT),
- new ColInfo(Cols.CONTENT_TRUNCATED_AT_MAX_LEN, Types.BOOLEAN)
- );
-
- public static TableInfo TAGS_TABLE = new TableInfo("tags",
- new ColInfo(Cols.ID, Types.INTEGER, "PRIMARY KEY"),
- new ColInfo(Cols.TAGS_A, Types.INTEGER),
- new ColInfo(Cols.TAGS_B, Types.INTEGER),
- new ColInfo(Cols.TAGS_DIV, Types.INTEGER),
- new ColInfo(Cols.TAGS_I, Types.INTEGER),
- new ColInfo(Cols.TAGS_IMG, Types.INTEGER),
- new ColInfo(Cols.TAGS_LI, Types.INTEGER),
- new ColInfo(Cols.TAGS_OL, Types.INTEGER),
- new ColInfo(Cols.TAGS_P, Types.INTEGER),
- new ColInfo(Cols.TAGS_TABLE, Types.INTEGER),
- new ColInfo(Cols.TAGS_TD, Types.INTEGER),
- new ColInfo(Cols.TAGS_TITLE, Types.INTEGER),
- new ColInfo(Cols.TAGS_TR, Types.INTEGER),
- new ColInfo(Cols.TAGS_U, Types.INTEGER),
- new ColInfo(Cols.TAGS_UL, Types.INTEGER),
- new ColInfo(Cols.TAGS_PARSE_EXCEPTION, Types.BOOLEAN)
- );
-
- private final Path inputDir;
- private final Path extracts;
- private final ExtractReader extractReader;
-
- public ExtractProfiler(ArrayBlockingQueue<FileResource> queue,
- Path inputDir, Path extracts,
- ExtractReader extractReader, IDBWriter dbWriter) {
- super(queue, dbWriter);
- this.inputDir = inputDir;
- this.extracts = extracts;
- this.extractReader = extractReader;
- }
-
- @Override
- public boolean processFileResource(FileResource fileResource) {
- Metadata metadata = fileResource.getMetadata();
- EvalFilePaths fps = null;
-
- if (inputDir != null && inputDir.equals(extracts)) {
- //crawling an extract dir
- fps = getPathsFromExtractCrawl(metadata, extracts);
- } else {
- fps = getPathsFromSrcCrawl(metadata, inputDir, extracts);
- }
- int containerId = ID.incrementAndGet();
- String containerIdString = Integer.toString(containerId);
-
- ExtractReaderException.TYPE extractExceptionType = null;
-
- List<Metadata> metadataList = null;
- try {
- metadataList = extractReader.loadExtract(fps.getExtractFile());
- } catch (ExtractReaderException e) {
- extractExceptionType = e.getType();
- }
-
- Map<Cols, String> contOutput = new HashMap<>();
- Long srcFileLen = getSourceFileLength(fps, metadataList);
- contOutput.put(Cols.LENGTH,
- srcFileLen > NON_EXISTENT_FILE_LENGTH ?
- Long.toString(srcFileLen): "");
- contOutput.put(Cols.CONTAINER_ID, containerIdString);
- contOutput.put(Cols.FILE_PATH, fps.getRelativeSourceFilePath().toString());
-
- if (fps.getExtractFileLength() > 0) {
- contOutput.put(Cols.EXTRACT_FILE_LENGTH,
- (fps.getExtractFile() == null) ?
- "" :
- Long.toString(fps.getExtractFileLength()));
- }
- try {
- writer.writeRow(CONTAINER_TABLE, contOutput);
- } catch (IOException e) {
- throw new RuntimeException(e);
- }
-
-
- if (extractExceptionType != null) {
- try {
- writeExtractException(EXTRACT_EXCEPTION_TABLE, containerIdString,
- fps.getRelativeSourceFilePath().toString(), extractExceptionType);
- } catch (IOException e) {
- throw new RuntimeException(e);
- }
- return true;
- }
-
- List<Integer> numAttachments = countAttachments(metadataList);
- int i = 0;
- for (Metadata m : metadataList) {
- ContentTags contentTags = getContent(fps, m);
- //the first file should have the same id as the container id
- String fileId = (i == 0) ? containerIdString : Integer.toString(ID.incrementAndGet());
- writeTagData(fileId, contentTags, TAGS_TABLE);
- writeProfileData(fps, i, contentTags, m, fileId, containerIdString, numAttachments, PROFILE_TABLE);
- writeEmbeddedPathData(i, fileId, m, EMBEDDED_FILE_PATH_TABLE);
- writeExceptionData(fileId, m, EXCEPTION_TABLE);
- try {
- Map<Class, Object> textStats = calcTextStats(contentTags);
- writeContentData(fileId, textStats, CONTENTS_TABLE);
- } catch (IOException e) {
- throw new RuntimeException(e);
- }
- i++;
- }
- return true;
- }
-
-
- private void writeEmbeddedPathData(int i, String fileId, Metadata m,
- TableInfo embeddedFilePathTable) {
- if (i == 0) {
- return;
- }
- Map<Cols, String> data = new HashMap<>();
- data.put(Cols.ID, fileId);
- data.put(Cols.EMBEDDED_FILE_PATH,
- m.get(AbstractRecursiveParserWrapperHandler.EMBEDDED_RESOURCE_PATH));
- try {
- writer.writeRow(embeddedFilePathTable, data);
- } catch (IOException e) {
- throw new RuntimeException(e);
- }
- }
-}
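Side note: the tables defined above land in whatever JDBC target the run is configured with (H2 by default). A hedged sketch of reading a few of those columns back after a Profile run; the db path is illustrative, this assumes no -tablePrefix was used, and it assumes the Cols enum names map directly to column names (H2 treats unquoted identifiers case-insensitively):

    import java.sql.Connection;
    import java.sql.DriverManager;
    import java.sql.ResultSet;
    import java.sql.Statement;

    public class ProfileQuerySketch {
        public static void main(String[] args) throws Exception {
            //for the default h2 db, omit the .mv.db suffix in the URL
            try (Connection conn = DriverManager.getConnection("jdbc:h2:file:./mydb");
                 Statement st = conn.createStatement();
                 ResultSet rs = st.executeQuery(
                         "SELECT FILE_NAME, MIME_ID, NUM_ATTACHMENTS FROM profiles LIMIT 10")) {
                while (rs.next()) {
                    System.out.println(rs.getString(1) + "\t" + rs.getInt(2) + "\t" + rs.getInt(3));
                }
            }
        }
    }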
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/FileProfiler.java b/tika-eval/src/main/java/org/apache/tika/eval/FileProfiler.java
deleted file mode 100644
index 65908b7..0000000
--- a/tika-eval/src/main/java/org/apache/tika/eval/FileProfiler.java
+++ /dev/null
@@ -1,195 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.eval;
-
-import org.apache.commons.cli.HelpFormatter;
-import org.apache.commons.cli.Option;
-import org.apache.commons.cli.Options;
-import org.apache.commons.codec.digest.DigestUtils;
-import org.apache.commons.io.FilenameUtils;
-import org.apache.tika.Tika;
-import org.apache.tika.batch.FileResource;
-import org.apache.tika.batch.fs.FSProperties;
-import org.apache.tika.detect.FileCommandDetector;
-import org.apache.tika.eval.db.ColInfo;
-import org.apache.tika.eval.db.Cols;
-import org.apache.tika.eval.db.TableInfo;
-import org.apache.tika.eval.io.IDBWriter;
-import org.apache.tika.io.TikaInputStream;
-import org.apache.tika.metadata.Metadata;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.sql.Types;
-import java.util.HashMap;
-import java.util.Map;
-import java.util.concurrent.ArrayBlockingQueue;
-
-/**
- * This class profiles actual files, as opposed to extracts (see, e.g., {@link ExtractProfiler}).
- * This does _not_ parse files, but does run file type identification and digests the
- * raw bytes.
- *
- * If the 'file' command is available on the command line, this will also run the
- * FileCommandDetector.
- */
-
-public class FileProfiler extends AbstractProfiler {
-//TODO: we should allow users to select digest type/encoding and file detector(s).
-
- private static final boolean HAS_FILE = FileCommandDetector.checkHasFile();
- private static final Logger LOG = LoggerFactory.getLogger(FileProfiler.class);
-
- static Options OPTIONS;
- static {
-
-        Option inputDir = new Option("inputDir", true,
-                "optional: directory of original binary files to profile");
-
- OPTIONS = new Options()
- .addOption(inputDir)
- .addOption("bc", "optional: tika-batch config file")
- .addOption("numConsumers", true, "optional: number of consumer threads")
- .addOption("db", true, "db file to which to write results")
- .addOption("jdbc", true, "EXPERT: full jdbc connection string. Must specify this or -db <h2db>")
- .addOption("jdbcDriver", true, "EXPERT: jdbc driver, or specify via -Djdbc.driver")
- .addOption("tablePrefix", true, "EXPERT: optional prefix for table names")
- .addOption("drop", false, "drop tables if they exist")
- .addOption("maxFilesToAdd", true, "maximum number of files to add to the crawler")
-
- ;
-
- }
-
- public static void USAGE() {
- HelpFormatter helpFormatter = new HelpFormatter();
- helpFormatter.printHelp(
- 80,
- "java -jar tika-eval-x.y.jar FileProfiler -inputDir docs -db mydb [-inputDir input]",
- "Tool: Profile",
- FileProfiler.OPTIONS,
- "Note: for the default h2 db, do not include the .mv.db at the end of the db name.");
- }
-
-
-
- public static TableInfo FILE_PROFILES = HAS_FILE ?
- new TableInfo("file_profiles",
- new ColInfo(Cols.FILE_PATH, Types.VARCHAR, 2048, "PRIMARY KEY"),
- new ColInfo(Cols.FILE_NAME, Types.VARCHAR, 2048),
- new ColInfo(Cols.FILE_EXTENSION, Types.VARCHAR, 24),
- new ColInfo(Cols.LENGTH, Types.BIGINT),
- new ColInfo(Cols.SHA256, Types.VARCHAR, 64),
- new ColInfo(Cols.TIKA_MIME_ID, Types.INTEGER),
- new ColInfo(Cols.FILE_MIME_ID, Types.INTEGER))
- :
- new TableInfo("file_profiles",
- new ColInfo(Cols.FILE_PATH, Types.VARCHAR, 2048, "PRIMARY KEY"),
- new ColInfo(Cols.FILE_NAME, Types.VARCHAR, 2048),
- new ColInfo(Cols.FILE_EXTENSION, Types.VARCHAR, 24),
- new ColInfo(Cols.LENGTH, Types.BIGINT),
- new ColInfo(Cols.SHA256, Types.VARCHAR, 64),
- new ColInfo(Cols.TIKA_MIME_ID, Types.INTEGER));
-
-
- public static TableInfo FILE_MIME_TABLE = new TableInfo("file_mimes",
- new ColInfo(Cols.MIME_ID, Types.INTEGER, "PRIMARY KEY"),
- new ColInfo(Cols.MIME_STRING, Types.VARCHAR, 256),
- new ColInfo(Cols.FILE_EXTENSION, Types.VARCHAR, 12)
- );
-
- public static final String DETECT_EXCEPTION = "detect-exception";
- private static final Tika TIKA = new Tika();
-
- private static final FileCommandDetector FILE_COMMAND_DETECTOR = new FileCommandDetector();
- private final Path inputDir;
-
- public FileProfiler(ArrayBlockingQueue<FileResource> fileQueue, Path inputDir, IDBWriter dbWriter) {
- super(fileQueue, dbWriter);
- this.inputDir = inputDir;
- }
-
-
- @Override
- public boolean processFileResource(FileResource fileResource) {
- String relPath = fileResource.getMetadata().get(FSProperties.FS_REL_PATH);
- try (InputStream is = fileResource.openInputStream()) {
- try (TikaInputStream tis = TikaInputStream.get(is)) {
- Path path = tis.getPath();
- Map<Cols, String> data = new HashMap<>();
- int tikaMimeId = writer.getMimeId(detectTika(tis));
- String fileName = "";
- String extension = "";
- long length = -1;
- try {
- fileName = FilenameUtils.getName(relPath);
- } catch (IllegalArgumentException e) {
- LOG.warn("bad file name: "+relPath, e);
- }
-
- try {
- extension = FilenameUtils.getExtension(relPath);
- } catch (IllegalArgumentException e) {
- LOG.warn("bad extension: "+relPath, e);
- }
-
- try {
- length = Files.size(path);
- } catch (IOException e) {
- LOG.warn("problem getting size: "+relPath, e);
- }
-
- data.put(Cols.FILE_PATH, relPath);
- data.put(Cols.FILE_NAME, fileName);
- data.put(Cols.FILE_EXTENSION, extension);
- data.put(Cols.LENGTH, Long.toString(length));
- data.put(Cols.TIKA_MIME_ID, Integer.toString(tikaMimeId));
- data.put(Cols.SHA256, DigestUtils.sha256Hex(tis));
- if (HAS_FILE) {
- int fileMimeId = writer.getMimeId(detectFile(tis));
- data.put(Cols.FILE_MIME_ID, Integer.toString(fileMimeId));
- }
- writer.writeRow(FILE_PROFILES, data);
- }
-        } catch (IOException e) {
-            LOG.warn("problem processing: " + relPath, e);
-            return false;
- }
- return true;
- }
-
- private String detectFile(TikaInputStream tis) {
- try {
- return FILE_COMMAND_DETECTOR.detect(tis, new Metadata()).toString();
- } catch (IOException e) {
- return DETECT_EXCEPTION;
- }
- }
-
- private String detectTika(TikaInputStream tis) {
- try {
- return TIKA.detect(tis);
- } catch (IOException e) {
- return DETECT_EXCEPTION;
- }
- }
-}
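At its core, the deleted FileProfiler combines Tika mime detection with a SHA-256 over the raw bytes. A standalone sketch of that core, using only public Tika and commons-codec APIs and no tika-eval types:

    import java.io.InputStream;
    import java.nio.file.Files;
    import java.nio.file.Path;
    import java.nio.file.Paths;

    import org.apache.commons.codec.digest.DigestUtils;
    import org.apache.tika.Tika;

    public class FileProfileSketch {
        public static void main(String[] args) throws Exception {
            Path p = Paths.get(args[0]);
            Tika tika = new Tika();
            //detect from the file so the stream below is left untouched for digesting
            String mime = tika.detect(p.toFile());
            String sha256;
            try (InputStream is = Files.newInputStream(p)) {
                sha256 = DigestUtils.sha256Hex(is);
            }
            System.out.println(p + "\t" + mime + "\t" + Files.size(p) + "\t" + sha256);
        }
    }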
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/TikaEvalCLI.java b/tika-eval/src/main/java/org/apache/tika/eval/TikaEvalCLI.java
deleted file mode 100644
index d678c2b..0000000
--- a/tika-eval/src/main/java/org/apache/tika/eval/TikaEvalCLI.java
+++ /dev/null
@@ -1,326 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.eval;
-
-import java.io.InputStream;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.nio.file.StandardCopyOption;
-import java.sql.SQLException;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.List;
-
-import org.apache.commons.cli.CommandLine;
-import org.apache.commons.cli.DefaultParser;
-import org.apache.commons.cli.ParseException;
-import org.apache.tika.batch.fs.FSBatchProcessCLI;
-import org.apache.tika.eval.reports.ResultsReporter;
-import org.h2.tools.Console;
-
-public class TikaEvalCLI {
- static final String[] tools = {"Profile", "FileProfile",
- "Compare", "Report", "StartDB"};
-
- private static String specifyTools() {
- StringBuilder sb = new StringBuilder();
- sb.append("Must specify one of the following tools in the first parameter:\n");
- for (String s : tools) {
- sb.append(s+"\n");
- }
- return sb.toString();
-
- }
-
- private void execute(String[] args) throws Exception {
- String tool = args[0];
- String[] subsetArgs = new String[args.length-1];
- System.arraycopy(args, 1, subsetArgs, 0, args.length - 1);
- if (tool.equals("Report")) {
- handleReport(subsetArgs);
- } else if (tool.equals("Compare")) {
- handleCompare(subsetArgs);
- } else if (tool.equals("Profile")) {
- handleProfile(subsetArgs);
- } else if (tool.equals("StartDB")) {
- handleStartDB(subsetArgs);
- } else if (tool.equals("FileProfile")){
- handleProfileFiles(subsetArgs);
- } else {
- System.out.println(specifyTools());
- }
- }
-
- private void handleProfileFiles(String[] subsetArgs) throws Exception {
-        List<String> argList = new ArrayList<>(Arrays.asList(subsetArgs));
-
-        boolean containsBC = false;
-        //confirm there's a batch-config file
- for (int i = 0; i < argList.size(); i++) {
- String arg = argList.get(i);
- if (arg.equals("-bc")) {
- containsBC = true;
- }
- }
-
- Path tmpBCConfig = null;
- try {
- tmpBCConfig = Files.createTempFile("tika-eval-profiler", ".xml");
- if (! containsBC) {
- try (InputStream is = this.getClass().getResourceAsStream("/tika-eval-file-profiler-config.xml")) {
- Files.copy(is, tmpBCConfig, StandardCopyOption.REPLACE_EXISTING);
- }
- argList.add("-bc");
- argList.add(tmpBCConfig.toAbsolutePath().toString());
- }
-
- String[] updatedArgs = argList.toArray(new String[argList.size()]);
- DefaultParser defaultCLIParser = new DefaultParser();
- try {
- CommandLine commandLine = defaultCLIParser.parse(FileProfiler.OPTIONS, updatedArgs);
- if (commandLine.hasOption("db") && commandLine.hasOption("jdbc")) {
- System.out.println("Please specify either the default -db or the full -jdbc, not both");
-                    FileProfiler.USAGE();
- return;
- }
- } catch (ParseException e) {
- System.out.println(e.getMessage()+"\n");
- FileProfiler.USAGE();
- return;
- }
-
- FSBatchProcessCLI.main(updatedArgs);
- } finally {
- if (tmpBCConfig != null && Files.isRegularFile(tmpBCConfig)) {
- Files.delete(tmpBCConfig);
- }
- }
- }
-
- private void handleStartDB(String[] args) throws SQLException {
- List<String> argList = new ArrayList<>();
- argList.add("-web");
- Console.main(argList.toArray(new String[0]));
- while(true) {
- try {
- Thread.sleep(1000);
- } catch (InterruptedException e){
- break;
- }
- }
- }
-
- private void handleProfile(String[] subsetArgs) throws Exception {
-        List<String> argList = new ArrayList<>(Arrays.asList(subsetArgs));
-
- boolean containsBC = false;
- String inputDir = null;
- String extracts = null;
- String alterExtract = null;
- //confirm there's a batch-config file
- for (int i = 0; i < argList.size(); i++) {
- String arg = argList.get(i);
- if (arg.equals("-bc")) {
- containsBC = true;
- } else if (arg.equals("-inputDir")) {
- if (i+1 >= argList.size()) {
- System.err.println("Must specify directory after -inputDir");
- ExtractProfiler.USAGE();
- return;
- }
- inputDir = argList.get(i+1);
- i++;
- } else if (arg.equals("-extracts")) {
- if (i+1 >= argList.size()) {
- System.err.println("Must specify directory after -extracts");
- ExtractProfiler.USAGE();
- return;
- }
- extracts = argList.get(i+1);
- i++;
- } else if (arg.equals("-alterExtract")) {
- if (i+1 >= argList.size()) {
- System.err.println("Must specify type 'as_is', 'first_only' or " +
- "'concatenate_content' after -alterExtract");
-                    ExtractProfiler.USAGE();
- return;
- }
- alterExtract = argList.get(i+1);
- i++;
- }
- }
-
- if (alterExtract != null && !alterExtract.equals("as_is") &&
- !alterExtract.equals("concatenate_content") &&
- !alterExtract.equals("first_only")) {
- System.out.println("Sorry, I don't understand:"+alterExtract+
- ". The values must be one of: as_is, first_only, concatenate_content");
- ExtractProfiler.USAGE();
- return;
- }
-
- //need to specify each in this commandline
- //if only extracts is passed to tika-batch,
- //the crawler will see no inputDir and start crawling "input".
- //this allows the user to specify either extracts or inputDir
- if (extracts == null && inputDir != null) {
- argList.add("-extracts");
- argList.add(inputDir);
- } else if (inputDir == null && extracts != null) {
- argList.add("-inputDir");
- argList.add(extracts);
- }
-
- Path tmpBCConfig = null;
- try {
- tmpBCConfig = Files.createTempFile("tika-eval-profiler", ".xml");
- if (! containsBC) {
- try (InputStream is = this.getClass().getResourceAsStream("/tika-eval-profiler-config.xml")) {
- Files.copy(is, tmpBCConfig, StandardCopyOption.REPLACE_EXISTING);
- }
- argList.add("-bc");
- argList.add(tmpBCConfig.toAbsolutePath().toString());
- }
-
- String[] updatedArgs = argList.toArray(new String[0]);
- DefaultParser defaultCLIParser = new DefaultParser();
- try {
- CommandLine commandLine = defaultCLIParser.parse(ExtractProfiler.OPTIONS, updatedArgs);
- if (commandLine.hasOption("db") && commandLine.hasOption("jdbc")) {
- System.out.println("Please specify either the default -db or the full -jdbc, not both");
- ExtractProfiler.USAGE();
- return;
- }
- } catch (ParseException e) {
- System.out.println(e.getMessage()+"\n");
- ExtractProfiler.USAGE();
- return;
- }
-
- FSBatchProcessCLI.main(updatedArgs);
- } finally {
- if (tmpBCConfig != null && Files.isRegularFile(tmpBCConfig)) {
- Files.delete(tmpBCConfig);
- }
- }
- }
-
- private void handleCompare(String[] subsetArgs) throws Exception{
-        List<String> argList = new ArrayList<>(Arrays.asList(subsetArgs));
-
- boolean containsBC = false;
- String inputDir = null;
- String extractsA = null;
- String alterExtract = null;
- //confirm there's a batch-config file
- for (int i = 0; i < argList.size(); i++) {
- String arg = argList.get(i);
- if (arg.equals("-bc")) {
- containsBC = true;
- } else if (arg.equals("-inputDir")) {
- if (i+1 >= argList.size()) {
- System.err.println("Must specify directory after -inputDir");
- ExtractComparer.USAGE();
- return;
- }
- inputDir = argList.get(i+1);
- i++;
- } else if (arg.equals("-extractsA")) {
- if (i+1 >= argList.size()) {
- System.err.println("Must specify directory after -extractsA");
- ExtractComparer.USAGE();
- return;
- }
- extractsA = argList.get(i+1);
- i++;
- } else if (arg.equals("-alterExtract")) {
- if (i+1 >= argList.size()) {
- System.err.println("Must specify type 'as_is', 'first_only' or " +
- "'concatenate_content' after -alterExtract");
- ExtractComparer.USAGE();
- return;
- }
- alterExtract = argList.get(i+1);
- i++;
- }
- }
- if (alterExtract != null && !alterExtract.equals("as_is") &&
- !alterExtract.equals("concatenate_content") &&
- !alterExtract.equals("first_only")) {
- System.out.println("Sorry, I don't understand:"+alterExtract+
- ". The values must be one of: as_is, first_only, concatenate_content");
- ExtractComparer.USAGE();
- return;
- }
-
- //need to specify each in the commandline that goes into tika-batch
- //if only extracts is passed to tika-batch,
- //the crawler will see no inputDir and start crawling "input".
- //if the user doesn't specify inputDir, crawl extractsA
- if (inputDir == null && extractsA != null) {
- argList.add("-inputDir");
- argList.add(extractsA);
- }
-
- Path tmpBCConfig = null;
- try {
- tmpBCConfig = Files.createTempFile("tika-eval", ".xml");
- if (! containsBC) {
- try (InputStream is = this.getClass().getResourceAsStream("/tika-eval-comparison-config.xml")) {
- Files.copy(is, tmpBCConfig, StandardCopyOption.REPLACE_EXISTING);
- }
- argList.add("-bc");
- argList.add(tmpBCConfig.toAbsolutePath().toString());
-
- }
- String[] updatedArgs = argList.toArray(new String[0]);
- DefaultParser defaultCLIParser = new DefaultParser();
- try {
- CommandLine commandLine = defaultCLIParser.parse(ExtractComparer.OPTIONS, updatedArgs);
- if (commandLine.hasOption("db") && commandLine.hasOption("jdbc")) {
- System.out.println("Please specify either the default -db or the full -jdbc, not both");
- ExtractComparer.USAGE();
- return;
- }
- } catch (ParseException e) {
- System.out.println(e.getMessage()+"\n");
- ExtractComparer.USAGE();
- return;
- }
-
- FSBatchProcessCLI.main(updatedArgs);
- } finally {
- if (tmpBCConfig != null && Files.isRegularFile(tmpBCConfig)) {
- Files.delete(tmpBCConfig);
- }
- }
- }
-
- private void handleReport(String[] subsetArgs) throws Exception {
- ResultsReporter.main(subsetArgs);
- }
-
- public static void main(String[] args) throws Exception {
- TikaEvalCLI cli = new TikaEvalCLI();
- if (args.length == 0) {
- System.err.println(specifyTools());
- return;
- }
- cli.execute(args);
- }
-}
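Since TikaEvalCLI.main dispatches on the first argument, the tools can also be invoked programmatically. A small sketch, assuming tika-eval and its dependencies are on the classpath; the directory and db names are illustrative:

    public class RunCompareSketch {
        public static void main(String[] args) throws Exception {
            //equivalent to: java -jar tika-eval-x.y.jar Compare -extractsA ... -extractsB ... -db ...
            org.apache.tika.eval.TikaEvalCLI.main(new String[]{
                    "Compare", "-extractsA", "extractsA", "-extractsB", "extractsB",
                    "-db", "mydb"
            });
        }
    }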
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/XMLErrorLogUpdater.java b/tika-eval/src/main/java/org/apache/tika/eval/XMLErrorLogUpdater.java
deleted file mode 100644
index 9ce25c2..0000000
--- a/tika-eval/src/main/java/org/apache/tika/eval/XMLErrorLogUpdater.java
+++ /dev/null
@@ -1,223 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.eval;
-
-
-import javax.xml.stream.XMLInputFactory;
-import javax.xml.stream.XMLStreamConstants;
-import javax.xml.stream.XMLStreamException;
-import javax.xml.stream.XMLStreamReader;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.StringReader;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.nio.file.Paths;
-import java.sql.Connection;
-import java.sql.ResultSet;
-import java.sql.SQLException;
-import java.sql.Statement;
-
-import org.apache.commons.io.IOExceptionWithCause;
-import org.apache.log4j.Level;
-import org.apache.tika.eval.db.Cols;
-import org.apache.tika.eval.db.H2Util;
-import org.apache.tika.eval.db.JDBCUtil;
-import org.apache.tika.eval.db.TableInfo;
-import org.apache.tika.eval.io.XMLLogMsgHandler;
-import org.apache.tika.eval.io.XMLLogReader;
-import org.apache.tika.eval.reports.ResultsReporter;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-/**
- * This is a very task-specific class that reads a log file and updates
- * the "comparisons" table. It should not be run in a multithreaded environment.
- */
-public class XMLErrorLogUpdater {
-    private static final Logger LOG = LoggerFactory.getLogger(XMLErrorLogUpdater.class);
-
- private Statement statement;
-
- public static void main(String[] args) throws Exception {
- XMLErrorLogUpdater writer = new XMLErrorLogUpdater();
- Path xmlLogFileA = Paths.get(args[0]);
- Path xmlLogFileB = Paths.get(args[1]);
- Path db = Paths.get(args[2]);
- JDBCUtil dbUtil = new H2Util(db);
- Connection connection = dbUtil.getConnection();
- writer.update(connection, ExtractComparer.EXTRACT_EXCEPTION_TABLE_A, xmlLogFileA);
- writer.update(connection, ExtractComparer.EXTRACT_EXCEPTION_TABLE_B, xmlLogFileB);
- connection.commit();
- }
-
- public void update(Connection connection, TableInfo tableInfo, Path xmlLogFile) throws Exception {
- statement = connection.createStatement();
- XMLLogReader reader = new XMLLogReader();
- try (InputStream is = Files.newInputStream(xmlLogFile)) {
- reader.read(is, new ErrorMsgUpdater(tableInfo.getName()));
- } catch (IOException e) {
- throw new RuntimeException("Problem reading: "+xmlLogFile.toAbsolutePath().toString());
- } finally {
- try {
- connection.commit();
- statement.close();
- } catch (SQLException e) {
- throw new RuntimeException("Failed to close db connection!", e);
- }
- }
- }
-
- private class ErrorMsgUpdater implements XMLLogMsgHandler {
- private final String errorTablename;
-
- private ErrorMsgUpdater(String errorTablename) {
- this.errorTablename = errorTablename;
- }
-
- @Override
- public void handleMsg(Level level, String xml) throws SQLException, IOException {
- if (! level.equals(Level.ERROR)) {
- return;
- }
- XMLStreamReader reader = null;
- try {
- reader = XMLInputFactory.newInstance().createXMLStreamReader(new StringReader(xml));
- } catch (XMLStreamException e) {
- throw new IOExceptionWithCause(e);
- }
-            String resourceId = null;
-            try {
-                while (reader.hasNext() && resourceId == null) {
- reader.next();
- switch (reader.getEventType()) {
- case XMLStreamConstants.START_ELEMENT:
- if ("timed_out".equals(reader.getLocalName())) {
- resourceId = reader.getAttributeValue("", "resourceId");
- update(errorTablename, resourceId,
- AbstractProfiler.PARSE_ERROR_TYPE.TIMEOUT);
-
- } else if ("oom".equals(reader.getLocalName())) {
- resourceId = reader.getAttributeValue("", "resourceId");
- update(errorTablename, resourceId, AbstractProfiler.PARSE_ERROR_TYPE.OOM);
- }
- break;
- }
- }
- reader.close();
- } catch (XMLStreamException e) {
- throw new IOExceptionWithCause(e);
- }
- }
-
- private void update(String errorTableName,
- String filePath, AbstractProfiler.PARSE_ERROR_TYPE type) throws SQLException {
- int containerId = getContainerId(filePath);
- String sql = "SELECT count(1) from "+errorTableName +
- " where "+Cols.CONTAINER_ID +
- " = "+containerId + " or "+
- Cols.FILE_PATH + "='"+filePath+"'";
- ResultSet rs = statement.executeQuery(sql);
-
- //now try to figure out if that file already exists
- //in parse errors
- int hitCount = 0;
- while (rs.next()) {
- hitCount = rs.getInt(1);
- }
-
- //if it does, update all records matching that path or container id
- if (hitCount > 0) {
- sql = "UPDATE " + errorTableName +
- " SET " + Cols.PARSE_ERROR_ID +
- " = " + type.ordinal() + ","+
- Cols.FILE_PATH + "='" +filePath+"'"+
- " where "+Cols.CONTAINER_ID +
- "="+containerId + " or "+
-                        Cols.FILE_PATH + "='"+filePath+"'";
-
- } else {
- //if not and container id > -1
- //insert full record
- if (containerId > -1) {
- sql = "INSERT INTO " + errorTableName +
- " ("+Cols.CONTAINER_ID+","+Cols.FILE_PATH +","+Cols.PARSE_ERROR_ID +")"+
- " values (" + containerId + ", '" + filePath + "'," +
- type.ordinal() + ");";
- } else {
- //if container id == -1, insert only file path and parse error type id
- sql = "INSERT INTO " + errorTableName +
- " ("+Cols.FILE_PATH.name()+","+Cols.PARSE_ERROR_ID +")"+
- "values ('" + filePath + "'," +
- type.ordinal() + ");";
- }
-
- }
- int updated = statement.executeUpdate(sql);
- if (updated == 0) {
-                LOG.warn("made no updates in XMLErrorLogUpdater!");
- } else if (updated > 1) {
- LOG.warn("made too many updates");
- }
- }
-
- private int getContainerId(String resourceId) throws SQLException {
- int containerId = -1;
- String sql = "SELECT " + Cols.CONTAINER_ID.name() +
- " from " + ExtractProfiler.CONTAINER_TABLE.getName()+
- " where " + Cols.FILE_PATH +
- " ='"+resourceId+"'";
- ResultSet rs = statement.executeQuery(sql);
- int resultCount = 0;
- while (rs.next()) {
- containerId = rs.getInt(1);
- resultCount++;
- }
- rs.close();
-
- if (resultCount == 0) {
- LOG.warn("Should have found a container for: {}", resourceId);
- } else if (resultCount > 1) {
- LOG.error("Records ids should be unique: {}", resourceId);
- }
-/*
- if (containerId < 0) {
- System.err.println("CONTAINER ID < 0!!!");
- sql = "SELECT MAX("+ Cols.CONTAINER_ID.name() +
- ") from "+ExtractProfiler.CONTAINER_TABLE.getName();
- rs = statement.executeQuery(sql);
- while (rs.next()) {
- containerId = rs.getInt(1);
- }
- rs.close();
- if (containerId < 0) {
- //log and abort
- //return -1?
- } else {
- containerId++;
- }
-
- }*/
- return containerId;
- }
-
-
- }
-
-}
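One caveat on the class above: it interpolates filePath straight into its SQL, which breaks on paths containing quotes and is unsafe in general. A hedged sketch of the same insert using a PreparedStatement instead; the table name must still be concatenated, since identifiers cannot be bound, and the column names follow the Cols values used above:

    import java.sql.Connection;
    import java.sql.PreparedStatement;

    public class SafeInsertSketch {
        static void insertParseError(Connection conn, String table, int containerId,
                                     String filePath, int parseErrorId) throws Exception {
            //only the values are bound; the table name is trusted, fixed input
            String sql = "INSERT INTO " + table +
                    " (CONTAINER_ID, FILE_PATH, PARSE_ERROR_ID) VALUES (?, ?, ?)";
            try (PreparedStatement ps = conn.prepareStatement(sql)) {
                ps.setInt(1, containerId);
                ps.setString(2, filePath);
                ps.setInt(3, parseErrorId);
                ps.executeUpdate();
            }
        }
    }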
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/batch/DBConsumersManager.java b/tika-eval/src/main/java/org/apache/tika/eval/batch/DBConsumersManager.java
deleted file mode 100644
index 2e3dfca..0000000
--- a/tika-eval/src/main/java/org/apache/tika/eval/batch/DBConsumersManager.java
+++ /dev/null
@@ -1,103 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.eval.batch;
-
-import java.io.IOException;
-import java.nio.file.Path;
-import java.sql.Connection;
-import java.sql.SQLException;
-import java.util.ArrayList;
-import java.util.List;
-
-import org.apache.tika.batch.ConsumersManager;
-import org.apache.tika.batch.FileResourceConsumer;
-import org.apache.tika.eval.AbstractProfiler;
-import org.apache.tika.eval.XMLErrorLogUpdater;
-import org.apache.tika.eval.db.JDBCUtil;
-import org.apache.tika.eval.db.MimeBuffer;
-import org.apache.tika.eval.db.TableInfo;
-
-
-public class DBConsumersManager extends ConsumersManager {
-
- private final Connection conn;
- private final MimeBuffer mimeBuffer;
- private final List<LogTablePair> errorLogs = new ArrayList<>();
-
- public DBConsumersManager(JDBCUtil dbUtil, MimeBuffer mimeBuffer, List<FileResourceConsumer> consumers)
- throws SQLException {
- super(consumers);
- this.conn = dbUtil.getConnection();
- this.mimeBuffer = mimeBuffer;
- }
-
-
- @Override
- public void shutdown() {
-
- for (FileResourceConsumer consumer : getConsumers()) {
- if (consumer instanceof AbstractProfiler) {
- try{
- ((AbstractProfiler)consumer).closeWriter();
- } catch (IOException e) {
- throw new RuntimeException(e);
- }
- }
- }
-
- try {
- mimeBuffer.close();
- } catch (SQLException e) {
- throw new RuntimeException(e);
- }
-
- //MUST HAPPEN AFTER consumers have closed and
- //committed container information!!!
- XMLErrorLogUpdater up = new XMLErrorLogUpdater();
- for (LogTablePair p : errorLogs) {
- try {
- up.update(conn, p.tableInfo, p.log);
- } catch (Exception e) {
- throw new RuntimeException(e);
- }
- }
-
-
- try {
- conn.commit();
- } catch (SQLException e) {
- throw new RuntimeException(e);
- }
- try {
- conn.close();
- } catch (SQLException e) {
- throw new RuntimeException(e);
- }
- }
-
- public void addErrorLogTablePair(Path log, TableInfo tableInfo) {
- LogTablePair p = new LogTablePair();
- p.log = log;
- p.tableInfo = tableInfo;
- errorLogs.add(p);
- }
-
- class LogTablePair {
- Path log;
- TableInfo tableInfo;
- }
-}
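For context, addErrorLogTablePair is how the post-run log sweep gets wired: each registered log is replayed into its table by XMLErrorLogUpdater during shutdown. A usage sketch; the log path here is illustrative only:

    import java.nio.file.Paths;

    import org.apache.tika.eval.ExtractComparer;
    import org.apache.tika.eval.batch.DBConsumersManager;

    public class WireLogsSketch {
        static void wire(DBConsumersManager manager) {
            //swept into the table when the manager shuts down
            manager.addErrorLogTablePair(Paths.get("logs/batch-fatal-a.xml"),
                    ExtractComparer.EXTRACT_EXCEPTION_TABLE_A);
        }
    }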
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/batch/EvalConsumerBuilder.java b/tika-eval/src/main/java/org/apache/tika/eval/batch/EvalConsumerBuilder.java
deleted file mode 100644
index 9953973..0000000
--- a/tika-eval/src/main/java/org/apache/tika/eval/batch/EvalConsumerBuilder.java
+++ /dev/null
@@ -1,240 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.tika.eval.batch;
-
-import java.io.IOException;
-import java.sql.Connection;
-import java.sql.ResultSet;
-import java.sql.SQLException;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-import java.util.concurrent.ArrayBlockingQueue;
-import java.util.concurrent.atomic.AtomicInteger;
-
-import org.apache.tika.batch.FileResource;
-import org.apache.tika.batch.FileResourceConsumer;
-import org.apache.tika.config.TikaConfig;
-import org.apache.tika.eval.AbstractProfiler;
-import org.apache.tika.eval.db.Cols;
-import org.apache.tika.eval.db.JDBCUtil;
-import org.apache.tika.eval.db.MimeBuffer;
-import org.apache.tika.eval.db.TableInfo;
-import org.apache.tika.eval.io.DBWriter;
-import org.apache.tika.eval.io.ExtractReader;
-import org.apache.tika.eval.io.ExtractReaderException;
-import org.apache.tika.eval.io.IDBWriter;
-import org.apache.tika.util.PropsUtil;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-public abstract class EvalConsumerBuilder {
-
- private static final Logger LOG = LoggerFactory.getLogger(EvalConsumerBuilder.class);
-
- protected ArrayBlockingQueue<FileResource> queue;
- Map<String, String> localAttrs;
- JDBCUtil dbUtil;
- private MimeBuffer mimeBuffer;
- AtomicInteger initialized = new AtomicInteger(0);
-
- public MimeBuffer init(ArrayBlockingQueue<FileResource> queue, Map<String, String> localAttrs,
- JDBCUtil dbUtil, boolean forceDrop) throws IOException, SQLException {
- if (initialized.getAndIncrement() > 0) {
- throw new RuntimeException("Can only init a consumer builder once!");
- }
- this.queue = queue;
- this.localAttrs = localAttrs;
- this.dbUtil = dbUtil;
- //the order of the following is critical
- //step 1. update the table names with prefixes
- updateTableInfosWithPrefixes(localAttrs);
-
- JDBCUtil.CREATE_TABLE createRegularTable = (forceDrop) ? JDBCUtil.CREATE_TABLE.DROP_IF_EXISTS :
- JDBCUtil.CREATE_TABLE.THROW_EX_IF_EXISTS;
-
- JDBCUtil.CREATE_TABLE createRefTable = (forceDrop) ? JDBCUtil.CREATE_TABLE.DROP_IF_EXISTS :
- JDBCUtil.CREATE_TABLE.SKIP_IF_EXISTS;
-
- //step 2. create the tables
- dbUtil.createTables(getNonRefTableInfos(), createRegularTable);
- dbUtil.createTables(getRefTableInfos(), createRefTable);
-
- //step 3. create mime buffer
- this.mimeBuffer = new MimeBuffer(dbUtil.getConnection(), getMimeTable(),
- TikaConfig.getDefaultConfig());
-
- //step 4. populate the reference tables
- populateRefTables();
-
- return mimeBuffer;
- }
-
- public abstract FileResourceConsumer build() throws IOException, SQLException;
-
- protected abstract void updateTableInfosWithPrefixes(Map<String, String> attrs);
-
- /**
- *
- * @return only the ref tables
- */
- protected abstract List<TableInfo> getRefTableInfos();
-
- /**
- *
- * @return the main tables, not including the ref tables
- */
- protected abstract List<TableInfo> getNonRefTableInfos();
-
- protected abstract TableInfo getMimeTable();
-
- protected abstract void addErrorLogTablePairs(DBConsumersManager manager);
-
- public void populateRefTables() throws IOException, SQLException {
- boolean refTablesPopulated = true;
- try{
- Connection connection = dbUtil.getConnection();
- for (TableInfo tableInfo : getRefTableInfos()) {
- int rows = 0;
- try (ResultSet rs = connection.createStatement().executeQuery("select * from "+
- tableInfo.getName())) {
- while (rs.next()) {
- rows++;
- }
- }
- if (rows == 0) {
- refTablesPopulated = false;
- break;
- }
-
- }
- } catch (SQLException e) {
- //swallow
- }
- if (refTablesPopulated) {
- LOG.info("ref tables are already populated");
- return;
- }
-
- IDBWriter writer = getDBWriter(getRefTableInfos());
- Map<Cols, String> m = new HashMap<>();
- for (AbstractProfiler.PARSE_ERROR_TYPE t : AbstractProfiler.PARSE_ERROR_TYPE.values()) {
- m.clear();
- m.put(Cols.PARSE_ERROR_ID, Integer.toString(t.ordinal()));
- m.put(Cols.PARSE_ERROR_DESCRIPTION, t.name());
- writer.writeRow(AbstractProfiler.REF_PARSE_ERROR_TYPES, m);
- }
-
- for (AbstractProfiler.EXCEPTION_TYPE t : AbstractProfiler.EXCEPTION_TYPE.values()) {
- m.clear();
- m.put(Cols.PARSE_EXCEPTION_ID, Integer.toString(t.ordinal()));
- m.put(Cols.PARSE_EXCEPTION_DESCRIPTION, t.name());
- writer.writeRow(AbstractProfiler.REF_PARSE_EXCEPTION_TYPES, m);
- }
-
- for (ExtractReaderException.TYPE t :
- ExtractReaderException.TYPE.values()) {
- m.clear();
- m.put(Cols.EXTRACT_EXCEPTION_ID, Integer.toString(t.ordinal()));
- m.put(Cols.EXTRACT_EXCEPTION_DESCRIPTION, t.name());
- writer.writeRow(AbstractProfiler.REF_EXTRACT_EXCEPTION_TYPES, m);
- }
- writer.close();
- }
-
- protected IDBWriter getDBWriter(List<TableInfo> tableInfos) throws IOException, SQLException {
- Connection conn = dbUtil.getConnection();
- return new DBWriter(conn, tableInfos, dbUtil, mimeBuffer);
- }
-
- ExtractReader.ALTER_METADATA_LIST getAlterMetadata(Map<String, String> localAttrs) {
-
- String alterExtractString = localAttrs.get("alterExtract");
- ExtractReader.ALTER_METADATA_LIST alterExtractList;
- if (alterExtractString == null || alterExtractString.equalsIgnoreCase("as_is")) {
- alterExtractList = ExtractReader.ALTER_METADATA_LIST.AS_IS;
- } else if (alterExtractString.equalsIgnoreCase("first_only")) {
- alterExtractList = ExtractReader.ALTER_METADATA_LIST.FIRST_ONLY;
- } else if (alterExtractString.equalsIgnoreCase("concatenate_content")) {
- alterExtractList = ExtractReader.ALTER_METADATA_LIST.CONCATENATE_CONTENT_INTO_FIRST;
- } else {
- throw new RuntimeException("options for alterExtract: as_is, first_only, concatenate_content." +
- " I don't understand:" + alterExtractString);
- }
- return alterExtractList;
- }
-
- protected ExtractReader buildExtractReader(Map<String, String> localAttrs) {
- long minExtractLength = PropsUtil.getLong(localAttrs.get("minExtractLength"), -1L);
- long maxExtractLength = PropsUtil.getLong(localAttrs.get("maxExtractLength"), -1L);
-
- ExtractReader.ALTER_METADATA_LIST alterExtractList = getAlterMetadata(localAttrs);
- return new ExtractReader(alterExtractList, minExtractLength, maxExtractLength);
- }
-
- FileResourceConsumer parameterizeProfiler(AbstractProfiler abstractProfiler) {
-
- int maxContentLength = PropsUtil.getInt(localAttrs.get("maxContentLength"), -2);
- if (maxContentLength > -2) {
- abstractProfiler.setMaxContentLength(maxContentLength);
- }
-
- int maxContentLengthForLangId = PropsUtil.getInt(localAttrs.get("maxContentLengthForLangId"), -2);
- if (maxContentLengthForLangId > -2) {
- abstractProfiler.setMaxContentLengthForLangId(maxContentLengthForLangId);
- }
-
- int maxTokens = PropsUtil.getInt(localAttrs.get("maxTokens"), -2);
- if (maxTokens > -2) {
- abstractProfiler.setMaxTokens(maxTokens);
- }
-
-
- return abstractProfiler;
- }
-
-
-/*
- public abstract Map<String, String> getIndexInfo();
-
- class ValueComparator implements Comparator<String> {
-
- Map<String, ColInfo> map;
-
- public ValueComparator(Map<String, ColInfo> base) {
- this.map = base;
- }
-
- public int compare(String a, String b) {
- Integer aVal = map.get(a).getDBColOffset();
- Integer bVal = map.get(b).getDBColOffset();
- if (aVal == null || bVal == null) {
- throw new IllegalArgumentException("Column offset must be specified!");
- }
- if (aVal == bVal && ! map.get(a).equals(map.get(b))) {
- throw new IllegalArgumentException("Column offsets must be unique: " + a + " and " + b + " both have: "+aVal);
- }
- if (aVal < bVal) {
- return -1;
- } else {
- return 1;
- }
- }
- }
-*/
-}
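
The population check in populateRefTables() above walks every row of each ref
table just to count them; an aggregate query returns the same answer in one
round trip. A minimal alternative sketch (java.sql.Statement import assumed):

    private static boolean isPopulated(Connection connection, String tableName)
            throws SQLException {
        try (Statement st = connection.createStatement();
             ResultSet rs = st.executeQuery("select count(*) from " + tableName)) {
            return rs.next() && rs.getLong(1) > 0;
        }
    }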
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/batch/EvalConsumersBuilder.java b/tika-eval/src/main/java/org/apache/tika/eval/batch/EvalConsumersBuilder.java
deleted file mode 100644
index 62cf348..0000000
--- a/tika-eval/src/main/java/org/apache/tika/eval/batch/EvalConsumersBuilder.java
+++ /dev/null
@@ -1,131 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.eval.batch;
-
-
-import java.io.IOException;
-import java.nio.file.Path;
-import java.nio.file.Paths;
-import java.sql.SQLException;
-import java.util.LinkedList;
-import java.util.List;
-import java.util.Map;
-import java.util.concurrent.ArrayBlockingQueue;
-
-import org.apache.tika.batch.ConsumersManager;
-import org.apache.tika.batch.FileResource;
-import org.apache.tika.batch.FileResourceConsumer;
-import org.apache.tika.batch.builders.AbstractConsumersBuilder;
-import org.apache.tika.batch.builders.BatchProcessBuilder;
-import org.apache.tika.eval.AbstractProfiler;
-import org.apache.tika.eval.db.H2Util;
-import org.apache.tika.eval.db.JDBCUtil;
-import org.apache.tika.eval.db.MimeBuffer;
-import org.apache.tika.eval.langid.LanguageIDWrapper;
-import org.apache.tika.util.ClassLoaderUtil;
-import org.apache.tika.util.PropsUtil;
-import org.apache.tika.util.XMLDOMUtil;
-import org.w3c.dom.Node;
-
-public class EvalConsumersBuilder extends AbstractConsumersBuilder {
-
- @Override
- public ConsumersManager build(Node node, Map<String, String> runtimeAttributes,
- ArrayBlockingQueue<FileResource> queue) {
-
- List<FileResourceConsumer> consumers = new LinkedList<>();
- int numConsumers = BatchProcessBuilder.getNumConsumers(runtimeAttributes);
-
- Map<String, String> localAttrs = XMLDOMUtil.mapifyAttrs(node, runtimeAttributes);
-
-
- Path db = getPath(localAttrs, "db");
- String jdbcConnectionString = localAttrs.get("jdbc");
-
-
- Path commonTokens = getPath(localAttrs, "commonTokens");
- String defaultLangCode = localAttrs.get("defaultLangCode");
- if (defaultLangCode == null) {
- defaultLangCode = "";
- }
- //can be null, in which case will load from memory
- try {
- AbstractProfiler.loadCommonTokens(commonTokens, defaultLangCode);
- } catch (IOException e) {
- throw new RuntimeException(e);
- }
-
- JDBCUtil jdbcUtil = null;
- if (db != null) {
- jdbcUtil = new H2Util(db);
- } else if (jdbcConnectionString != null) {
- jdbcUtil = new JDBCUtil(jdbcConnectionString, localAttrs.get("jdbcDriver"));
- } else {
- throw new RuntimeException("Must specify: -db or -jdbc");
- }
- EvalConsumerBuilder consumerBuilder = ClassLoaderUtil.buildClass(EvalConsumerBuilder.class,
- PropsUtil.getString(localAttrs.get("consumerBuilderClass"), null));
- if (consumerBuilder == null) {
- throw new RuntimeException("Must specify consumerBuilderClass in config file");
- }
-
- boolean forceDrop = PropsUtil.getBoolean(localAttrs.get("drop"), false);
- MimeBuffer mimeBuffer = null;
- try {
- mimeBuffer = consumerBuilder.init(queue, localAttrs, jdbcUtil, forceDrop);
- } catch (IOException | SQLException e) {
- throw new RuntimeException(e);
- }
-
- for (int i = 0; i < numConsumers; i++) {
- try {
- consumers.add(consumerBuilder.build());
- } catch (IOException | SQLException e) {
- throw new RuntimeException(e);
- }
- }
-
- DBConsumersManager manager;
- try {
- manager = new DBConsumersManager(jdbcUtil, mimeBuffer, consumers);
- } catch (SQLException e) {
- throw new RuntimeException(e);
- }
- consumerBuilder.addErrorLogTablePairs(manager);
-
- return manager;
- }
-
- private Path getNonNullPath(Map<String, String> attrs, String key) {
- Path p = getPath(attrs, key);
- if (p == null) {
- throw new RuntimeException("Must specify a file for this attribute: "+key);
- }
- return p;
- }
-
-
- protected Path getPath(Map<String, String> attrs, String key) {
- String filePath = attrs.get(key);
- if (filePath == null) {
- return null;
- }
- return Paths.get(filePath);
- }
-
-
-}
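
For reference, the two database routes the builder above can take, sketched
with hypothetical locations (the PostgreSQL URL and driver are illustrative
only, not part of this code):

    JDBCUtil viaDb = new H2Util(Paths.get("/data/eval/profiles"));     // -db
    JDBCUtil viaJdbc = new JDBCUtil(
            "jdbc:postgresql://localhost/tika_eval",                   // -jdbc
            "org.postgresql.Driver");                                  // -jdbcDriver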
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/batch/ExtractComparerBuilder.java b/tika-eval/src/main/java/org/apache/tika/eval/batch/ExtractComparerBuilder.java
deleted file mode 100644
index 0ae893f..0000000
--- a/tika-eval/src/main/java/org/apache/tika/eval/batch/ExtractComparerBuilder.java
+++ /dev/null
@@ -1,165 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.eval.batch;
-
-
-import java.io.IOException;
-import java.nio.file.Path;
-import java.sql.SQLException;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.List;
-import java.util.Map;
-
-import org.apache.tika.batch.FileResourceConsumer;
-import org.apache.tika.eval.AbstractProfiler;
-import org.apache.tika.eval.ExtractComparer;
-import org.apache.tika.eval.db.TableInfo;
-import org.apache.tika.util.PropsUtil;
-
-public class ExtractComparerBuilder extends EvalConsumerBuilder {
- public final static String TABLE_PREFIX_A_KEY = "tablePrefixA";
- public final static String TABLE_PREFIX_B_KEY = "tablePrefixB";
-
- private final List<TableInfo> tableInfosA;
- private final List<TableInfo> tableInfosB;
- private final List<TableInfo> tableInfosAandB;
- private final List<TableInfo> refTableInfos;
-
- public ExtractComparerBuilder() {
- List<TableInfo> tableInfosA = new ArrayList<>();
- List<TableInfo> tableInfosB = new ArrayList<>();
- List<TableInfo> tableInfosAandB = new ArrayList<>();
- tableInfosA.add(ExtractComparer.PROFILES_A);
- tableInfosA.add(ExtractComparer.EXCEPTION_TABLE_A);
- tableInfosA.add(ExtractComparer.TAGS_TABLE_A);
- tableInfosA.add(ExtractComparer.CONTENTS_TABLE_A);
- tableInfosA.add(ExtractComparer.EXTRACT_EXCEPTION_TABLE_A);
- tableInfosA.add(ExtractComparer.EMBEDDED_FILE_PATH_TABLE_A);
-
- tableInfosB.add(ExtractComparer.PROFILES_B);
- tableInfosB.add(ExtractComparer.EXCEPTION_TABLE_B);
- tableInfosB.add(ExtractComparer.EXTRACT_EXCEPTION_TABLE_B);
- tableInfosB.add(ExtractComparer.TAGS_TABLE_B);
- tableInfosB.add(ExtractComparer.CONTENTS_TABLE_B);
- tableInfosB.add(ExtractComparer.EMBEDDED_FILE_PATH_TABLE_B);
-
- tableInfosAandB.add(ExtractComparer.COMPARISON_CONTAINERS);
- tableInfosAandB.add(ExtractComparer.CONTENT_COMPARISONS);
- tableInfosAandB.add(AbstractProfiler.MIME_TABLE);
-
- List<TableInfo> refTableInfos = new ArrayList<>();
- refTableInfos.add(ExtractComparer.REF_PAIR_NAMES);
- refTableInfos.add(AbstractProfiler.REF_PARSE_ERROR_TYPES);
- refTableInfos.add(AbstractProfiler.REF_PARSE_EXCEPTION_TYPES);
- refTableInfos.add(AbstractProfiler.REF_EXTRACT_EXCEPTION_TYPES);
-
- this.tableInfosA = Collections.unmodifiableList(tableInfosA);
- this.tableInfosB = Collections.unmodifiableList(tableInfosB);
- this.tableInfosAandB = Collections.unmodifiableList(tableInfosAandB);
- this.refTableInfos = Collections.unmodifiableList(refTableInfos);
- }
-
- @Override
- public FileResourceConsumer build() throws IOException, SQLException {
- Path extractsA = PropsUtil.getPath(localAttrs.get("extractsA"), null);
- if (extractsA == null) {
- throw new RuntimeException("Must specify \"extractsA\" -- directory for 'A' extracts");
- }
- Path extractsB = PropsUtil.getPath(localAttrs.get("extractsB"), null);
- if (extractsB == null) {
- throw new RuntimeException("Must specify \"extractsB\" -- directory for 'B' extracts");
- }
-
- Path inputRootDir = PropsUtil.getPath(localAttrs.get("inputDir"), null);
-
- if (inputRootDir == null) {
- //this is for the sake of the crawler
- throw new RuntimeException("Must specify an -inputDir");
- }
-
- return parameterizeProfiler(new ExtractComparer(queue, inputRootDir, extractsA, extractsB,
- buildExtractReader(localAttrs),
- getDBWriter(getNonRefTableInfos())));
- }
-
-
- @Override
- protected void updateTableInfosWithPrefixes(Map<String, String> attrs) {
- String tablePrefixA = localAttrs.get(TABLE_PREFIX_A_KEY);
-
- String tablePrefixB = localAttrs.get(TABLE_PREFIX_B_KEY);
-
- tablePrefixA = (tablePrefixA == null || tablePrefixA.endsWith("_")) ? tablePrefixA : tablePrefixA+"_";
- tablePrefixB = (tablePrefixB == null || tablePrefixB.endsWith("_")) ? tablePrefixB : tablePrefixB+"_";
-
- if (tablePrefixA != null) {
- for (TableInfo tableInfo : tableInfosA) {
- tableInfo.setNamePrefix(tablePrefixA);
- }
- }
-
- if (tablePrefixB != null) {
- for (TableInfo tableInfo : tableInfosB) {
- tableInfo.setNamePrefix(tablePrefixB);
- }
- }
-
- if (tablePrefixA != null || tablePrefixB != null) {
- String aAndB = (tablePrefixA == null) ? "" : tablePrefixA;
- aAndB = (tablePrefixB == null) ? aAndB : aAndB+tablePrefixB;
- for (TableInfo tableInfo : tableInfosAandB) {
- tableInfo.setNamePrefix(aAndB);
- }
- }
- }
-
- @Override
- protected List<TableInfo> getRefTableInfos() {
- return refTableInfos;
- }
-
- @Override
- protected List<TableInfo> getNonRefTableInfos() {
- List<TableInfo> allNonRefTables = new ArrayList<>();
- allNonRefTables.addAll(tableInfosA);
- allNonRefTables.addAll(tableInfosB);
- allNonRefTables.addAll(tableInfosAandB);
- return Collections.unmodifiableList(allNonRefTables);
- }
-
- @Override
- protected TableInfo getMimeTable() {
- return AbstractProfiler.MIME_TABLE;
- }
-
- @Override
- protected void addErrorLogTablePairs(DBConsumersManager manager) {
- Path errorLogA = PropsUtil.getPath(localAttrs.get("errorLogFileA"), null);
- if (errorLogA == null) {
- return;
- }
- manager.addErrorLogTablePair(errorLogA, ExtractComparer.EXTRACT_EXCEPTION_TABLE_A);
- Path errorLogB = PropsUtil.getPath(localAttrs.get("errorLogFileB"), null);
- if (errorLogB == null) {
- return;
- }
- manager.addErrorLogTablePair(errorLogB, ExtractComparer.EXTRACT_EXCEPTION_TABLE_B);
-
- }
-
-}
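
The prefix handling in updateTableInfosWithPrefixes() reduces to one
normalization rule per prefix, plus concatenation for the shared A-and-B
tables. In isolation, with hypothetical prefixes:

    String normalize(String prefix) {
        return (prefix == null || prefix.endsWith("_")) ? prefix : prefix + "_";
    }
    // normalize("run1")  -> "run1_"
    // normalize("run2_") -> "run2_"
    // normalize(null)    -> null
    // shared tables get both prefixes: "run1_" + "run2_" -> "run1_run2_"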
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/batch/ExtractProfilerBuilder.java b/tika-eval/src/main/java/org/apache/tika/eval/batch/ExtractProfilerBuilder.java
deleted file mode 100644
index 20efbf7..0000000
--- a/tika-eval/src/main/java/org/apache/tika/eval/batch/ExtractProfilerBuilder.java
+++ /dev/null
@@ -1,122 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.eval.batch;
-
-import java.io.IOException;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.sql.SQLException;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.List;
-import java.util.Map;
-
-import org.apache.tika.batch.FileResourceConsumer;
-import org.apache.tika.eval.AbstractProfiler;
-import org.apache.tika.eval.ExtractProfiler;
-import org.apache.tika.eval.db.TableInfo;
-import org.apache.tika.util.PropsUtil;
-
-
-public class ExtractProfilerBuilder extends EvalConsumerBuilder {
-
- public final static String TABLE_PREFIX_KEY = "tablePrefix";
-
- private final List<TableInfo> tableInfos;
- private final List<TableInfo> refTableInfos;
- public ExtractProfilerBuilder() {
- List<TableInfo> tableInfos = new ArrayList<>();
- tableInfos.add(AbstractProfiler.MIME_TABLE);
- tableInfos.add(ExtractProfiler.CONTAINER_TABLE);
- tableInfos.add(ExtractProfiler.PROFILE_TABLE);
- tableInfos.add(ExtractProfiler.EXTRACT_EXCEPTION_TABLE);
- tableInfos.add(ExtractProfiler.EXCEPTION_TABLE);
- tableInfos.add(ExtractProfiler.CONTENTS_TABLE);
- tableInfos.add(ExtractProfiler.TAGS_TABLE);
- tableInfos.add(ExtractProfiler.EMBEDDED_FILE_PATH_TABLE);
- this.tableInfos = Collections.unmodifiableList(tableInfos);
-
- List<TableInfo> refTableInfos = new ArrayList<>();
- refTableInfos.add(AbstractProfiler.REF_PARSE_ERROR_TYPES);
- refTableInfos.add(AbstractProfiler.REF_PARSE_EXCEPTION_TYPES);
- refTableInfos.add(AbstractProfiler.REF_EXTRACT_EXCEPTION_TYPES);
- this.refTableInfos = Collections.unmodifiableList(refTableInfos);
- }
-
- @Override
- public FileResourceConsumer build() throws IOException, SQLException {
- Path extracts = PropsUtil.getPath(localAttrs.get("extracts"), null);
- if (extracts == null) {
- throw new RuntimeException("Must specify \"extracts\" -- directory to crawl");
- }
- if (!Files.isDirectory(extracts)) {
- throw new RuntimeException("ROOT DIRECTORY DOES NOT EXIST: " +
- extracts.toAbsolutePath());
- }
-
- Path inputDir = PropsUtil.getPath(localAttrs.get("inputDir"), null);
-
- //we _could_ set this to extracts (if not null)
- //here, but the Crawler defaults to "input" if nothing is passed
- //so this won't work
- if (inputDir == null) {
- throw new RuntimeException("Must specify -inputDir");
- }
- if (extracts == null && inputDir != null) {
- extracts = inputDir;
- }
- return parameterizeProfiler(new ExtractProfiler(queue, inputDir, extracts,
- buildExtractReader(localAttrs),
- getDBWriter(tableInfos)));
- }
-
-
- @Override
- protected void updateTableInfosWithPrefixes(Map<String, String> attrs) {
- String tableNamePrefix = attrs.get(TABLE_PREFIX_KEY);
- if (tableNamePrefix != null && !tableNamePrefix.equals("null")) {
- for (TableInfo tableInfo : tableInfos) {
- tableInfo.setNamePrefix(tableNamePrefix);
- }
- }
- }
-
-
- @Override
- protected List<TableInfo> getRefTableInfos() {
- return refTableInfos;
- }
-
- @Override
- protected List<TableInfo> getNonRefTableInfos() {
- return tableInfos;
- }
-
- @Override
- protected TableInfo getMimeTable() {
- return AbstractProfiler.MIME_TABLE;
- }
-
- @Override
- protected void addErrorLogTablePairs(DBConsumersManager manager) {
- Path errorLog = PropsUtil.getPath(localAttrs.get("errorLogFile"), null);
- if (errorLog == null) {
- return;
- }
- manager.addErrorLogTablePair(errorLog, ExtractProfiler.EXTRACT_EXCEPTION_TABLE);
- }
-}
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/batch/FileProfilerBuilder.java b/tika-eval/src/main/java/org/apache/tika/eval/batch/FileProfilerBuilder.java
deleted file mode 100644
index a54dd55..0000000
--- a/tika-eval/src/main/java/org/apache/tika/eval/batch/FileProfilerBuilder.java
+++ /dev/null
@@ -1,97 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.eval.batch;
-
-import org.apache.tika.batch.FileResourceConsumer;
-import org.apache.tika.eval.ExtractProfiler;
-import org.apache.tika.eval.FileProfiler;
-import org.apache.tika.eval.db.TableInfo;
-import org.apache.tika.util.PropsUtil;
-
-import java.io.IOException;
-import java.nio.file.Path;
-import java.sql.SQLException;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.List;
-import java.util.Map;
-
-
-public class FileProfilerBuilder extends EvalConsumerBuilder {
-
- public final static String TABLE_PREFIX_KEY = "tablePrefix";
-
- private final List<TableInfo> tableInfos;
-
- public FileProfilerBuilder() {
- List<TableInfo> tableInfos = new ArrayList<>();
- tableInfos.add(FileProfiler.FILE_MIME_TABLE);
- tableInfos.add(FileProfiler.FILE_PROFILES);
- this.tableInfos = Collections.unmodifiableList(tableInfos);
-
- }
-
- @Override
- public FileResourceConsumer build() throws IOException, SQLException {
-
- Path inputDir = PropsUtil.getPath(localAttrs.get("inputDir"), null);
-
- //the Crawler defaults to "input" if nothing is passed,
- //so inputDir must be specified explicitly
- if (inputDir == null) {
- throw new RuntimeException("Must specify -inputDir");
- }
- return parameterizeProfiler(new FileProfiler(queue, inputDir,
- getDBWriter(tableInfos)));
- }
-
-
- @Override
- protected void updateTableInfosWithPrefixes(Map<String, String> attrs) {
- String tableNamePrefix = attrs.get(TABLE_PREFIX_KEY);
- if (tableNamePrefix != null && !tableNamePrefix.equals("null")) {
- for (TableInfo tableInfo : tableInfos) {
- tableInfo.setNamePrefix(tableNamePrefix);
- }
- }
- }
-
- @Override
- protected List<TableInfo> getRefTableInfos() {
- return Collections.emptyList();
- }
-
- @Override
- protected List<TableInfo> getNonRefTableInfos() {
- return tableInfos;
- }
-
- @Override
- protected TableInfo getMimeTable() {
- return FileProfiler.FILE_MIME_TABLE;
- }
-
- @Override
- protected void addErrorLogTablePairs(DBConsumersManager manager) {
- Path errorLog = PropsUtil.getPath(localAttrs.get("errorLogFile"), null);
- if (errorLog == null) {
- return;
- }
- manager.addErrorLogTablePair(errorLog, ExtractProfiler.EXTRACT_EXCEPTION_TABLE);
- }
-}
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/db/AbstractDBBuffer.java b/tika-eval/src/main/java/org/apache/tika/eval/db/AbstractDBBuffer.java
deleted file mode 100644
index baa7994..0000000
--- a/tika-eval/src/main/java/org/apache/tika/eval/db/AbstractDBBuffer.java
+++ /dev/null
@@ -1,77 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.eval.db;
-
-import java.sql.SQLException;
-import java.util.HashMap;
-import java.util.Map;
-import java.util.concurrent.locks.Lock;
-import java.util.concurrent.locks.ReentrantReadWriteLock;
-
-
-/**
- * Abstract buffer for a map of values and unique ids.
- * <p>
- * Use this for fast in-memory lookups of smallish sets of values.
- *
- */
-abstract class AbstractDBBuffer {
-
- private final Map<String, Integer> m = new HashMap<>();
- private final ReentrantReadWriteLock rwl = new ReentrantReadWriteLock();
- private final Lock r = rwl.readLock();
- private final Lock w = rwl.writeLock();
-
- private int numWrites = 0;
-
- public int getId(String key) {
- r.lock();
- try {
- Integer v = m.get(key);
- if (v != null) {
- return v;
- }
- } finally {
- r.unlock();
- }
-
- w.lock();
- try {
- Integer v = m.get(key);
- if (v != null) {
- return v;
- }
- v = m.size()+1;
- m.put(key, v);
- write(v, key);
- numWrites++;
- return v;
- } finally {
- w.unlock();
- }
- }
-
- public int getNumWrites() {
- return numWrites;
- }
-
- //Odd to throw RuntimeException, I know. It should be
- //catastrophic if this buffer can't write to the db.
- public abstract void write(int id, String value) throws RuntimeException;
-
- public abstract void close() throws SQLException;
-}
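
getId() above is the classic check-then-act pattern guarded by a read/write
lock. On current JDKs the same write-through cache can be sketched with
ConcurrentHashMap, whose computeIfAbsent makes the check and the insert atomic
(an alternative sketch, not what this class does; ConcurrentHashMap and
AtomicInteger imports assumed):

    private final ConcurrentHashMap<String, Integer> m = new ConcurrentHashMap<>();
    private final AtomicInteger counter = new AtomicInteger();

    public int getId(String key) {
        //the mapping function runs at most once per key; write() is the same
        //write-through hook as in the locked version above
        return m.computeIfAbsent(key, k -> {
            int v = counter.incrementAndGet();
            write(v, k);
            return v;
        });
    }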
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/db/ColInfo.java b/tika-eval/src/main/java/org/apache/tika/eval/db/ColInfo.java
deleted file mode 100644
index 369cb0a..0000000
--- a/tika-eval/src/main/java/org/apache/tika/eval/db/ColInfo.java
+++ /dev/null
@@ -1,127 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.eval.db;
-
-
-import java.sql.Types;
-import java.util.Objects;
-
-public class ColInfo {
- private final Cols name;
- private final int type;
- private final Integer precision;
- private final String constraints;
-
- public ColInfo(Cols name, int type) {
- this(name, type, null, null);
- }
-
- public ColInfo(Cols name, int type, String constraints) {
- this(name, type, null, constraints);
- }
-
- public ColInfo(Cols name, int type, Integer precision) {
- this(name, type, precision, null);
- }
-
-
- public ColInfo(Cols name, int type, Integer precision, String constraints) {
- this.name = name;
- this.type = type;
- this.precision = precision;
- this.constraints = constraints;
- }
-
- public int getType() {
- return type;
- }
-
- public Cols getName() {
- return name;
- }
- /**
- *
- * @return constraints string or null
- */
- public String getConstraints() {
- return constraints;
- }
-
- /**
- * Gets the precision. This can be null!
- * @return precision or null
- */
- public Integer getPrecision() {
- return precision;
- }
-
- public String getSqlDef() {
- if (type == Types.VARCHAR){
- return "VARCHAR("+precision+")";
- } else if (type == Types.CHAR) {
- return "CHAR("+precision+")";
- }
- switch (type) {
- case Types.FLOAT :
- return "FLOAT";
- case Types.DOUBLE :
- return "DOUBLE";
- case Types.BLOB :
- return "BLOB";
- case Types.INTEGER :
- return "INTEGER";
- case Types.BIGINT :
- return "BIGINT";
- case Types.BOOLEAN :
- return "BOOLEAN";
- }
- throw new UnsupportedOperationException("Don't yet recognize a type for: "+type);
- }
-
- @Override
- public boolean equals(Object o) {
- if (this == o) {
- return true;
- }
- if (o == null || getClass() != o.getClass()) {
- return false;
- }
-
- ColInfo colInfo = (ColInfo) o;
-
- if (type != colInfo.type) {
- return false;
- }
- if (name != colInfo.name) {
- return false;
- }
- if (!Objects.equals(precision, colInfo.precision)) {
- return false;
- }
- return Objects.equals(constraints, colInfo.constraints);
-
- }
-
- @Override
- public int hashCode() {
- int result = name != null ? name.hashCode() : 0;
- result = 31 * result + type;
- result = 31 * result + (precision != null ? precision.hashCode() : 0);
- result = 31 * result + (constraints != null ? constraints.hashCode() : 0);
- return result;
- }
-}
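
getSqlDef() maps java.sql.Types constants to DDL fragments; a quick
illustration using the constructors above:

    new ColInfo(Cols.MIME_STRING, Types.VARCHAR, 256).getSqlDef();       // "VARCHAR(256)"
    new ColInfo(Cols.MIME_ID, Types.INTEGER, "PRIMARY KEY").getSqlDef(); // "INTEGER"
    //the constraint string is not part of getSqlDef(); JDBCUtil appends it
    //separately when it assembles the CREATE TABLE statement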
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/db/Cols.java b/tika-eval/src/main/java/org/apache/tika/eval/db/Cols.java
deleted file mode 100644
index f0e0955..0000000
--- a/tika-eval/src/main/java/org/apache/tika/eval/db/Cols.java
+++ /dev/null
@@ -1,117 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.eval.db;
-
-import java.sql.Types;
-
-public enum Cols {
- //container table
- CONTAINER_ID,
- FILE_PATH,
- EXTRACT_FILE_LENGTH,
-
- EXTRACT_FILE_LENGTH_A, //for comparisons
- EXTRACT_FILE_LENGTH_B,
-
- //profile table
- ID,
- LENGTH,
- FILE_NAME,
- FILE_EXTENSION,
- ELAPSED_TIME_MILLIS,
- NUM_METADATA_VALUES,
- IS_EMBEDDED,
- EMBEDDED_FILE_PATH,
- MIME_ID,
- TIKA_MIME_ID,
- FILE_MIME_ID,
- SHA256,
- MD5,
- NUM_ATTACHMENTS,
- HAS_CONTENT,
-
- //content
- CONTENT_LENGTH,
- NUM_UNIQUE_TOKENS,
- NUM_TOKENS,
- NUM_UNIQUE_ALPHABETIC_TOKENS,
- NUM_ALPHABETIC_TOKENS, //alphabetic or ideographic tokens
- COMMON_TOKENS_LANG, //which language was used for the common tokens metric?
- NUM_UNIQUE_COMMON_TOKENS,
- NUM_COMMON_TOKENS,
- TOP_N_TOKENS,
- LANG_ID_1,
- LANG_ID_PROB_1,
- LANG_ID_2,
- LANG_ID_PROB_2,
- TOKEN_ENTROPY_RATE,
- TOKEN_LENGTH_SUM,
- TOKEN_LENGTH_MEAN,
- TOKEN_LENGTH_STD_DEV,
- UNICODE_CHAR_BLOCKS,
- NUM_PAGES, //number of pages a document alleges it has
- CONTENT_TRUNCATED_AT_MAX_LEN, // was the string truncated at AbstractProfiler.MAX_STRING_LENGTH
-
- //content comparisons
- TOP_10_UNIQUE_TOKEN_DIFFS_A,
- TOP_10_UNIQUE_TOKEN_DIFFS_B,
- TOP_10_MORE_IN_A,
- TOP_10_MORE_IN_B,
- OVERLAP,
- DICE_COEFFICIENT,
-
- //errors
- PARSE_ERROR_ID,
-
- PARSE_ERROR_DESCRIPTION,
- PARSE_EXCEPTION_DESCRIPTION,
-
- EXTRACT_EXCEPTION_ID,
- EXTRACT_EXCEPTION_DESCRIPTION,
-
-
- //exceptions
- ORIG_STACK_TRACE,
- SORT_STACK_TRACE,
- PARSE_EXCEPTION_ID,
-
-
- MIME_STRING,//string representation of mime type
-
- DIR_NAME_A,//for comparisons in REF_PAIR_NAMES
- DIR_NAME_B,
-
- //structure tags
- TAGS_A,
- TAGS_B,
- TAGS_DIV,
- TAGS_I,
- TAGS_IMG,
- TAGS_LI,
- TAGS_P,
- TAGS_OL,
- TAGS_TABLE,
- TAGS_TD,
- TAGS_TITLE,
- TAGS_TR,
- TAGS_UL,
- TAGS_U,
- TAGS_PARSE_EXCEPTION, //if there was a SAX|IO|TikaException while parsing the html or xhtml
- ;
-
-}
-
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/db/DBBuffer.java b/tika-eval/src/main/java/org/apache/tika/eval/db/DBBuffer.java
deleted file mode 100644
index 8d936c2..0000000
--- a/tika-eval/src/main/java/org/apache/tika/eval/db/DBBuffer.java
+++ /dev/null
@@ -1,54 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.eval.db;
-
-
-import java.sql.Connection;
-import java.sql.PreparedStatement;
-import java.sql.SQLException;
-
-public class DBBuffer extends AbstractDBBuffer {
-
- private final PreparedStatement st;
-
- public DBBuffer(Connection connection, String tableName,
- String idColumnName, String valueColumnName) throws SQLException {
- st = connection.prepareStatement("insert into "+tableName+ "( "+
- idColumnName + ", " + valueColumnName+") values (?,?);");
- }
-
- @Override
- public void write(int id, String value) throws RuntimeException {
- try {
- st.clearParameters();
- st.setInt(1, id);
- st.setString(2, value);
- st.execute();
-
- } catch (SQLException e) {
- throw new RuntimeException(e);
- }
- }
-
- @Override
- public void close() throws SQLException {
- st.close();
-
- }
-
-
-}
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/db/H2Util.java b/tika-eval/src/main/java/org/apache/tika/eval/db/H2Util.java
deleted file mode 100644
index 14037be..0000000
--- a/tika-eval/src/main/java/org/apache/tika/eval/db/H2Util.java
+++ /dev/null
@@ -1,89 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.eval.db;
-
-import java.nio.file.Path;
-import java.sql.Connection;
-import java.sql.DriverManager;
-import java.sql.ResultSet;
-import java.sql.SQLException;
-import java.sql.Statement;
-import java.util.HashSet;
-import java.util.Set;
-
-import org.apache.commons.io.FilenameUtils;
-
-
-public class H2Util extends JDBCUtil {
-
- private final static String DRIVER_CLASS = "org.h2.Driver";
-
- private final Path db;
-
- public H2Util(Path db) {
- super(getConnectionString(db, true), DRIVER_CLASS);
- this.db = db;
- }
-
- public static boolean databaseExists(Path db) {
- String connString = getConnectionString(db, false);
- try (Connection conn = DriverManager.getConnection(connString)) {
- //do nothing; we only care whether the connection opens
- } catch (SQLException e) {
- return false;
- }
- return true;
- }
- @Override
- public String getJDBCDriverClass() {
- return "org.h2.Driver";
- }
-
- @Override
- public boolean dropTableIfExists(Connection conn, String tableName) throws SQLException {
- String sql = "drop table if exists "+tableName;
- try (Statement st = conn.createStatement()) {
- return st.execute(sql);
- }
- }
-
- @Override
- public String getConnectionString() {
- return getConnectionString(db, true);
- }
-
- private static String getConnectionString(Path db, boolean createDBIfItDoesntExist) {
- String s = "jdbc:h2:"+ FilenameUtils.separatorsToUnix(db.toAbsolutePath().toString());
- if (! createDBIfItDoesntExist) {
- s += ";IFEXISTS=TRUE";
- }
- return s;
- }
-
- @Override
- public Set<String> getTables(Connection connection) throws SQLException {
- String sql = "SHOW TABLES";
- Statement st = connection.createStatement();
- ResultSet rs = st.executeQuery(sql);
- Set<String> tables = new HashSet<>();
- while (rs.next()) {
- String table = rs.getString(1);
- tables.add(table);
- }
- return tables;
- }
-}
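
Usage sketch (hypothetical path; exception handling elided): databaseExists()
relies on the ;IFEXISTS=TRUE flag, which makes H2 fail to connect rather than
silently create a new database.

    Path db = Paths.get("/data/tika-eval/comparisons");
    if (H2Util.databaseExists(db)) {    //probes jdbc:h2:<abs path>;IFEXISTS=TRUE
        H2Util util = new H2Util(db);   //plain jdbc:h2:<abs path>, creates if absent
        Connection conn = util.getConnection();
        //...
    }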
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/db/JDBCUtil.java b/tika-eval/src/main/java/org/apache/tika/eval/db/JDBCUtil.java
deleted file mode 100644
index 33f1279..0000000
--- a/tika-eval/src/main/java/org/apache/tika/eval/db/JDBCUtil.java
+++ /dev/null
@@ -1,316 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.tika.eval.db;
-
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.sql.Connection;
-import java.sql.DatabaseMetaData;
-import java.sql.DriverManager;
-import java.sql.PreparedStatement;
-import java.sql.ResultSet;
-import java.sql.SQLException;
-import java.sql.Statement;
-import java.sql.Types;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Locale;
-import java.util.Map;
-import java.util.Properties;
-import java.util.Set;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-public class JDBCUtil {
- private static final Logger LOG = LoggerFactory.getLogger(JDBCUtil.class);
-
- public enum CREATE_TABLE {
- DROP_IF_EXISTS,
- SKIP_IF_EXISTS,
- THROW_EX_IF_EXISTS,
- }
-
- private final String connectionString;
- private String driverClass;
- private Connection connection = null;
-
- public JDBCUtil(String connectionString, String driverClass) {
- this.connectionString = connectionString;
- this.driverClass = driverClass;
- if (driverClass == null || driverClass.length() == 0) {
- if (System.getProperty("jdbc.drivers") != null) {
- //user has specified it on the command line
- //stop now
- } else {
- //try to use the mappings in db.properties to determine the class
- try (InputStream is = JDBCUtil.class.getResourceAsStream("/db.properties")) {
- Properties properties = new Properties();
- properties.load(is);
- for (String k : properties.stringPropertyNames()) {
- Matcher m = Pattern.compile("(?i)jdbc:" + k).matcher(connectionString);
- if (m.find()) {
- this.driverClass = properties.getProperty(k);
- }
- }
-
- } catch (IOException e) {
- //swallow; the driver may still be resolvable via jdbc.drivers
- }
- }
- }
- }
-
- /**
- * Get (and cache) the connection. Override this to apply any
- * optimizations you want to make on the db before writing/reading.
- *
- * @return a connection to the database
- * @throws SQLException if the connection cannot be opened
- */
- public Connection getConnection() throws SQLException {
- if (connection != null) {
- return connection;
- }
- String connectionString = getConnectionString();
- String jdbcDriver = getJDBCDriverClass();
- if (jdbcDriver != null) {
- try {
- Class.forName(getJDBCDriverClass());
- } catch (ClassNotFoundException e) {
- throw new RuntimeException(e);
- }
- }
- connection = DriverManager.getConnection(connectionString);
- connection.setAutoCommit(false);
-
- return connection;
- }
-
- /**
- * JDBC driver class. Override as necessary.
- *
- * @return the JDBC driver class name, or null if none was resolved
- */
- public String getJDBCDriverClass() {
- return driverClass;
- }
-
-
- public boolean dropTableIfExists(Connection conn, String tableName) throws SQLException {
- if (containsTable(tableName)) {
- try (Statement st = conn.createStatement()) {
- String sql = "drop table " + tableName;
- return st.execute(sql);
- }
- }
- return true;
- }
-
-
- public String getConnectionString() {
- return connectionString;
- }
-
-
- public Set<String> getTables(Connection connection) throws SQLException {
- Set<String> tables = new HashSet<>();
-
- DatabaseMetaData dbMeta = connection.getMetaData();
-
- try (ResultSet rs = dbMeta.getTables(null, null, "%", null)) {
- while (rs.next()) {
- tables.add(rs.getString(3).toLowerCase(Locale.US));
- }
- }
- return tables;
- }
-
- /**
- * @deprecated use {@link #batchInsert(PreparedStatement, TableInfo, Map)}
- */
- @Deprecated
- public static int insert(PreparedStatement insertStatement,
- TableInfo table,
- Map<Cols, String> data) throws SQLException {
-
- //clear parameters before setting
- insertStatement.clearParameters();
- try {
- int i = 1;
- for (ColInfo colInfo : table.getColInfos()) {
- updateInsertStatement(i, insertStatement, colInfo, data.get(colInfo.getName()));
- i++;
- }
- for (Cols c : data.keySet()) {
- if (!table.containsColumn(c)) {
- throw new IllegalArgumentException("Can't add data to " + c +
- " because it doesn't exist in the table: " + table.getName());
- }
- }
- return insertStatement.executeUpdate();
- } catch (SQLException e) {
- LOG.warn("couldn't insert data for this row: {}", e.getMessage());
- return -1;
- }
- }
-
- public static void batchInsert(PreparedStatement insertStatement,
- TableInfo table,
- Map<Cols, String> data) throws SQLException {
-
- try {
- int i = 1;
- for (ColInfo colInfo : table.getColInfos()) {
- updateInsertStatement(i, insertStatement, colInfo, data.get(colInfo.getName()));
- i++;
- }
- for (Cols c : data.keySet()) {
- if (!table.containsColumn(c)) {
- throw new IllegalArgumentException("Can't add data to " + c +
- " because it doesn't exist in the table: " + table.getName());
- }
- }
- insertStatement.addBatch();
- } catch (SQLException e) {
- LOG.warn("couldn't insert data for this row: {}", e.getMessage());
- }
- }
-
- public static void updateInsertStatement(int dbColOffset, PreparedStatement st,
- ColInfo colInfo, String value) throws SQLException {
- if (value == null) {
- st.setNull(dbColOffset, colInfo.getType());
- return;
- }
- try {
- switch (colInfo.getType()) {
- case Types.VARCHAR:
- if (value != null && value.length() > colInfo.getPrecision()) {
- value = value.substring(0, colInfo.getPrecision());
- LOG.warn("truncated varchar value in {} : {}", colInfo.getName(), value);
- }
- //postgres doesn't allow \0000
- value = value.replaceAll("\u0000", " ");
- st.setString(dbColOffset, value);
- break;
- case Types.CHAR:
- //postgres doesn't allow \0000
- value = value.replaceAll("\u0000", " ");
- st.setString(dbColOffset, value);
- break;
- case Types.DOUBLE:
- st.setDouble(dbColOffset, Double.parseDouble(value));
- break;
- case Types.FLOAT:
- st.setDouble(dbColOffset, Float.parseFloat(value));
- break;
- case Types.INTEGER:
- st.setInt(dbColOffset, Integer.parseInt(value));
- break;
- case Types.BIGINT:
- st.setLong(dbColOffset, Long.parseLong(value));
- break;
- case Types.BOOLEAN:
- st.setBoolean(dbColOffset, Boolean.parseBoolean(value));
- break;
- default:
- throw new UnsupportedOperationException("Don't yet support type: " + colInfo.getType());
- }
- } catch (NumberFormatException e) {
- if (!"".equals(value)) {
- LOG.warn("number format exception: {} : {}", colInfo.getName(), value);
- }
- st.setNull(dbColOffset, colInfo.getType());
- } catch (SQLException e) {
- LOG.warn("sqlexception: {} : {}", colInfo, value);
- st.setNull(dbColOffset, colInfo.getType());
- }
- }
-
- public void createTables(List<TableInfo> tableInfos, CREATE_TABLE createTable) throws SQLException, IOException {
-
- Connection conn = getConnection();
- for (TableInfo tableInfo : tableInfos) {
-
- if (createTable.equals(CREATE_TABLE.DROP_IF_EXISTS)) {
- dropTableIfExists(conn, tableInfo.getName());
- } else if (createTable.equals(CREATE_TABLE.SKIP_IF_EXISTS)) {
- if (containsTable(tableInfo.getName())) {
- continue;
- }
- }
- createTable(conn, tableInfo);
- }
- conn.commit();
-
- }
-
- public boolean containsTable(String tableName) throws SQLException {
- Connection connection = getConnection();
- Set<String> tables = getTables(connection);
- if (tables.contains(normalizeTableName(tableName))) {
- return true;
- }
-
- return false;
- }
-
- /**
- * Override for custom behavior.
- *
- * @param tableName table name to normalize
- * @return the normalized (lower-cased) table name
- */
- String normalizeTableName(String tableName) {
- tableName = tableName.toLowerCase(Locale.US);
- return tableName;
- }
-
- //does not close the connection
- private void createTable(Connection conn, TableInfo tableInfo) throws SQLException {
- StringBuilder createSql = new StringBuilder();
- createSql.append("CREATE TABLE " + tableInfo.getName());
- createSql.append("(");
-
- int last = 0;
- for (ColInfo col : tableInfo.getColInfos()) {
- last++;
- if (last > 1) {
- createSql.append(", ");
- }
- createSql.append(col.getName());
- createSql.append(" ");
- createSql.append(col.getSqlDef());
- String constraints = col.getConstraints();
- if (constraints != null) {
- createSql.append(" ");
- createSql.append(constraints);
- }
- }
- createSql.append(")");
- try (Statement st = conn.createStatement()) {
- st.execute(createSql.toString());
- }
- conn.commit();
- }
-}
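
Note that batchInsert() only queues a row on the PreparedStatement; the caller
is responsible for executeBatch() and the commit. A usage sketch with
hypothetical data (statement preparation, shown in DBWriter below, is elided):

    Map<Cols, String> row = new HashMap<>();
    row.put(Cols.MIME_ID, "42");
    row.put(Cols.MIME_STRING, "application/pdf");
    JDBCUtil.batchInsert(insertStatement, mimeTable, row);
    //...queue more rows, then flush:
    insertStatement.executeBatch();
    connection.commit();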
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/db/MimeBuffer.java b/tika-eval/src/main/java/org/apache/tika/eval/db/MimeBuffer.java
deleted file mode 100644
index eba731b..0000000
--- a/tika-eval/src/main/java/org/apache/tika/eval/db/MimeBuffer.java
+++ /dev/null
@@ -1,148 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.eval.db;
-
-import java.sql.Connection;
-import java.sql.PreparedStatement;
-import java.sql.SQLException;
-import java.sql.Types;
-
-import org.apache.tika.config.TikaConfig;
-import org.apache.tika.eval.AbstractProfiler;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.mime.MimeType;
-import org.apache.tika.mime.MimeTypeException;
-import org.apache.tika.mime.MimeTypes;
-
-
-public class MimeBuffer extends AbstractDBBuffer {
-
- private final PreparedStatement st;
- private final TikaConfig config;
- private final Connection connection;
-
-
- public MimeBuffer(Connection connection, TableInfo mimeTable, TikaConfig config) throws SQLException {
- st = connection.prepareStatement("insert into " + mimeTable.getName() + "( " +
- Cols.MIME_ID.name() + ", " +
- Cols.MIME_STRING.name() + ", " +
- Cols.FILE_EXTENSION.name() + ") values (?,?,?)");
- this.config = config;
- this.connection = connection;
- }
-
- @Override
- public void write(int id, String value) throws RuntimeException {
- try {
- st.clearParameters();
- st.setInt(1, id);
- st.setString(2, value);
- try {
- String ext = MimeUtil.getExtension(value, config);
- if (ext == null || ext.length() == 0) {
- st.setNull(3, Types.VARCHAR);
- } else {
- st.setString(3, ext);
- }
- } catch (MimeTypeException e) {
- st.setNull(3, Types.VARCHAR);
- }
- st.execute();
-
- } catch (SQLException e) {
- throw new RuntimeException(e);
- }
- }
-
- @Override
- public void close() throws SQLException {
- st.close();
- connection.commit();
- }
-
- private static class MimeUtil {
- //TODO: see if MimeType now works for these
- private static final String APPLICATION = "application";
- private static final String TEXT = "text";
- private static final String HTML = "html";
- private static final String XML = "xml";
- private static final String XHTML_XML = "xhtml+xml";
- private static final String CSS = "css";
- private static final String CSV = "csv";
- private static final String PLAIN = "plain";
- private static final String EMPTY_STRING = "";
-
- /**
- * Utility method to convert from a string value representing a content type
- * (e.g. "application/pdf") into the most common extension for that file type
- * (e.g. "pdf").
- * <p>
- * This has special handling for text-y file types whose MimeTypes
- * don't currently return anything for {@link MimeType#getExtension}.
- *
- * @param contentType string representing a content type, for example: "application/pdf"
- * @param config config from which to get MimeRepository
- * @return extension or empty string
- * @throws MimeTypeException thrown if MimeTypes can't parse the contentType
- */
- public static String getExtension(String contentType, TikaConfig config)
- throws MimeTypeException {
- MimeTypes types = config.getMimeRepository();
- MimeType mime = types.forName(contentType);
- return getExtension(mime);
- }
-
- public static String getExtension(MimeType mime) {
-
- String ext = mime.getExtension();
- if (ext.startsWith(".")) {
- ext = ext.substring(1);
- }
-
- //special handling for text/html/xml
- if (ext.length() == 0) {
- ext = tryTextyTypes(mime.getType());
- }
- return ext;
- }
-
- private static String tryTextyTypes(MediaType mediaType) {
-
- String type = mediaType.getType();
- String subtype = mediaType.getSubtype();
- if (type.equals(TEXT)) {
- if (subtype.equals(HTML)) {
- return HTML;
- } else if (subtype.equals(PLAIN)) {
- return "txt";
- } else if (subtype.equals(CSS)) {
- return CSS;
- } else if (subtype.equals(CSV)) {
- return CSV;
- }
- } else if (type.equals(APPLICATION)) {
- if (subtype.equals(XML)) {
- return XML;
- } else if (subtype.equals(XHTML_XML)) {
- return "html";
- }
- }
- return EMPTY_STRING;
- }
- }
-
-}
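
The per-mime lookup the buffer performs in write(), in isolation (standard
Tika mime APIs; the content type is illustrative):

    TikaConfig config = TikaConfig.getDefaultConfig();
    MimeType mime = config.getMimeRepository().forName("application/pdf"); //throws MimeTypeException
    String ext = mime.getExtension(); // ".pdf" -- MimeUtil strips the leading dot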
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/db/TableInfo.java b/tika-eval/src/main/java/org/apache/tika/eval/db/TableInfo.java
deleted file mode 100644
index 181a028..0000000
--- a/tika-eval/src/main/java/org/apache/tika/eval/db/TableInfo.java
+++ /dev/null
@@ -1,73 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.eval.db;
-
-
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Set;
-
-public class TableInfo {
-
- private final String name;
- private final List<ColInfo> colInfos = new ArrayList<>();
- private final Set<Cols> colNames = new HashSet<>();
-
- private String prefix;
-
- public TableInfo(String name, ColInfo... cols) {
- Collections.addAll(colInfos, cols);
- this.name = name;
- for (ColInfo c : colInfos) {
- assert (!colNames.contains(c.getName()));
- colNames.add(c.getName());
- }
- }
-
- public TableInfo(String name, List<ColInfo> cols) {
- colInfos.addAll(cols);
- this.name = name;
- for (ColInfo c : colInfos) {
- assert (!colNames.contains(c.getName()));
- colNames.add(c.getName());
- }
- }
-
- public String getName() {
- if (prefix == null) {
- return name;
- }
- return prefix+name;
- }
-
- public void setNamePrefix(String prefix) {
- this.prefix = prefix;
- }
-
- public List<ColInfo> getColInfos() {
- //return an immutable view so callers can't mutate the column definitions
- return Collections.unmodifiableList(colInfos);
- }
-
- public boolean containsColumn(Cols cols) {
- return colNames.contains(cols);
- }
-}
-
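A side note on the constructors above: the original code called Collections.unmodifiableList(colInfos) and discarded the result, which has no effect, because the unmodifiable list is a returned view rather than an in-place change; it is the accessor that should hand that view out, as getColInfos() above does. A tiny self-contained illustration of the pitfall:

    import java.util.ArrayList;
    import java.util.Arrays;
    import java.util.Collections;
    import java.util.List;

    public class UnmodifiableDemo {
        public static void main(String[] args) {
            List<String> backing = new ArrayList<>(Arrays.asList("a"));
            Collections.unmodifiableList(backing); //no-op: the returned view is discarded
            backing.add("b");                      //the backing list is still mutable
            List<String> view = Collections.unmodifiableList(backing);
            System.out.println(view);              //[a, b]
            //view.add("c");                       //would throw UnsupportedOperationException
        }
    }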
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/io/DBWriter.java b/tika-eval/src/main/java/org/apache/tika/eval/io/DBWriter.java
deleted file mode 100644
index 5d81016..0000000
--- a/tika-eval/src/main/java/org/apache/tika/eval/io/DBWriter.java
+++ /dev/null
@@ -1,162 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.eval.io;
-
-import java.io.IOException;
-import java.sql.Connection;
-import java.sql.PreparedStatement;
-import java.sql.SQLException;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-import java.util.concurrent.atomic.AtomicInteger;
-import java.util.concurrent.atomic.AtomicLong;
-
-import org.apache.commons.io.IOExceptionWithCause;
-import org.apache.tika.eval.db.ColInfo;
-import org.apache.tika.eval.db.Cols;
-import org.apache.tika.eval.db.JDBCUtil;
-import org.apache.tika.eval.db.MimeBuffer;
-import org.apache.tika.eval.db.TableInfo;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-/**
- * This is still in its early stages. The idea is to
- * get something working with h2 and then add to that
- * as necessary.
- *
- * Beware, this deletes the db file with each initialization.
- *
- * Each thread must construct its own DBWriter because each
- * DBWriter creates its own PreparedStatements at initialization.
- */
-public class DBWriter implements IDBWriter {
-
- private static final Logger LOG = LoggerFactory.getLogger(DBWriter.class);
-
- private static final AtomicInteger WRITER_ID = new AtomicInteger();
- private final Long commitEveryXRows = 10000L;
- //private final Long commitEveryXMS = 60000L;
-
- private final Connection conn;
- private final JDBCUtil dbUtil;
- private final MimeBuffer mimeBuffer;
- private final int myId = WRITER_ID.getAndIncrement();
-
- //<tableName, preparedStatement>
- private final Map<String, PreparedStatement> inserts = new HashMap<>();
- private final Map<String, LastInsert> lastInsertMap = new HashMap<>();
- public DBWriter(Connection connection, List<TableInfo> tableInfos, JDBCUtil dbUtil, MimeBuffer mimeBuffer)
- throws IOException, SQLException {
-
- this.conn = connection;
- this.mimeBuffer = mimeBuffer;
- this.dbUtil = dbUtil;
- for (TableInfo tableInfo : tableInfos) {
- //the constructor already declares SQLException, so let it propagate
- //rather than wrapping it in an unchecked RuntimeException
- PreparedStatement st = createPreparedInsert(tableInfo);
- inserts.put(tableInfo.getName(), st);
- lastInsertMap.put(tableInfo.getName(), new LastInsert());
- }
- }
-
- public int getMimeId(String mimeString) {
- return mimeBuffer.getId(mimeString);
- }
-
- private PreparedStatement createPreparedInsert(TableInfo tableInfo) throws SQLException {
- StringBuilder sb = new StringBuilder();
- sb.append("INSERT INTO ").append(tableInfo.getName());
- sb.append("(");
- int i = 0;
- for (ColInfo c : tableInfo.getColInfos()) {
- if (i++ > 0) {
- sb.append(", ");
- }
- sb.append(c.getName());
- }
- sb.append(") ");
-
- sb.append("VALUES");
- sb.append("(");
- for (int j = 0; j < i; j++) {
- if (j > 0) {
- sb.append(", ");
- }
- sb.append("?");
- }
- sb.append(")");
-
- return conn.prepareStatement(sb.toString());
- }
-
-
- public void writeRow(TableInfo table, Map<Cols, String> data) throws IOException {
- try {
- PreparedStatement p = inserts.get(table.getName());
- if (p == null) {
- throw new RuntimeException("Failed to create prepared statement for: "+
- table.getName());
- }
- dbUtil.batchInsert(p, table, data);
- LastInsert lastInsert = lastInsertMap.get(table.getName());
- lastInsert.rowCount++;
- long elapsed = System.currentTimeMillis()-lastInsert.lastInsert;
- if (
- //elapsed > commitEveryXMS ||
- lastInsert.rowCount % commitEveryXRows == 0) {
- LOG.info("writer ({}) on table ({}) is committing after {} rows and {} ms", myId,
- table.getName(),
- lastInsert.rowCount, elapsed);
- p.executeBatch();
- conn.commit();
- lastInsert.lastInsert = System.currentTimeMillis();
- }
- } catch (SQLException e) {
- throw new IOException(e);
- }
- }
-
- /**
- * This closes the writer by executing batch and
- * committing changes. This DOES NOT close the connection
- * @throws IOException
- */
- public void close() throws IOException {
- for (PreparedStatement p : inserts.values()) {
- try {
- p.executeBatch();
- } catch (SQLException e) {
- throw new IOExceptionWithCause(e);
- }
- }
- try {
- conn.commit();
- } catch (SQLException e){
- throw new IOExceptionWithCause(e);
- }
- }
-
- private class LastInsert {
- private long lastInsert = System.currentTimeMillis();
- private long rowCount = 0;
- }
-}
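The pattern above (one parameterized INSERT per table, addBatch() per row via JDBCUtil, executeBatch() plus commit() every commitEveryXRows rows, and a final flush in close()) is plain JDBC. A self-contained sketch of the same lifecycle against an in-memory H2 database; the table and column names are illustrative, not tika-eval's:

    import java.sql.Connection;
    import java.sql.DriverManager;
    import java.sql.PreparedStatement;
    import java.sql.SQLException;
    import java.sql.Statement;

    public class BatchInsertDemo {
        public static void main(String[] args) throws SQLException {
            //in-memory H2 database; needs the h2 driver on the classpath
            try (Connection conn = DriverManager.getConnection("jdbc:h2:mem:demo")) {
                conn.setAutoCommit(false);
                try (Statement st = conn.createStatement()) {
                    st.execute("CREATE TABLE profiles (id INT, mime VARCHAR(64))");
                }
                //one '?' per column, exactly as createPreparedInsert() builds it
                String sql = "INSERT INTO profiles (id, mime) VALUES (?, ?)";
                int commitEveryXRows = 1000;
                try (PreparedStatement ps = conn.prepareStatement(sql)) {
                    for (int i = 0; i < 5000; i++) {
                        ps.setInt(1, i);
                        ps.setString(2, "application/pdf");
                        ps.addBatch();
                        if ((i + 1) % commitEveryXRows == 0) {
                            ps.executeBatch(); //flush and commit periodically, as writeRow() does
                            conn.commit();
                        }
                    }
                    ps.executeBatch();         //flush the tail, as close() does
                    conn.commit();
                }
            }
        }
    }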
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/io/ExtractReader.java b/tika-eval/src/main/java/org/apache/tika/eval/io/ExtractReader.java
deleted file mode 100644
index d48a4de..0000000
--- a/tika-eval/src/main/java/org/apache/tika/eval/io/ExtractReader.java
+++ /dev/null
@@ -1,243 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.eval.io;
-
-import java.io.BufferedReader;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.InputStreamReader;
-import java.io.Reader;
-import java.nio.charset.StandardCharsets;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.Locale;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
-import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
-import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream;
-import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
-import org.apache.commons.io.IOUtils;
-import org.apache.tika.config.TikaConfig;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.metadata.serialization.JsonMetadataList;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler;
-import org.apache.tika.sax.RecursiveParserWrapperHandler;
-import org.apache.tika.sax.ToTextContentHandler;
-import org.apache.tika.sax.ToXMLContentHandler;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-
-public class ExtractReader {
- private static final Logger LOG = LoggerFactory.getLogger(ExtractReader.class);
-
- public static final long IGNORE_LENGTH = -1L;
-
- public enum ALTER_METADATA_LIST {
- AS_IS, //leave the metadata list as is
- FIRST_ONLY, //take only the metadata list for the "container" document
- CONCATENATE_CONTENT_INTO_FIRST // concatenate all of the content into the first
- }
-
- private TikaConfig tikaConfig = TikaConfig.getDefaultConfig();
-
- private final ALTER_METADATA_LIST alterMetadataList;
- private final long minExtractLength;
- private final long maxExtractLength;
-
- /**
- * Reads full extract, no modification of metadata list, no min or max extract length checking
- */
- public ExtractReader() {
- this(ALTER_METADATA_LIST.AS_IS, IGNORE_LENGTH, IGNORE_LENGTH);
- }
-
- public ExtractReader(ALTER_METADATA_LIST alterMetadataList) {
- this(alterMetadataList, IGNORE_LENGTH, IGNORE_LENGTH);
- }
-
- public ExtractReader(ALTER_METADATA_LIST alterMetadataList, long minExtractLength, long maxExtractLength) {
- this.alterMetadataList = alterMetadataList;
- this.minExtractLength = minExtractLength;
- this.maxExtractLength = maxExtractLength;
- if (maxExtractLength > IGNORE_LENGTH && minExtractLength >= maxExtractLength) {
- throw new IllegalArgumentException("minExtractLength("+minExtractLength+
- ") must be < maxExtractLength("+maxExtractLength+")");
- }
- }
- public List<Metadata> loadExtract(Path extractFile) throws ExtractReaderException {
-
- List<Metadata> metadataList = null;
- if (extractFile == null || !Files.isRegularFile(extractFile)) {
- throw new ExtractReaderException(ExtractReaderException.TYPE.NO_EXTRACT_FILE);
- }
-
- FileSuffixes fileSuffixes = parseSuffixes(extractFile.getFileName().toString());
- if (fileSuffixes.format == null) {
- throw new ExtractReaderException(ExtractReaderException.TYPE.INCORRECT_EXTRACT_FILE_SUFFIX);
- }
-
- long length = -1L;
- try {
- length = Files.size(extractFile);
- } catch (IOException e) {
- throw new ExtractReaderException(ExtractReaderException.TYPE.IO_EXCEPTION);
- }
-
- if (length == 0L) {
- throw new ExtractReaderException(ExtractReaderException.TYPE.ZERO_BYTE_EXTRACT_FILE);
- }
-
- if (minExtractLength > IGNORE_LENGTH && length < minExtractLength) {
- throw new ExtractReaderException(ExtractReaderException.TYPE.EXTRACT_FILE_TOO_SHORT);
- }
- if (maxExtractLength > IGNORE_LENGTH && length > maxExtractLength) {
- throw new ExtractReaderException(ExtractReaderException.TYPE.EXTRACT_FILE_TOO_LONG);
- }
-
- Reader reader = null;
- InputStream is = null;
- try {
- is = Files.newInputStream(extractFile);
- if (fileSuffixes.compression != null) {
- if (fileSuffixes.compression.equals("bz2")) {
- is = new BZip2CompressorInputStream(is);
- } else if (fileSuffixes.compression.equals("gz")
- || fileSuffixes.compression.equals("gzip")) {
- is = new GzipCompressorInputStream(is);
- } else if (fileSuffixes.compression.equals("zip")) {
- is = new ZCompressorInputStream(is);
- } else {
- LOG.warn("Can't yet process compression of type: {}", fileSuffixes.compression);
- return metadataList;
- }
- }
- reader = new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8));
- } catch (IOException e) {
- throw new ExtractReaderException(ExtractReaderException.TYPE.IO_EXCEPTION);
- }
-
- try {
- if (fileSuffixes.format == FileSuffixes.FORMAT.JSON) {
- metadataList = JsonMetadataList.fromJson(reader);
- if (alterMetadataList.equals(ALTER_METADATA_LIST.FIRST_ONLY) && metadataList.size() > 1) {
- while (metadataList.size() > 1) {
- metadataList.remove(metadataList.size()-1);
- }
- } else if (alterMetadataList.equals(ALTER_METADATA_LIST.CONCATENATE_CONTENT_INTO_FIRST) &&
- metadataList.size() > 1) {
- StringBuilder sb = new StringBuilder();
- Metadata containerMetadata = metadataList.get(0);
- for (Metadata m : metadataList) {
- String c = m.get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT);
- if (c != null) {
- sb.append(c);
- sb.append(" ");
- }
- }
- containerMetadata.set(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT, sb.toString());
- while (metadataList.size() > 1) {
- metadataList.remove(metadataList.size()-1);
- }
- }
- } else {
- metadataList = generateListFromTextFile(reader, fileSuffixes);
- }
- } catch (IOException e) {
- throw new ExtractReaderException(ExtractReaderException.TYPE.IO_EXCEPTION);
- } catch (TikaException e) {
- throw new ExtractReaderException(ExtractReaderException.TYPE.EXTRACT_PARSE_EXCEPTION);
- } finally {
- IOUtils.closeQuietly(reader);
- IOUtils.closeQuietly(is);
- }
- return metadataList;
- }
-
- private List<Metadata> generateListFromTextFile(Reader reader,
- FileSuffixes fileSuffixes) throws IOException {
- List<Metadata> metadataList = new ArrayList<>();
- String content = IOUtils.toString(reader);
- Metadata m = new Metadata();
- m.set(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT, content);
- if (fileSuffixes.format == FileSuffixes.FORMAT.HTML) {
- m.set(RecursiveParserWrapperHandler.TIKA_CONTENT_HANDLER, ToXMLContentHandler.class.getSimpleName());
- } else if (fileSuffixes.format == FileSuffixes.FORMAT.TXT) {
- m.set(RecursiveParserWrapperHandler.TIKA_CONTENT_HANDLER, ToTextContentHandler.class.getSimpleName());
- }
- //Let's hope the file name has a suffix that can
- //be used to determine the mime. Could be wrong or missing,
- //but better than nothing.
- m.set(TikaCoreProperties.RESOURCE_NAME_KEY, fileSuffixes.originalFileName);
-
- MediaType mimeType = tikaConfig.getMimeRepository().detect(null, m);
- if (mimeType != null) {
- m.set(Metadata.CONTENT_TYPE, mimeType.toString());
- }
- metadataList.add(m);
- return metadataList;
-
- }
-
- protected static FileSuffixes parseSuffixes(String fName) {
- FileSuffixes fileSuffixes = new FileSuffixes();
- if (fName == null) {
- return fileSuffixes;
- }
- Matcher m = Pattern.compile("(?i)^(.*?)\\.(json|txt|x?html)(?:\\.(bz2|gz(?:ip)?|zip))?$").matcher(fName);
- if (m.find()) {
- fileSuffixes.originalFileName = m.group(1);
- fileSuffixes.setFormat(m.group(2));
- fileSuffixes.compression = m.group(3);
- }
- return fileSuffixes;
- }
-
- private static class FileSuffixes {
-
- enum FORMAT {
- TXT,
- HTML,
- JSON
- }
- String compression;
- FORMAT format;
- String originalFileName;
-
- public void setFormat(String fmt) {
- String lc = fmt.toLowerCase(Locale.ENGLISH);
- if (lc.equals("json")) {
- format = FORMAT.JSON;
- } else if (lc.equals("txt")) {
- format = FORMAT.TXT;
- } else if (lc.contains("html")) {
- format = FORMAT.HTML;
- } else {
- throw new IllegalArgumentException("extract must end in .json, .txt, .html or .xhtml");
- }
- }
- }
-}
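parseSuffixes() accepts names like extract.json, extract.txt.gz or extract.XHTML.bz2 (the pattern is case-insensitive). A standalone sketch of the same regex showing what lands in each capture group:

    import java.util.regex.Matcher;
    import java.util.regex.Pattern;

    public class SuffixDemo {
        private static final Pattern P =
                Pattern.compile("(?i)^(.*?)\\.(json|txt|x?html)(?:\\.(bz2|gz(?:ip)?|zip))?$");

        public static void main(String[] args) {
            String[] names = {"report.json", "page.xhtml.bz2", "notes.TXT.gz", "noformat.pdf"};
            for (String name : names) {
                Matcher m = P.matcher(name);
                if (m.find()) {
                    //group(1)=original name, group(2)=format, group(3)=compression or null
                    System.out.printf("%s -> base=%s format=%s compression=%s%n",
                            name, m.group(1), m.group(2), m.group(3));
                } else {
                    System.out.println(name + " -> no recognized extract suffix");
                }
            }
        }
    }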
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/io/ExtractReaderException.java b/tika-eval/src/main/java/org/apache/tika/eval/io/ExtractReaderException.java
deleted file mode 100644
index 5aa2f3f..0000000
--- a/tika-eval/src/main/java/org/apache/tika/eval/io/ExtractReaderException.java
+++ /dev/null
@@ -1,48 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.eval.io;
-
-import java.io.IOException;
-
-/**
- * Exception when trying to read extract
- */
-public class ExtractReaderException extends IOException {
-
- public enum TYPE {
- //what do you see when you look at the extract file
- NO_EXTRACT_FILE,
- ZERO_BYTE_EXTRACT_FILE,
- IO_EXCEPTION,
- EXTRACT_PARSE_EXCEPTION,
- EXTRACT_FILE_TOO_SHORT,
- EXTRACT_FILE_TOO_LONG,
- INCORRECT_EXTRACT_FILE_SUFFIX;//extract file must have a suffix of .json, .txt, .html or .xhtml,
- // optionally followed by .gz, .gzip, .zip or .bz2
- }
-
- private final TYPE type;
-
- public ExtractReaderException(TYPE exceptionType) {
- this.type = exceptionType;
- }
-
- public TYPE getType() {
- return type;
- }
-
-}
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/io/IDBWriter.java b/tika-eval/src/main/java/org/apache/tika/eval/io/IDBWriter.java
deleted file mode 100644
index b2b76ab..0000000
--- a/tika-eval/src/main/java/org/apache/tika/eval/io/IDBWriter.java
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.tika.eval.io;
-
-
-import java.io.IOException;
-import java.util.Map;
-
-import org.apache.tika.eval.db.Cols;
-import org.apache.tika.eval.db.TableInfo;
-
-public interface IDBWriter {
- public void writeRow(TableInfo table, Map<Cols, String> data) throws IOException;
- public void close() throws IOException;
- public int getMimeId(String mimeString);
-}
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/io/XMLLogMsgHandler.java b/tika-eval/src/main/java/org/apache/tika/eval/io/XMLLogMsgHandler.java
deleted file mode 100644
index 2d509f7..0000000
--- a/tika-eval/src/main/java/org/apache/tika/eval/io/XMLLogMsgHandler.java
+++ /dev/null
@@ -1,26 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.eval.io;
-
-import java.io.IOException;
-import java.sql.SQLException;
-
-import org.apache.log4j.Level;
-
-public interface XMLLogMsgHandler {
- public void handleMsg(Level level, String xml) throws IOException, SQLException;
-}
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/io/XMLLogReader.java b/tika-eval/src/main/java/org/apache/tika/eval/io/XMLLogReader.java
deleted file mode 100644
index 82c1b8b..0000000
--- a/tika-eval/src/main/java/org/apache/tika/eval/io/XMLLogReader.java
+++ /dev/null
@@ -1,115 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.eval.io;
-
-import javax.xml.stream.XMLInputFactory;
-import javax.xml.stream.XMLStreamConstants;
-import javax.xml.stream.XMLStreamException;
-import javax.xml.stream.XMLStreamReader;
-import java.io.ByteArrayInputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.nio.charset.StandardCharsets;
-import java.sql.SQLException;
-
-import org.apache.commons.io.IOUtils;
-import org.apache.log4j.Level;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.utils.XMLReaderUtils;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-
-public class XMLLogReader {
- private static final Logger LOG = LoggerFactory.getLogger(XMLLogReader.class);
- //class that wraps a logger's xml output
- //into a single xml parseable input stream.
-
- public void read(InputStream xmlLogFileIs, XMLLogMsgHandler handler) throws XMLStreamException {
- InputStream is = new LogXMLWrappingInputStream(xmlLogFileIs);
- XMLInputFactory factory = XMLReaderUtils.getXMLInputFactory();
- XMLStreamReader reader = factory.createXMLStreamReader(is);
-
- Level level = null;
- while (reader.hasNext()) {
- reader.next();
- switch (reader.getEventType()) {
- case XMLStreamConstants.START_ELEMENT :
- if ("event".equals(reader.getLocalName())) {
- level = Level.toLevel(reader.getAttributeValue("", "level"), Level.DEBUG);
- } else if ("message".equals(reader.getLocalName())) {
- //read the text once; after getElementText() the reader has moved
- //past the element, so it cannot be re-read inside the catch block
- String msg = reader.getElementText();
- try {
- handler.handleMsg(level, msg);
- } catch (IOException e) {
- LOG.warn("Error handling message: {}", msg, e);
- } catch (SQLException e) {
- LOG.warn("SQLException: {}", e.getMessage());
- }
- }
- break;
- case XMLStreamConstants.END_ELEMENT :
- if ("event".equals(reader.getLocalName())) {
- level = null;
- } else if ("message".equals(reader.getLocalName())) {
- //do we care any more?
- }
- break;
- }
- }
- }
-
-
-
- class LogXMLWrappingInputStream extends InputStream {
- //plagiarized from log4j's chainsaw
- private final static String HEADER =
- "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>"
- + "<log4j:eventSet version=\"1.2\" "
- + "xmlns:log4j=\"http://jakarta.apache.org/log4j/\">";
- private static final String FOOTER = "</log4j:eventSet>";
-
- private InputStream[] streams;
- int currentStreamIndex = 0;
-
- private LogXMLWrappingInputStream(InputStream xmlLogFileIs){
- streams = new InputStream[3];
- streams[0] = new ByteArrayInputStream(HEADER.getBytes(StandardCharsets.UTF_8));
- streams[1] = xmlLogFileIs;
- streams[2] = new ByteArrayInputStream(FOOTER.getBytes(StandardCharsets.UTF_8));
-
- }
-
- @Override
- public int read() throws IOException {
- int c = streams[currentStreamIndex].read();
- if (c < 0) {
- IOUtils.closeQuietly(streams[currentStreamIndex]);
- while (currentStreamIndex < streams.length-1) {
- currentStreamIndex++;
- int tmpC = streams[currentStreamIndex].read();
- if (tmpC < 0) {
- IOUtils.closeQuietly(streams[currentStreamIndex]);
- } else {
- return tmpC;
- }
- }
- return -1;
- }
- return c;
- }
- }
-}
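LogXMLWrappingInputStream is hand-rolled stream concatenation: header bytes, then the raw log file, then footer bytes, so that a log4j 1.2 event fragment parses as one well-formed document. The JDK's java.io.SequenceInputStream composes streams the same way; a minimal sketch with an illustrative event fragment:

    import java.io.ByteArrayInputStream;
    import java.io.IOException;
    import java.io.InputStream;
    import java.io.SequenceInputStream;
    import java.nio.charset.StandardCharsets;

    public class WrapDemo {
        public static void main(String[] args) throws IOException {
            String header = "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>"
                    + "<log4j:eventSet version=\"1.2\" "
                    + "xmlns:log4j=\"http://jakarta.apache.org/log4j/\">";
            String footer = "</log4j:eventSet>";
            InputStream body = new ByteArrayInputStream(
                    "<log4j:event level=\"WARN\"><log4j:message>oops</log4j:message></log4j:event>"
                            .getBytes(StandardCharsets.UTF_8));
            //concatenate header + body + footer into one logical stream
            InputStream wrapped = new SequenceInputStream(
                    new ByteArrayInputStream(header.getBytes(StandardCharsets.UTF_8)),
                    new SequenceInputStream(body,
                            new ByteArrayInputStream(footer.getBytes(StandardCharsets.UTF_8))));
            int c;
            StringBuilder sb = new StringBuilder();
            while ((c = wrapped.read()) != -1) {
                sb.append((char) c); //demo content is ASCII, so byte==char is safe here
            }
            System.out.println(sb);  //a single well-formed XML document
        }
    }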
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/reports/Report.java b/tika-eval/src/main/java/org/apache/tika/eval/reports/Report.java
deleted file mode 100644
index 41a1424..0000000
--- a/tika-eval/src/main/java/org/apache/tika/eval/reports/Report.java
+++ /dev/null
@@ -1,198 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.eval.reports;
-
-import java.io.IOException;
-import java.io.OutputStream;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.sql.Connection;
-import java.sql.ResultSet;
-import java.sql.ResultSetMetaData;
-import java.sql.SQLException;
-import java.sql.Statement;
-import java.sql.Types;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.Map;
-import java.util.Set;
-
-import org.apache.poi.ss.usermodel.Cell;
-import org.apache.poi.ss.usermodel.CellStyle;
-import org.apache.poi.ss.usermodel.Row;
-import org.apache.poi.ss.usermodel.VerticalAlignment;
-import org.apache.poi.xssf.streaming.SXSSFSheet;
-import org.apache.poi.xssf.streaming.SXSSFWorkbook;
-import org.apache.poi.xssf.usermodel.XSSFWorkbook;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-/**
- * This class represents a single report.
- */
-public class Report {
- private static final Logger LOG = LoggerFactory.getLogger(Report.class);
-
- final String NULL_VALUE = "";//TODO: make this configurable!!!
- Map<String, XSLXCellFormatter> cellFormatters = new HashMap<>();
- private XLSXNumFormatter defaultDoubleFormatter = new XLSXNumFormatter("0.000");
- private XLSXNumFormatter defaultIntegerFormatter = new XLSXNumFormatter("0");
- private CellStyle sqlCellStyle;
-
- String sql;
- String reportFilename;
- boolean includeSql = true;
-
- String reportName;
-
- public void writeReport(Connection c, Path reportsRoot) throws SQLException, IOException {
- LOG.info("Writing report: {} to {}", reportName, reportFilename);
- dumpXLSX(c, reportsRoot);
- }
-
- private void dumpXLSX(Connection c, Path reportsRoot) throws IOException, SQLException {
- Statement st = c.createStatement();
- Path out = reportsRoot.resolve(reportFilename);
- Files.createDirectories(out.getParent());
-
- SXSSFWorkbook wb = new SXSSFWorkbook(new XSSFWorkbook(), 100, true, true);
- wb.setCompressTempFiles(true);
- defaultIntegerFormatter.reset(wb.getXSSFWorkbook());
- defaultDoubleFormatter.reset(wb.getXSSFWorkbook());
- sqlCellStyle = wb.createCellStyle();
- sqlCellStyle.setVerticalAlignment(VerticalAlignment.TOP);
- sqlCellStyle.setWrapText(true);
-
-
- try {
- dumpReportToWorkbook(st, wb);
- } finally {
- try (OutputStream os = Files.newOutputStream(out)) {
- wb.write(os);
- } finally {
- wb.dispose();
- }
- }
- }
-
- private void dumpReportToWorkbook(Statement st, SXSSFWorkbook wb) throws IOException, SQLException {
- ResultSet rs = st.executeQuery(sql);
-
- SXSSFSheet sheet = wb.createSheet("tika-eval Report");
- sheet.trackColumnForAutoSizing(0);
-
- int rowCount = 0;
- ResultSetMetaData meta = rs.getMetaData();
- Set<String> colNames = new HashSet<>();
-
- Row xssfRow = sheet.createRow(rowCount++);
- //write headers and cache them to check against styles
- for (int i = 1; i <= meta.getColumnCount(); i++) {
- Cell cell = xssfRow.createCell(i-1);
- cell.setCellValue(meta.getColumnLabel(i));
- colNames.add(meta.getColumnLabel(i));
- }
-
- while (rs.next()) {
- xssfRow = sheet.createRow(rowCount++);
- for (int i = 1; i <= meta.getColumnCount(); i++) {
- Cell cell = xssfRow.createCell(i-1);
- XSLXCellFormatter formatter = cellFormatters.get(meta.getColumnLabel(i));
- if (formatter == null) {
- formatter = getDefaultFormatter(meta.getColumnType(i));
- }
- if (formatter != null) {
- formatter.applyStyleAndValue(i, rs, cell);
- } else {
- writeCell(meta, i, rs, cell);
- }
- }
- }
- sheet.autoSizeColumn(0);
-
- if (!includeSql) {
- return;
- }
-
- SXSSFSheet sqlSheet = wb.createSheet("tika-eval SQL");
- sqlSheet.setColumnWidth(0, 100*250);
- Row sqlRow = sqlSheet.createRow(0);
- short height = 5000;
- sqlRow.setHeight(height);
- Cell cell = sqlRow.createCell(0);
- cell.setCellStyle(sqlCellStyle);
-
- cell.setCellValue(sql.trim());//.replaceAll("[\r\n]+", "\r\n"));
- }
-
- private XSLXCellFormatter getDefaultFormatter(int columnType) {
- switch (columnType) {
- case Types.INTEGER :
- return defaultIntegerFormatter;
- case Types.DOUBLE:
- case Types.FLOAT:
- case Types.DECIMAL:
- return defaultDoubleFormatter;
- default:
- return null;
- }
- }
-
- private void writeCell(ResultSetMetaData meta, int colIndex, ResultSet rs,
- Cell cell) throws SQLException {
-
- switch(meta.getColumnType(colIndex)) {
- //fall through on numerics
- case Types.BIGINT:
- case Types.SMALLINT:
- case Types.INTEGER:
- case Types.DOUBLE:
- case Types.FLOAT:
- case Types.DECIMAL:
- case Types.REAL:
- case Types.NUMERIC:
- double dbl = rs.getDouble(colIndex);
- if (rs.wasNull()) {
- cell.setCellValue(NULL_VALUE);
- } else {
- cell.setCellValue(dbl);
- }
- break;
- //fall through strings
- case Types.BOOLEAN:
- case Types.CHAR:
- case Types.VARCHAR:
- case Types.LONGNVARCHAR:
- String val = rs.getString(colIndex);
- if (rs.wasNull()) {
- cell.setCellValue(NULL_VALUE);
- } else {
- cell.setCellValue(val);
- }
- break;
- default:
- if (rs.wasNull()) {
- cell.setCellValue(NULL_VALUE);
- } else {
- cell.setCellValue(rs.getString(colIndex));
- }
- LOG.warn("Couldn't find type for: {}. Defaulting to String", meta.getColumnType(colIndex));
- }
- }
-
-}
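dumpXLSX() leans on POI's streaming SXSSFWorkbook with a 100-row window, so large result sets are flushed to compressed temp files instead of being held in memory, and dispose() cleans those temp files up afterwards. A minimal sketch of that lifecycle, with an illustrative sheet and file name:

    import java.io.FileOutputStream;
    import java.io.IOException;
    import java.io.OutputStream;

    import org.apache.poi.ss.usermodel.Row;
    import org.apache.poi.xssf.streaming.SXSSFSheet;
    import org.apache.poi.xssf.streaming.SXSSFWorkbook;
    import org.apache.poi.xssf.usermodel.XSSFWorkbook;

    public class StreamingXlsxDemo {
        public static void main(String[] args) throws IOException {
            //keep at most 100 rows in memory; older rows spill to disk
            SXSSFWorkbook wb = new SXSSFWorkbook(new XSSFWorkbook(), 100, true, true);
            wb.setCompressTempFiles(true);
            try {
                SXSSFSheet sheet = wb.createSheet("demo");
                for (int r = 0; r < 100_000; r++) {
                    Row row = sheet.createRow(r);
                    row.createCell(0).setCellValue(r);
                }
                try (OutputStream os = new FileOutputStream("demo.xlsx")) {
                    wb.write(os);
                }
            } finally {
                wb.dispose(); //delete the temporary backing files
            }
        }
    }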
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/reports/ResultsReporter.java b/tika-eval/src/main/java/org/apache/tika/eval/reports/ResultsReporter.java
deleted file mode 100644
index 3ede4e6..0000000
--- a/tika-eval/src/main/java/org/apache/tika/eval/reports/ResultsReporter.java
+++ /dev/null
@@ -1,313 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.eval.reports;
-
-
-import javax.xml.parsers.DocumentBuilder;
-import java.io.IOException;
-import java.io.InputStream;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.nio.file.Paths;
-import java.nio.file.StandardCopyOption;
-import java.sql.Connection;
-import java.sql.DatabaseMetaData;
-import java.sql.ResultSet;
-import java.sql.SQLException;
-import java.sql.Statement;
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-
-import org.apache.commons.cli.CommandLine;
-import org.apache.commons.cli.DefaultParser;
-import org.apache.commons.cli.HelpFormatter;
-import org.apache.commons.cli.Options;
-import org.apache.commons.cli.ParseException;
-import org.apache.poi.common.usermodel.HyperlinkType;
-import org.apache.tika.eval.ExtractComparer;
-import org.apache.tika.eval.ExtractProfiler;
-import org.apache.tika.eval.db.H2Util;
-import org.apache.tika.eval.db.JDBCUtil;
-import org.apache.tika.utils.XMLReaderUtils;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-import org.w3c.dom.Document;
-import org.w3c.dom.NamedNodeMap;
-import org.w3c.dom.Node;
-import org.w3c.dom.NodeList;
-
-public class ResultsReporter {
- private static final Logger LOG = LoggerFactory.getLogger(ResultsReporter.class);
-
- private static Options OPTIONS;
-
- static {
- OPTIONS = new Options();
- OPTIONS.addOption("rd", "reportsDir", true, "directory for the reports. " +
- "If not specified, will write to 'reports'" +
- "BEWARE: Will overwrite existing reports without warning!")
- .addOption("rf", "reportsFile", true, "xml specifying sql to call for the reports." +
- "If not specified, will use default reports in resources/tika-eval-*-config.xml")
- .addOption("db", true, "default database (in memory H2). Specify a file name for the H2 database.")
- .addOption("jdbc", true, "EXPERT: full jdbc connection string. Specify this or use -db <h2db_name>")
- .addOption("jdbcdriver", true, "EXPERT: specify the jdbc driver class if all else fails")
- .addOption("tablePrefix", true, "EXPERT: if not using the default tables, specify your table name prefix");
-
- }
-
- public static void USAGE() {
- HelpFormatter helpFormatter = new HelpFormatter();
- helpFormatter.printHelp(
- 80,
- "java -jar tika-eval-x.y.jar Report -db mydb [-rd myreports] [-rf myreports.xml]",
- "Tool: Report",
- ResultsReporter.OPTIONS,
- "Note: for h2 db, do not include the .mv.db at the end of the db name.");
-
- }
-
-
- List<String> before = new ArrayList<>();
- List<String> after = new ArrayList<>();
- List<Report> reports = new ArrayList<>();
-
-
- private void addBefore(String b) {
- before.add(b);
- }
-
- private void addAfter(String a) {
- after.add(a);
- }
-
- private void addReport(Report r) {
- reports.add(r);
- }
-
- public static ResultsReporter build(Path p) throws Exception {
-
- ResultsReporter r = new ResultsReporter();
-
- DocumentBuilder docBuilder = XMLReaderUtils.getDocumentBuilder();
- Document doc;
- try (InputStream is = Files.newInputStream(p)) {
- doc = docBuilder.parse(is);
- }
- Node docElement = doc.getDocumentElement();
- assert (docElement.getNodeName().equals("reports"));
- NodeList children = docElement.getChildNodes();
- for (int i = 0; i < children.getLength(); i++) {
- Node n = children.item(i);
- if ("before".equals(n.getNodeName())) {
- for (String before : getSql(n)) {
- r.addBefore(before);
- }
- } else if ("after".equals(n.getNodeName())) {
- for (String after : getSql(n)) {
- r.addAfter(after);
- }
- } else if ("report".equals(n.getNodeName())) {
- Report report = buildReport(n);
- r.addReport(report);
- }
- }
-
- return r;
- }
-
- private static Report buildReport(Node n) {
- NodeList children = n.getChildNodes();
- Report r = new Report();
- NamedNodeMap attrs = n.getAttributes();
-
- r.includeSql = Boolean.parseBoolean(attrs.getNamedItem("includeSql").getNodeValue());
- r.reportFilename = attrs.getNamedItem("reportFilename").getNodeValue();
- r.reportName = attrs.getNamedItem("reportName").getNodeValue();
-
- for (int i = 0; i < children.getLength(); i++) {
- Node child = children.item(i);
- if (child.getNodeType() != Node.ELEMENT_NODE) {
- continue;
- }
- if ("sql".equals(child.getNodeName())) {
- if (r.sql != null) {
- throw new IllegalArgumentException("Can only have one sql statement per report");
- }
- r.sql = child.getTextContent();
- } else if ("colformats".equals(child.getNodeName())) {
- r.cellFormatters = getCellFormatters(child);
- } else {
- throw new IllegalArgumentException("Not expecting to see:" + child.getNodeName());
- }
- }
- return r;
- }
-
- private static Map<String, XSLXCellFormatter> getCellFormatters(Node n) {
- NodeList children = n.getChildNodes();
- Map<String, XSLXCellFormatter> ret = new HashMap<>();
- for (int i = 0; i < children.getLength(); i++) {
- Node child = children.item(i);
- if (child.getNodeType() != Node.ELEMENT_NODE) {
- continue;
- }
- NamedNodeMap attrs = child.getAttributes();
- String columnName = attrs.getNamedItem("name").getNodeValue();
- assert (!ret.containsKey(columnName));
- String type = attrs.getNamedItem("type").getNodeValue();
- if ("numberFormatter".equals(type)) {
- String format = attrs.getNamedItem("format").getNodeValue();
- XSLXCellFormatter f = new XLSXNumFormatter(format);
- ret.put(columnName, f);
- } else if ("urlLink".equals(type)) {
- String base = "";
- Node baseNode = attrs.getNamedItem("base");
- if (baseNode != null) {
- base = baseNode.getNodeValue();
- }
- XLSXHREFFormatter f = new XLSXHREFFormatter(base, HyperlinkType.URL);
- ret.put(columnName, f);
- } else if ("fileLink".equals(type)) {
- String base = "";
- Node baseNode = attrs.getNamedItem("base");
- if (baseNode != null) {
- base = baseNode.getNodeValue();
- }
- XLSXHREFFormatter f = new XLSXHREFFormatter(base, HyperlinkType.FILE);
- ret.put(columnName, f);
- }
- }
- return ret;
- }
-
- private static List<String> getSql(Node n) {
- List<String> ret = new ArrayList<>();
-
- NodeList children = n.getChildNodes();
-
- for (int i = 0; i < children.getLength(); i++) {
- Node child = children.item(i);
- if (child.getNodeType() != Node.ELEMENT_NODE) {
- continue;
- }
- ret.add(child.getTextContent());
- }
- return ret;
- }
-
- public static void main(String[] args) throws Exception {
-
- DefaultParser defaultCLIParser = new DefaultParser();
- CommandLine commandLine = null;
- try {
- commandLine = defaultCLIParser.parse(OPTIONS, args);
- } catch (ParseException e) {
- System.out.println(e.getMessage());
- USAGE();
- return;
- }
- JDBCUtil dbUtil = null;
- if (commandLine.hasOption("db")) {
- String dbString = commandLine.getOptionValue("db");
- if (dbString.endsWith(".mv.db")) {
- dbString = dbString.substring(0, dbString.length()-6);
- LOG.debug("trimming .mv.db from db name");
- }
- Path db = Paths.get(dbString);
- if (!H2Util.databaseExists(db)) {
- throw new RuntimeException("I'm sorry, but I couldn't find this h2 database: " + db);
- }
- dbUtil = new H2Util(db);
- } else if (commandLine.hasOption("jdbc")) {
- String driverClass = null;
- if (commandLine.hasOption("jdbcdriver")) {
- driverClass = commandLine.getOptionValue("jdbcdriver");
- }
- dbUtil = new JDBCUtil(commandLine.getOptionValue("jdbc"), driverClass);
- } else {
- System.err.println("Must specify either -db for the default in-memory h2 database\n" +
- "or -jdbc for a full jdbc connection string");
- USAGE();
- return;
- }
- try (Connection c = dbUtil.getConnection()) {
- Path tmpReportsFile = null;
- try {
- ResultsReporter resultsReporter = null;
- String reportsFile = commandLine.getOptionValue("rf");
- if (reportsFile == null) {
- tmpReportsFile = getDefaultReportsConfig(c);
- resultsReporter = ResultsReporter.build(tmpReportsFile);
- } else {
- resultsReporter = ResultsReporter.build(Paths.get(reportsFile));
- }
-
- Path reportsRootDirectory = Paths.get(commandLine.getOptionValue("rd", "reports"));
- if (Files.isDirectory(reportsRootDirectory)) {
- LOG.warn("'Reports' directory exists. Will overwrite existing reports.");
- }
-
- resultsReporter.execute(c, reportsRootDirectory);
- } finally {
- if (tmpReportsFile != null) {
- Files.delete(tmpReportsFile);
- }
- }
- }
- }
-
- private static Path getDefaultReportsConfig(Connection c) throws IOException, SQLException {
- DatabaseMetaData md = c.getMetaData();
- String internalPath = null;
- try (ResultSet rs = md.getTables(null, null, "%", null)) {
- while (rs.next()) {
- String tName = rs.getString(3);
- if (ExtractComparer.CONTENTS_TABLE_B.getName().equalsIgnoreCase(tName)) {
- internalPath = "/comparison-reports.xml";
- break;
- } else if (ExtractProfiler.PROFILE_TABLE.getName().equalsIgnoreCase(tName)) {
- internalPath = "/profile-reports.xml";
- break;
- }
- }
- }
-
- if (internalPath == null) {
- throw new RuntimeException("Couldn't determine if this database was a 'profiler' or 'comparison' db");
- }
- Path tmp = Files.createTempFile("tmp-tika-reports", ".xml");
- Files.copy(ResultsReporter.class.getResourceAsStream(internalPath), tmp, StandardCopyOption.REPLACE_EXISTING);
- return tmp;
- }
-
- public void execute(Connection c, Path reportsDirectory) throws IOException, SQLException {
- try (Statement st = c.createStatement()) {
- for (String sql : before) {
- LOG.info("processing before: {}", sql);
- st.execute(sql);
- }
- for (Report r : reports) {
- r.writeReport(c, reportsDirectory);
- }
- for (String sql : after) {
- LOG.info("processing after: {}", sql);
- st.execute(sql);
- }
- }
- }
-}
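build(Path) expects a root <reports> element containing optional <before>/<after> blocks of <sql> children, plus one or more <report> elements carrying reportName, reportFilename and includeSql attributes. A hedged, self-contained sketch of the shape it parses; the SQL, table and column names here are illustrative only:

    import javax.xml.parsers.DocumentBuilderFactory;
    import java.io.ByteArrayInputStream;
    import java.nio.charset.StandardCharsets;

    import org.w3c.dom.Document;
    import org.w3c.dom.Element;
    import org.w3c.dom.NodeList;

    public class ReportConfigDemo {
        public static void main(String[] args) throws Exception {
            String xml =
                    "<reports>"
                    + "<before><sql>DROP TABLE IF EXISTS scratch</sql></before>"
                    + "<report reportName='mimes' reportFilename='mimes.xlsx' includeSql='true'>"
                    + "<sql>SELECT mime, COUNT(1) cnt FROM profiles GROUP BY mime</sql>"
                    + "</report>"
                    + "</reports>";
            Document doc = DocumentBuilderFactory.newInstance().newDocumentBuilder()
                    .parse(new ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)));
            NodeList reports = doc.getElementsByTagName("report");
            for (int i = 0; i < reports.getLength(); i++) {
                Element r = (Element) reports.item(i);
                System.out.println(r.getAttribute("reportName")
                        + " -> " + r.getAttribute("reportFilename"));
            }
        }
    }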
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/reports/XLSXHREFFormatter.java b/tika-eval/src/main/java/org/apache/tika/eval/reports/XLSXHREFFormatter.java
deleted file mode 100644
index 11b72f4..0000000
--- a/tika-eval/src/main/java/org/apache/tika/eval/reports/XLSXHREFFormatter.java
+++ /dev/null
@@ -1,80 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.eval.reports;
-
-import java.nio.file.Paths;
-import java.sql.ResultSet;
-import java.sql.SQLException;
-
-import org.apache.poi.common.usermodel.HyperlinkType;
-import org.apache.poi.ss.usermodel.Cell;
-import org.apache.poi.ss.usermodel.CellStyle;
-import org.apache.poi.ss.usermodel.Font;
-import org.apache.poi.ss.usermodel.Hyperlink;
-import org.apache.poi.ss.usermodel.IndexedColors;
-import org.apache.poi.xssf.usermodel.XSSFWorkbook;
-
-
-
-public class XLSXHREFFormatter implements XSLXCellFormatter {
- //xlsx files can only have this many hyperlinks
- //if they have more Excel can't read the file
- private static final int MAX_HYPERLINKS = 65000;
-
-
- private final String urlBase;
- private final HyperlinkType linkType;
- private XSSFWorkbook workbook;
- private CellStyle style;
- private int links = 0;
-
- public XLSXHREFFormatter(String urlBase,
- HyperlinkType hyperlinkType) {
- this.urlBase = urlBase;
- this.linkType = hyperlinkType;
- }
-
- @Override
- public void reset(XSSFWorkbook workbook) {
- this.workbook = workbook;
- style = workbook.createCellStyle();
- Font hlinkFont = workbook.createFont();
- hlinkFont.setUnderline(Font.U_SINGLE);
- hlinkFont.setColor(IndexedColors.BLUE.getIndex());
- style.setFont(hlinkFont);
- links = 0;
-
- }
-
- @Override
- public void applyStyleAndValue(int dbColNum, ResultSet resultSet, Cell cell) throws SQLException {
- if (links < MAX_HYPERLINKS) {
- Hyperlink hyperlink = workbook.getCreationHelper().createHyperlink(linkType);
- String path = resultSet.getString(dbColNum);
- String address = urlBase+path;
- hyperlink.setAddress(address);
- cell.setHyperlink(hyperlink);
- cell.setCellStyle(style);
- String fName = Paths.get(path).getFileName().toString();
- cell.setCellValue(fName);
- links++;
- } else {
- //silently stop adding hyperlinks
- }
- }
-}
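The cap exists because Excel can't read .xlsx files with more than roughly 65k hyperlinks, so the formatter silently degrades to plain text after MAX_HYPERLINKS cells. For reference, a minimal standalone version of the same POI hyperlink-cell recipe:

    import java.io.FileOutputStream;
    import java.io.IOException;

    import org.apache.poi.common.usermodel.HyperlinkType;
    import org.apache.poi.ss.usermodel.*;
    import org.apache.poi.xssf.usermodel.XSSFWorkbook;

    public class HyperlinkDemo {
        public static void main(String[] args) throws IOException {
            try (XSSFWorkbook wb = new XSSFWorkbook()) {
                Sheet sheet = wb.createSheet("links");
                //blue underlined font, as reset() sets up above
                CellStyle style = wb.createCellStyle();
                Font f = wb.createFont();
                f.setUnderline(Font.U_SINGLE);
                f.setColor(IndexedColors.BLUE.getIndex());
                style.setFont(f);

                Cell cell = sheet.createRow(0).createCell(0);
                Hyperlink link = wb.getCreationHelper().createHyperlink(HyperlinkType.URL);
                link.setAddress("https://tika.apache.org/");
                cell.setHyperlink(link);
                cell.setCellStyle(style);
                cell.setCellValue("tika.apache.org");

                try (FileOutputStream os = new FileOutputStream("links.xlsx")) {
                    wb.write(os);
                }
            }
        }
    }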
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/reports/XLSXNumFormatter.java b/tika-eval/src/main/java/org/apache/tika/eval/reports/XLSXNumFormatter.java
deleted file mode 100644
index b2ba5b0..0000000
--- a/tika-eval/src/main/java/org/apache/tika/eval/reports/XLSXNumFormatter.java
+++ /dev/null
@@ -1,54 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.tika.eval.reports;
-
-import java.sql.ResultSet;
-import java.sql.SQLException;
-
-import org.apache.poi.ss.usermodel.Cell;
-import org.apache.poi.ss.usermodel.CellStyle;
-import org.apache.poi.xssf.usermodel.XSSFWorkbook;
-
-class XLSXNumFormatter implements XSLXCellFormatter {
-
- private final String formatString;
- private CellStyle style;
-
- XLSXNumFormatter(String formatString) {
- this.formatString = formatString;
- }
-
-
- @Override
- public void reset(XSSFWorkbook workbook) {
- style = workbook.createCellStyle();
- style.setDataFormat(workbook.getCreationHelper()
- .createDataFormat().getFormat(formatString));
- }
-
- @Override
- public void applyStyleAndValue(int dbColNum, ResultSet resultSet, Cell cell) throws SQLException {
- double d = resultSet.getDouble(dbColNum);
- //apply the numeric style only to non-null values;
- //a SQL NULL still writes as 0.0, as in the original two-read version
- if (!resultSet.wasNull()) {
- cell.setCellStyle(style);
- }
- cell.setCellValue(d);
- }
-}
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/reports/XSLXCellFormatter.java b/tika-eval/src/main/java/org/apache/tika/eval/reports/XSLXCellFormatter.java
deleted file mode 100644
index eb0e024..0000000
--- a/tika-eval/src/main/java/org/apache/tika/eval/reports/XSLXCellFormatter.java
+++ /dev/null
@@ -1,30 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.eval.reports;
-
-
-import java.sql.ResultSet;
-import java.sql.SQLException;
-
-import org.apache.poi.ss.usermodel.Cell;
-import org.apache.poi.xssf.usermodel.XSSFWorkbook;
-
-interface XSLXCellFormatter {
-
- public void reset(XSSFWorkbook workbook);
- public void applyStyleAndValue(int dbColNum, ResultSet resultSet, Cell cell) throws SQLException;
-}
\ No newline at end of file
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/tools/BatchTopCommonTokenCounter.java b/tika-eval/src/main/java/org/apache/tika/eval/tools/BatchTopCommonTokenCounter.java
deleted file mode 100644
index 70d828e..0000000
--- a/tika-eval/src/main/java/org/apache/tika/eval/tools/BatchTopCommonTokenCounter.java
+++ /dev/null
@@ -1,54 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.eval.tools;
-
-import java.nio.file.Path;
-import java.nio.file.Paths;
-import java.util.List;
-import java.util.Map;
-
-import org.apache.tika.utils.ProcessUtils;
-
-/**
- * Utility class that runs TopCommonTokenCounter against a directory
- * of table files (named {lang}_table.gz or Leipzig-style afr_...-sentences.txt)
- * and writes a common-tokens file per language to the output directory.
- */
-public class BatchTopCommonTokenCounter {
-
- public static void main(String[] args) throws Exception {
-
- Path commonTokensDir = Paths.get(args[0]);
- Path tableFileDir = Paths.get(args[1]);
- Map<String, List<Path>> langFiles = LeipzigHelper.getFiles(tableFileDir);
-
- for (Map.Entry<String, List<Path>> e : langFiles.entrySet()) {
-
- String[] cmd = new String[e.getValue().size()+1];
- Path commonTokensFile = commonTokensDir.resolve(e.getKey());
- cmd[0] = ProcessUtils.escapeCommandLine(commonTokensFile.toAbsolutePath().toString());
- for (int i = 0; i < e.getValue().size(); i++) {
- cmd[i+1] =
- ProcessUtils.escapeCommandLine(
- e.getValue().get(i).toAbsolutePath().toString());
- }
- TopCommonTokenCounter.main(
- cmd
- );
- }
- }
-}
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/tools/CommonTokenOverlapCounter.java b/tika-eval/src/main/java/org/apache/tika/eval/tools/CommonTokenOverlapCounter.java
deleted file mode 100644
index 3bd8198..0000000
--- a/tika-eval/src/main/java/org/apache/tika/eval/tools/CommonTokenOverlapCounter.java
+++ /dev/null
@@ -1,69 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.eval.tools;
-
-import java.io.File;
-import java.io.IOException;
-import java.nio.file.Path;
-import java.nio.file.Paths;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.Locale;
-import java.util.Set;
-
-import org.apache.tika.eval.tokens.CommonTokenCountManager;
-
-public class CommonTokenOverlapCounter {
-
- public static void main(String[] args) throws Exception {
- Path commonTokensDir = Paths.get(args[0]);
- CommonTokenOverlapCounter counter = new CommonTokenOverlapCounter();
- counter.execute(commonTokensDir);
- }
-
- private void execute(Path commonTokensDir) throws IOException {
- List<String> langs = new ArrayList<>();
- for (File f : commonTokensDir.toFile().listFiles()) {
- langs.add(f.getName());
- }
- CommonTokenCountManager mgr = new CommonTokenCountManager(commonTokensDir, "");
- for (int i = 0; i < langs.size()-1; i++) {
- for (int j = i+1; j < langs.size(); j++) {
- compare(langs.get(i), langs.get(j), mgr);
- }
- }
- }
-
- private void compare(String langA, String langB, CommonTokenCountManager mgr) {
- int overlap = 0;
- int denom = 0;
- Set<String> setA = mgr.getTokens(langA);
- Set<String> setB = mgr.getTokens(langB);
- for (String a : setA) {
- if (setB.contains(a)) {
- overlap += 2;
- }
- }
- denom = setA.size()+setB.size();
- double percent = (double)overlap/(double)denom;
- if (percent > 0.01) {
- System.out.println(String.format(Locale.US, "%s %s %.2f", langA, langB, percent));
- }
- }
-
-
-}
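compare() is computing the Dice coefficient, 2|A∩B| / (|A| + |B|): each shared token adds 2 to the numerator, and the denominator is the sum of the two vocabulary sizes. A tiny worked check with made-up token sets:

    import java.util.Arrays;
    import java.util.HashSet;
    import java.util.Set;

    public class DiceDemo {

        static double dice(Set<String> a, Set<String> b) {
            int shared = 0;
            for (String t : a) {
                if (b.contains(t)) {
                    shared++;
                }
            }
            //Dice coefficient: 2|A∩B| / (|A| + |B|)
            return 2.0 * shared / (a.size() + b.size());
        }

        public static void main(String[] args) {
            Set<String> a = new HashSet<>(Arrays.asList("the", "of", "and", "een"));
            Set<String> b = new HashSet<>(Arrays.asList("de", "het", "van", "een"));
            System.out.println(dice(a, b)); //1 shared token of 8 total: 0.25
        }
    }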
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/tools/LeipzigHelper.java b/tika-eval/src/main/java/org/apache/tika/eval/tools/LeipzigHelper.java
deleted file mode 100644
index daddd25..0000000
--- a/tika-eval/src/main/java/org/apache/tika/eval/tools/LeipzigHelper.java
+++ /dev/null
@@ -1,57 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.eval.tools;
-
-import java.io.File;
-import java.io.IOException;
-import java.nio.file.Path;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.Map;
-import java.util.TreeMap;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
-public class LeipzigHelper {
-
- static Map<String, List<Path>> getFiles(Path leipzigDir) throws IOException {
- Matcher tableMatcher = Pattern.compile("([a-z]+)_table(\\.txt)?(\\.gz)?$").matcher("");
- Matcher leipzigMatcher = Pattern.compile("([a-z]{3,3})[-_].*$").matcher("");
-
- Map<String, List<Path>> m = new TreeMap<>();
- for (File f : leipzigDir.toFile().listFiles()) {
- System.err.println(f);
- String lang = null;
- if (tableMatcher.reset(f.getName()).find()) {
- lang = tableMatcher.group(1);
- } else if (leipzigMatcher.reset(f.getName()).find()) {
- lang = leipzigMatcher.group(1);
- }
- if (lang == null) {
- System.err.println("couldn't find a lang: " + f);
- continue;
- }
- List<Path> files = m.get(lang);
- if (files == null) {
- files = new ArrayList<>();
- }
- files.add(f.toPath());
- m.put(lang, files);
- }
- return m;
- }
-}
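getFiles() buckets files by language using two filename conventions: {lang}_table(.txt)(.gz), and Leipzig-style names whose first three lowercase letters are the language code. A quick standalone check with equivalent matchers ({3} is the same as the original {3,3}):

    import java.util.regex.Matcher;
    import java.util.regex.Pattern;

    public class LangFileDemo {
        public static void main(String[] args) {
            Matcher table = Pattern.compile("([a-z]+)_table(\\.txt)?(\\.gz)?$").matcher("");
            Matcher leipzig = Pattern.compile("([a-z]{3})[-_].*$").matcher("");
            String[] names = {"eng_table.gz", "afr_mixed_2011-sentences.txt", "README"};
            for (String name : names) {
                String lang = null;
                if (table.reset(name).find()) {
                    lang = table.group(1);
                } else if (leipzig.reset(name).find()) {
                    lang = leipzig.group(1);
                }
                System.out.println(name + " -> " + lang); //eng, afr, null
            }
        }
    }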
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/tools/LeipzigSampler.java b/tika-eval/src/main/java/org/apache/tika/eval/tools/LeipzigSampler.java
deleted file mode 100644
index 5250505..0000000
--- a/tika-eval/src/main/java/org/apache/tika/eval/tools/LeipzigSampler.java
+++ /dev/null
@@ -1,76 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.eval.tools;
-
-import java.io.BufferedReader;
-import java.io.BufferedWriter;
-import java.io.IOException;
-import java.nio.charset.StandardCharsets;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.nio.file.Paths;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.List;
-import java.util.Map;
-
-public class LeipzigSampler {
-
- public static void main(String[] args) throws Exception {
- int sentsPerLanguage = 10;
- Path leipzigDir = Paths.get(args[0]);
- Path sampleFile = Paths.get(args[1]);
- LeipzigSampler leipzigSampler = new LeipzigSampler();
- try (BufferedWriter writer = Files.newBufferedWriter(sampleFile, StandardCharsets.UTF_8)) {
- leipzigSampler.execute(leipzigDir, sentsPerLanguage, writer);
- }
- }
-
- private void execute(Path leipzigDir, int sentsPerLang, BufferedWriter writer) throws IOException {
- Map<String, List<Path>> fileMap = LeipzigHelper.getFiles(leipzigDir);
- for (Map.Entry<String, List<Path>> e : fileMap.entrySet()) {
- List<String> sentences = new ArrayList<>();
- for (Path p : e.getValue()) {
- addSentences(p, sentences);
- }
- Collections.shuffle(sentences);
- String lang = e.getKey();
- //guard against languages with fewer sentences than requested
- for (int i = 0; i < Math.min(sentsPerLang, sentences.size()); i++) {
- writer.write(row(lang, sentences.get(i)));
- }
- }
- }
-
- private void addSentences(Path p, List<String> sentences) throws IOException {
- try (BufferedReader reader = Files.newBufferedReader(p, StandardCharsets.UTF_8)) {
- String line = reader.readLine();
- while (line != null) {
- int tab = line.indexOf("\t");
- if (tab > -1) {
- line = line.substring(tab + 1);
- }
- sentences.add(line);
- line = reader.readLine();
- }
- }
- }
-
- private String row(String lang, String s) {
- s = s.replaceAll("\\s+", " ");
- return lang+"\t"+s+"\n";
- }
-}
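
For context on the tab handling in addSentences above: Leipzig
"-sentences.txt" files hold one sentence per line as a numeric id, a
tab, then the sentence, so everything up to the first tab is dropped.
A minimal sketch of that step on a literal line:

    public class LeipzigLineDemo {
        public static void main(String[] args) {
            String line = "42\tThis is a sample sentence.";
            // keep only the text after the first tab, if present
            int tab = line.indexOf('\t');
            String sentence = tab > -1 ? line.substring(tab + 1) : line;
            System.out.println(sentence); // -> This is a sample sentence.
        }
    }
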
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/tools/SlowCompositeReaderWrapper.java b/tika-eval/src/main/java/org/apache/tika/eval/tools/SlowCompositeReaderWrapper.java
deleted file mode 100644
index 66283ff..0000000
--- a/tika-eval/src/main/java/org/apache/tika/eval/tools/SlowCompositeReaderWrapper.java
+++ /dev/null
@@ -1,327 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.eval.tools;
-
-import java.io.IOException;
-import java.util.HashMap;
-import java.util.Map;
-import java.util.concurrent.ConcurrentHashMap;
-
-
-import org.apache.lucene.index.BinaryDocValues;
-import org.apache.lucene.index.CompositeReader;
-import org.apache.lucene.index.DirectoryReader;
-import org.apache.lucene.index.DocValues;
-import org.apache.lucene.index.DocValuesType;
-import org.apache.lucene.index.FieldInfo;
-import org.apache.lucene.index.FieldInfos;
-import org.apache.lucene.index.Fields;
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.LeafMetaData;
-import org.apache.lucene.index.LeafReader;
-import org.apache.lucene.index.LeafReaderContext;
-import org.apache.lucene.index.MultiBits;
-import org.apache.lucene.index.MultiDocValues;
-import org.apache.lucene.index.MultiDocValues.MultiSortedDocValues;
-import org.apache.lucene.index.MultiReader;
-import org.apache.lucene.index.MultiTerms;
-import org.apache.lucene.index.NumericDocValues;
-import org.apache.lucene.index.OrdinalMap;
-import org.apache.lucene.index.PointValues;
-import org.apache.lucene.index.SortedDocValues;
-import org.apache.lucene.index.SortedNumericDocValues;
-import org.apache.lucene.index.SortedSetDocValues;
-import org.apache.lucene.index.StoredFieldVisitor;
-import org.apache.lucene.index.Terms;
-import org.apache.lucene.util.Bits;
-import org.apache.lucene.util.Version;
-
-/**
- * <b>COPIED VERBATIM FROM LUCENE</b>
- * This class forces a composite reader (eg a {@link
- * MultiReader} or {@link DirectoryReader}) to emulate a
- * {@link LeafReader}. This requires implementing the postings
- * APIs on-the-fly, using the static methods in {@link
- * MultiTerms}, {@link MultiDocValues}, by stepping through
- * the sub-readers to merge fields/terms, appending docs, etc.
- *
- * <p><b>NOTE</b>: this class almost always results in a
- * performance hit. If this is important to your use case,
- * you'll get better performance by gathering the sub readers using
- * {@link IndexReader#getContext()} to get the
- * leaves and then operate per-LeafReader,
- * instead of using this class.
- */
-
-public final class SlowCompositeReaderWrapper extends LeafReader {
-
- private final CompositeReader in;
- private final LeafMetaData metaData;
-
- // Cached copy of FieldInfos to prevent it from being re-created on each
- // getFieldInfos call. Most (if not all) other LeafReader implementations
- // also have a cached FieldInfos instance so this is consistent. SOLR-12878
- private final FieldInfos fieldInfos;
-
- final Map<String, Terms> cachedTerms = new ConcurrentHashMap<>();
-
- // TODO: consider ConcurrentHashMap ?
- // TODO: this could really be a weak map somewhere else on the coreCacheKey,
- // but do we really need to optimize slow-wrapper any more?
- final Map<String, OrdinalMap> cachedOrdMaps = new HashMap<>();
-
- /** This method is sugar for getting an {@link LeafReader} from
- * an {@link IndexReader} of any kind. If the reader is already atomic,
- * it is returned unchanged, otherwise wrapped by this class.
- */
- public static LeafReader wrap(IndexReader reader) throws IOException {
- if (reader instanceof CompositeReader) {
- return new SlowCompositeReaderWrapper((CompositeReader) reader);
- } else {
- assert reader instanceof LeafReader;
- return (LeafReader) reader;
- }
- }
-
- SlowCompositeReaderWrapper(CompositeReader reader) throws IOException {
- in = reader;
- in.registerParentReader(this);
- if (reader.leaves().isEmpty()) {
- metaData = new LeafMetaData(Version.LATEST.major, Version.LATEST, null);
- } else {
- Version minVersion = Version.LATEST;
- for (LeafReaderContext leafReaderContext : reader.leaves()) {
- Version leafVersion = leafReaderContext.reader().getMetaData().getMinVersion();
- if (leafVersion == null) {
- minVersion = null;
- break;
- } else if (minVersion.onOrAfter(leafVersion)) {
- minVersion = leafVersion;
- }
- }
- metaData = new LeafMetaData(reader.leaves().get(0).reader().getMetaData().getCreatedVersionMajor(), minVersion, null);
- }
- fieldInfos = FieldInfos.getMergedFieldInfos(in);
- }
-
- @Override
- public String toString() {
- return "SlowCompositeReaderWrapper(" + in + ")";
- }
-
- @Override
- public CacheHelper getReaderCacheHelper() {
- return in.getReaderCacheHelper();
- }
-
- @Override
- public CacheHelper getCoreCacheHelper() {
- // TODO: this is trappy as the expectation is that core keys live for a long
- // time, but here we need to bound it to the lifetime of the wrapped
- // composite reader? Unfortunately some features seem to rely on this...
- return in.getReaderCacheHelper();
- }
-
- @Override
- public Terms terms(String field) throws IOException {
- ensureOpen();
- try {
- return cachedTerms.computeIfAbsent(field, f -> {
- try {
- return MultiTerms.getTerms(in, f);
- } catch (IOException e) { // yuck! ...sigh... checked exceptions with built-in lambdas are a pain
- throw new RuntimeException("unwrapMe", e);
- }
- });
- } catch (RuntimeException e) {
- if (e.getMessage().equals("unwrapMe") && e.getCause() instanceof IOException) {
- throw (IOException) e.getCause();
- }
- throw e;
- }
- }
-
- @Override
- public NumericDocValues getNumericDocValues(String field) throws IOException {
- ensureOpen();
- return MultiDocValues.getNumericValues(in, field); // TODO cache?
- }
-
- @Override
- public BinaryDocValues getBinaryDocValues(String field) throws IOException {
- ensureOpen();
- return MultiDocValues.getBinaryValues(in, field); // TODO cache?
- }
-
- @Override
- public SortedNumericDocValues getSortedNumericDocValues(String field) throws IOException {
- ensureOpen();
- return MultiDocValues.getSortedNumericValues(in, field); // TODO cache?
- }
-
- @Override
- public SortedDocValues getSortedDocValues(String field) throws IOException {
- ensureOpen();
- OrdinalMap map = null;
- synchronized (cachedOrdMaps) {
- map = cachedOrdMaps.get(field);
- if (map == null) {
- // uncached, or not a multi dv
- SortedDocValues dv = MultiDocValues.getSortedValues(in, field);
- if (dv instanceof MultiSortedDocValues) {
- map = ((MultiSortedDocValues)dv).mapping;
- CacheHelper cacheHelper = getReaderCacheHelper();
- if (cacheHelper != null && map.owner == cacheHelper.getKey()) {
- cachedOrdMaps.put(field, map);
- }
- }
- return dv;
- }
- }
- int size = in.leaves().size();
- final SortedDocValues[] values = new SortedDocValues[size];
- final int[] starts = new int[size+1];
- long totalCost = 0;
- for (int i = 0; i < size; i++) {
- LeafReaderContext context = in.leaves().get(i);
- final LeafReader reader = context.reader();
- final FieldInfo fieldInfo = reader.getFieldInfos().fieldInfo(field);
- if (fieldInfo != null && fieldInfo.getDocValuesType() != DocValuesType.SORTED) {
- return null;
- }
- SortedDocValues v = reader.getSortedDocValues(field);
- if (v == null) {
- v = DocValues.emptySorted();
- }
- totalCost += v.cost();
- values[i] = v;
- starts[i] = context.docBase;
- }
- starts[size] = maxDoc();
- return new MultiSortedDocValues(values, starts, map, totalCost);
- }
-
- @Override
- public SortedSetDocValues getSortedSetDocValues(String field) throws IOException {
- ensureOpen();
- OrdinalMap map = null;
- synchronized (cachedOrdMaps) {
- map = cachedOrdMaps.get(field);
- if (map == null) {
- // uncached, or not a multi dv
- SortedSetDocValues dv = MultiDocValues.getSortedSetValues(in, field);
- if (dv instanceof MultiDocValues.MultiSortedSetDocValues) {
- map = ((MultiDocValues.MultiSortedSetDocValues)dv).mapping;
- CacheHelper cacheHelper = getReaderCacheHelper();
- if (cacheHelper != null && map.owner == cacheHelper.getKey()) {
- cachedOrdMaps.put(field, map);
- }
- }
- return dv;
- }
- }
-
- assert map != null;
- int size = in.leaves().size();
- final SortedSetDocValues[] values = new SortedSetDocValues[size];
- final int[] starts = new int[size+1];
- long cost = 0;
- for (int i = 0; i < size; i++) {
- LeafReaderContext context = in.leaves().get(i);
- final LeafReader reader = context.reader();
- final FieldInfo fieldInfo = reader.getFieldInfos().fieldInfo(field);
- if(fieldInfo != null && fieldInfo.getDocValuesType() != DocValuesType.SORTED_SET){
- return null;
- }
- SortedSetDocValues v = reader.getSortedSetDocValues(field);
- if (v == null) {
- v = DocValues.emptySortedSet();
- }
- values[i] = v;
- starts[i] = context.docBase;
- cost += v.cost();
- }
- starts[size] = maxDoc();
- return new MultiDocValues.MultiSortedSetDocValues(values, starts, map, cost);
- }
-
- @Override
- public NumericDocValues getNormValues(String field) throws IOException {
- ensureOpen();
- return MultiDocValues.getNormValues(in, field); // TODO cache?
- }
-
- @Override
- public Fields getTermVectors(int docID) throws IOException {
- ensureOpen();
- return in.getTermVectors(docID);
- }
-
- @Override
- public int numDocs() {
- // Don't call ensureOpen() here (it could affect performance)
- return in.numDocs();
- }
-
- @Override
- public int maxDoc() {
- // Don't call ensureOpen() here (it could affect performance)
- return in.maxDoc();
- }
-
- @Override
- public void document(int docID, StoredFieldVisitor visitor) throws IOException {
- ensureOpen();
- in.document(docID, visitor);
- }
-
- @Override
- public Bits getLiveDocs() {
- ensureOpen();
- return MultiBits.getLiveDocs(in); // TODO cache?
- }
-
- @Override
- public PointValues getPointValues(String field) {
- ensureOpen();
- return null; // because not supported. Throw UOE?
- }
-
- @Override
- public FieldInfos getFieldInfos() {
- return fieldInfos;
- }
-
- @Override
- protected void doClose() throws IOException {
- // TODO: as this is a wrapper, should we really close the delegate?
- in.close();
- }
-
- @Override
- public void checkIntegrity() throws IOException {
- ensureOpen();
- for (LeafReaderContext ctx : in.leaves()) {
- ctx.reader().checkIntegrity();
- }
- }
-
- @Override
- public LeafMetaData getMetaData() {
- return metaData;
- }
-}
\ No newline at end of file
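
The NOTE in the javadoc above recommends gathering the leaves via
IndexReader#getContext() and working per-LeafReader instead of paying
for this wrapper. A minimal sketch of that per-leaf pattern (the field
name is whatever the caller indexed):

    import java.io.IOException;

    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.index.LeafReaderContext;
    import org.apache.lucene.index.Terms;

    public class PerLeafDemo {
        // sums per-segment doc counts for a field, no slow wrapper needed
        static long docCount(IndexReader reader, String field) throws IOException {
            long total = 0;
            for (LeafReaderContext ctx : reader.getContext().leaves()) {
                Terms terms = ctx.reader().terms(field);
                if (terms != null) {
                    total += terms.getDocCount();
                }
            }
            return total;
        }
    }
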
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/tools/TopCommonTokenCounter.java b/tika-eval/src/main/java/org/apache/tika/eval/tools/TopCommonTokenCounter.java
deleted file mode 100644
index b63546b..0000000
--- a/tika-eval/src/main/java/org/apache/tika/eval/tools/TopCommonTokenCounter.java
+++ /dev/null
@@ -1,401 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.eval.tools;
-
-import java.io.BufferedReader;
-import java.io.BufferedWriter;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.InputStreamReader;
-import java.nio.charset.StandardCharsets;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.nio.file.Paths;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Objects;
-import java.util.Set;
-
-import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
-import org.apache.commons.io.FileUtils;
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field;
-import org.apache.lucene.document.TextField;
-import org.apache.lucene.index.DirectoryReader;
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.IndexWriter;
-import org.apache.lucene.index.IndexWriterConfig;
-import org.apache.lucene.index.LeafReader;
-import org.apache.lucene.index.Terms;
-import org.apache.lucene.index.TermsEnum;
-import org.apache.lucene.store.Directory;
-import org.apache.lucene.store.FSDirectory;
-import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.PriorityQueue;
-import org.apache.tika.eval.tokens.AnalyzerManager;
-import org.apache.tika.eval.tokens.URLEmailNormalizingFilterFactory;
-import org.apache.tika.utils.ProcessUtils;
-
-/**
- * Utility class that reads in a UTF-8 input file with one document per row
- * and outputs the TOP_N (currently 30,000) tokens with the highest
- * document frequencies.
- *
- * The CommonTokensAnalyzer intentionally drops tokens shorter than 4 characters,
- * but includes bigrams for CJK.
- *
- * It also has an include list for __email__ and __url__ and a skip list
- * for common HTML markup terms.
- */
-public class TopCommonTokenCounter {
-
- private static final String LICENSE =
- "# Licensed to the Apache Software Foundation (ASF) under one or more\n" +
- "# contributor license agreements. See the NOTICE file distributed with\n" +
- "# this work for additional information regarding copyright ownership.\n" +
- "# The ASF licenses this file to You under the Apache License, Version 2.0\n" +
- "# (the \"License\"); you may not use this file except in compliance with\n" +
- "# the License. You may obtain a copy of the License at\n" +
- "#\n" +
- "# http://www.apache.org/licenses/LICENSE-2.0\n" +
- "#\n" +
- "# Unless required by applicable law or agreed to in writing, software\n" +
- "# distributed under the License is distributed on an \"AS IS\" BASIS,\n" +
- "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n" +
- "# See the License for the specific language governing permissions and\n" +
- "# limitations under the License.\n"+
- "#\n";
-
- private static final String FIELD = "f";
- private static final int TOP_N = 30000;
- private static final int MIN_DOC_FREQ = 10;
- //these should exist in every list
- static final Set<String> INCLUDE_LIST = new HashSet<>(Arrays.asList(
- new String[] {
- URLEmailNormalizingFilterFactory.URL,
- URLEmailNormalizingFilterFactory.EMAIL
- }
- ));
-
- //words to ignore
- //these are common 4 letter html markup words that we do
- //not want to count in case of failed markup processing.
- //see: https://issues.apache.org/jira/browse/TIKA-2267?focusedCommentId=15872055&page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel#comment-15872055
- static final Set<String> SKIP_LIST = new HashSet<>(Arrays.asList(
- "span",
- "table",
- "href",
- "head",
- "title",
- "body",
- "html",
- "tagname",
- "lang",
- "style",
- "script",
- "strong",
- "blockquote",
- "form",
- "iframe",
- "section",
- "colspan",
- "rowspan"
- ));
-
- public static void main(String[] args) throws Exception {
- Path commonTokensFile = Paths.get(args[0]);
- List<Path> inputFiles = new ArrayList<>();
- for (int i = 1; i < args.length; i++) {
- inputFiles.add(Paths.get(
- ProcessUtils.unescapeCommandLine(args[i])));
- }
- TopCommonTokenCounter counter = new TopCommonTokenCounter();
- if (Files.exists(commonTokensFile)) {
- System.err.println(commonTokensFile.getFileName().toString()+
- " exists. I'm skipping this.");
- return;
- }
- counter.execute(commonTokensFile, inputFiles);
- }
-
- private void execute(Path commonTokensFile, List<Path> inputFiles) throws Exception {
- Path luceneDir = Files.createTempDirectory("tika-eval-lucene-");
- AbstractTokenTFDFPriorityQueue queue = new TokenDFPriorityQueue(TOP_N);
- long totalDocs = -1;
- long sumDocFreqs = -1;
- long sumTotalTermFreqs = -1;
- //incremented once per unique term below, so this must start at 0
- long uniqueTerms = 0;
- try (Directory directory = FSDirectory.open(luceneDir)) {
-
- AnalyzerManager analyzerManager = AnalyzerManager.newInstance(-1);
-
- Analyzer analyzer = analyzerManager.getCommonTokensAnalyzer();
- IndexWriterConfig indexWriterConfig = new IndexWriterConfig(analyzer);
- int maxLen = 1000000;
- int len = 0;
- try (IndexWriter writer = new IndexWriter(directory, indexWriterConfig)) {
- List<Document> docs = new ArrayList<>();
- for (Path inputFile : inputFiles) {
- //total hack
- boolean isLeipzig = false;
- if (inputFile.getFileName().toString().contains("-sentences.txt")) {
- isLeipzig = true;
- }
- int lines = 0;
- try (BufferedReader reader = getReader(inputFile)) {
- String line = reader.readLine();
- while (line != null) {
- if (isLeipzig) {
- int tab = line.indexOf("\t");
- if (tab > -1) {
- line = line.substring(tab+1);
- }
- }
- len += line.length();
- Document document = new Document();
- document.add(new TextField(FIELD, line, Field.Store.NO));
- docs.add(document);
- if (len > maxLen) {
- writer.addDocuments(docs);
- docs.clear();
- len = 0;
- }
- line = reader.readLine();
- if (++lines % 100000 == 0) {
- System.out.println("processed "+lines +
- " for "+inputFile.getFileName()
- + " :: "+ commonTokensFile.toAbsolutePath());
- }
- }
- }
- }
- if (docs.size() > 0) {
- writer.addDocuments(docs);
- }
- writer.commit();
- writer.flush();
- }
-
- try (IndexReader reader = DirectoryReader.open(directory)) {
- LeafReader wrappedReader = SlowCompositeReaderWrapper.wrap(reader);
- totalDocs = wrappedReader.getDocCount(FIELD);
- sumDocFreqs = wrappedReader.getSumDocFreq(FIELD);
- sumTotalTermFreqs = wrappedReader.getSumTotalTermFreq(FIELD);
-
- Terms terms = wrappedReader.terms(FIELD);
- TermsEnum termsEnum = terms.iterator();
- BytesRef bytesRef = termsEnum.next();
- while (bytesRef != null) {
- uniqueTerms++;
- int df = termsEnum.docFreq();
- long tf = termsEnum.totalTermFreq();
- if (MIN_DOC_FREQ > -1 && df < MIN_DOC_FREQ) {
- bytesRef = termsEnum.next();
- continue;
- }
-
- if (queue.top() == null || queue.size() < TOP_N ||
- df >= queue.top().df) {
- String t = bytesRef.utf8ToString();
- if (! SKIP_LIST.contains(t)) {
- queue.insertWithOverflow(new TokenDFTF(t, df, tf));
- }
-
- }
- bytesRef = termsEnum.next();
- }
- }
- } finally {
- FileUtils.deleteDirectory(luceneDir.toFile());
- }
-
- writeTopN(commonTokensFile, totalDocs,
- sumDocFreqs, sumTotalTermFreqs, uniqueTerms, queue);
-
-
- }
-
- private BufferedReader getReader(Path inputFile) throws IOException {
- InputStream is = Files.newInputStream(inputFile);
- if (inputFile.toString().endsWith(".gz")) {
- is = new GzipCompressorInputStream(is);
- }
- return new BufferedReader(
- new InputStreamReader(is, StandardCharsets.UTF_8)
- );
- }
-
- private static void writeTopN(Path path,
- long totalDocs, long sumDocFreqs,
- long sumTotalTermFreqs,
- long uniqueTerms, AbstractTokenTFDFPriorityQueue queue) throws IOException {
- if (Files.isRegularFile(path)) {
- System.err.println("File "+path.getFileName() + " already exists. Skipping.");
- return;
- }
- Files.createDirectories(path.getParent());
- try (BufferedWriter writer =
- Files.newBufferedWriter(path, StandardCharsets.UTF_8)) {
- StringBuilder sb = new StringBuilder();
- writer.write(LICENSE);
- writer.write("#DOC_COUNT\t"+totalDocs+"\n");
- writer.write("#SUM_DOC_FREQS\t"+sumDocFreqs+"\n");
- writer.write("#SUM_TERM_FREQS\t"+sumTotalTermFreqs+"\n");
- writer.write("#UNIQUE_TERMS\t"+uniqueTerms+"\n");
- writer.write("#TOKEN\tDOCFREQ\tTERMFREQ\n");
- //add these tokens no matter what
- for (String t : INCLUDE_LIST) {
- writer.write(t);
- writer.newLine();
- }
- for (TokenDFTF tp : queue.getArray()) {
- writer.write(getRow(sb, tp)+"\n");
- }
- writer.flush();
- }
- }
-
- private static String getRow(StringBuilder sb, TokenDFTF tp) {
- sb.setLength(0);
- sb.append(clean(tp.token));
- sb.append("\t").append(tp.df);
- sb.append("\t").append(tp.tf);
- return sb.toString();
- }
-
- private static String clean(String s) {
- if (s == null) {
- return "";
- }
- return s.replaceAll("\\s+", " ").trim();
- }
-
- private abstract class AbstractTokenTFDFPriorityQueue extends PriorityQueue<TokenDFTF> {
-
- AbstractTokenTFDFPriorityQueue(int maxSize) {
- super(maxSize);
- }
-
- public TokenDFTF[] getArray() {
- TokenDFTF[] topN = new TokenDFTF[size()];
- //now we reverse the queue
- TokenDFTF term = pop();
- int i = topN.length-1;
- while (term != null && i > -1) {
- topN[i--] = term;
- term = pop();
- }
- return topN;
- }
- }
-
- private class TokenDFTF {
-
- final String token;
- final int df;
- final long tf;
-
- public TokenDFTF(String token, int df, long tf) {
- this.token = token;
- this.df = df;
- this.tf = tf;
- }
-
-
- public long getTF() {
- return tf;
- }
-
- public int getDF() {
- return df;
- }
-
- public String getToken() {
- return token;
- }
-
- @Override
- public boolean equals(Object o) {
- if (this == o) {
- return true;
- }
- if (o == null || getClass() != o.getClass()) {
- return false;
- }
-
- TokenDFTF tokenDFTF = (TokenDFTF) o;
-
- if (df != tokenDFTF.df) {
- return false;
- }
- if (tf != tokenDFTF.tf) {
- return false;
- }
- return Objects.equals(token, tokenDFTF.token);
- }
-
- @Override
- public int hashCode() {
- int result = token != null ? token.hashCode() : 0;
- result = 31 * result + df;
- result = 31 * result + (int) (tf ^ (tf >>> 32));
- return result;
- }
-
- @Override
- public String toString() {
- return "TokenDFTF{" +
- "token='" + token + '\'' +
- ", df=" + df +
- ", tf=" + tf +
- '}';
- }
- }
-
- private class TokenDFPriorityQueue extends AbstractTokenTFDFPriorityQueue {
-
- TokenDFPriorityQueue(int maxSize) {
- super(maxSize);
- }
-
- @Override
- protected boolean lessThan(TokenDFTF arg0, TokenDFTF arg1) {
- if (arg0.df < arg1.df) {
- return true;
- } else if (arg0.df > arg1.df) {
- return false;
- }
- return arg1.token.compareTo(arg0.token) < 0;
- }
- }
-}
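
The top-N selection above leans on org.apache.lucene.util.PriorityQueue:
lessThan() defines which element is "worst" (kept at top()), and
insertWithOverflow() evicts that element once the queue is full. A
self-contained sketch of the same pattern with plain ints, assuming the
Lucene version already on the tika-eval classpath:

    import org.apache.lucene.util.PriorityQueue;

    public class TopNDemo {
        public static void main(String[] args) {
            // keep the 3 largest values; the smallest survivor is at top()
            PriorityQueue<Integer> topN = new PriorityQueue<Integer>(3) {
                @Override
                protected boolean lessThan(Integer a, Integer b) {
                    return a < b;
                }
            };
            for (int v : new int[]{5, 1, 9, 7, 3, 8}) {
                topN.insertWithOverflow(v);
            }
            while (topN.size() > 0) {
                System.out.println(topN.pop()); // prints 7, then 8, then 9
            }
        }
    }
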
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/tools/TrainTestSplit.java b/tika-eval/src/main/java/org/apache/tika/eval/tools/TrainTestSplit.java
deleted file mode 100644
index d6a70df..0000000
--- a/tika-eval/src/main/java/org/apache/tika/eval/tools/TrainTestSplit.java
+++ /dev/null
@@ -1,114 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.eval.tools;
-
-import java.io.BufferedOutputStream;
-import java.io.BufferedReader;
-import java.io.BufferedWriter;
-import java.io.File;
-import java.io.IOException;
-import java.io.OutputStream;
-import java.io.OutputStreamWriter;
-import java.io.Writer;
-import java.nio.charset.StandardCharsets;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.nio.file.Paths;
-import java.util.HashMap;
-import java.util.Map;
-import java.util.Random;
-
-import org.apache.commons.compress.compressors.gzip.GzipCompressorOutputStream;
-
-public class TrainTestSplit {
-
- private static final String TRAINING = "train";
- private static final String TESTING = "test";
- private static final String DEVTEST = "devtest";
-
- private float trainingP = 0.7f;
- private float devTestP = 0.1f;
- private float testP = 0.2f;
- private Random random = new Random();
-
- public static void main(String[] args) throws Exception {
- Path leipzigDir = Paths.get(args[0]);
- Path outputDir = Paths.get(args[1]);
- TrainTestSplit splitter = new TrainTestSplit();
- splitter.execute(leipzigDir, outputDir);
- }
-
- private void execute(Path leipzigDir, Path outputDir) throws Exception {
- initOutDirs(outputDir);
- for (File f : leipzigDir.toFile().listFiles()) {
- if (f.isDirectory()) {
- continue;
- }
- processFile(f, outputDir);
- }
- }
-
- private void initOutDirs(Path outputDir) throws Exception {
- for (String which : new String[] { TRAINING, DEVTEST, TESTING}) {
- Path target = outputDir.resolve(which);
- if (! Files.isDirectory(target)) {
- Files.createDirectories(target);
- }
- }
-
- }
-
- private void processFile(File f, Path outputDir) throws Exception {
- Map<String, BufferedWriter> writers = getWriters(outputDir, f);
- System.err.println("working on "+f);
- try (BufferedReader reader = Files.newBufferedReader(f.toPath(), StandardCharsets.UTF_8)) {
- String line = reader.readLine();
- while (line != null) {
- float r = random.nextFloat();
- if (r <= trainingP) {
- writers.get(TRAINING).write(line + "\n");
- } else if (r < trainingP + devTestP) {
- writers.get(DEVTEST).write(line + "\n");
- } else {
- writers.get(TESTING).write(line+"\n");
- }
- line = reader.readLine();
- }
- }
-
-
- for (Writer w : writers.values()) {
- w.flush();
- w.close();
- }
- }
-
- private Map<String, BufferedWriter> getWriters(Path outputDir, File f) throws IOException {
- Map<String, BufferedWriter> writers = new HashMap<>();
- for (String which : new String[] { TRAINING, DEVTEST, TESTING}) {
- writers.put(which, getWriter(outputDir, which, f));
- }
- return writers;
- }
-
- private BufferedWriter getWriter(Path outputDir, String which, File f) throws IOException {
- OutputStream os = new GzipCompressorOutputStream(
- new BufferedOutputStream(
- Files.newOutputStream(outputDir.resolve(which).resolve(f.getName()+".gz"))));
- return new BufferedWriter(new OutputStreamWriter(os, StandardCharsets.UTF_8));
- }
-}
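
TrainTestSplit routes each line with a single uniform draw against
cumulative cut-offs: r <= 0.7 goes to train, 0.7 < r < 0.8 to devtest,
and the remainder (the testP share) to test. A minimal sketch of that
routing step, seeded so the demo is repeatable:

    import java.util.Random;

    public class SplitDemo {
        public static void main(String[] args) {
            Random random = new Random(42L); // fixed seed for the demo
            int[] counts = new int[3];       // train, devtest, test
            for (int i = 0; i < 100_000; i++) {
                float r = random.nextFloat();
                if (r <= 0.7f) {
                    counts[0]++;
                } else if (r < 0.8f) {       // 0.7f + 0.1f
                    counts[1]++;
                } else {
                    counts[2]++;
                }
            }
            System.out.printf("train=%d devtest=%d test=%d%n",
                    counts[0], counts[1], counts[2]);
        }
    }
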
diff --git a/tika-eval/src/main/resources/comparison-reports-pg.xml b/tika-eval/src/main/resources/comparison-reports-pg.xml
deleted file mode 100644
index 5bcf88e..0000000
--- a/tika-eval/src/main/resources/comparison-reports-pg.xml
+++ /dev/null
@@ -1,1753 +0,0 @@
-<?xml version="1.0" encoding="UTF-8" standalone="no" ?>
-
-<!--
- Licensed to the Apache Software Foundation (ASF) under one
- or more contributor license agreements. See the NOTICE file
- distributed with this work for additional information
- regarding copyright ownership. The ASF licenses this file
- to you under the Apache License, Version 2.0 (the
- "License"); you may not use this file except in compliance
- with the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing,
- software distributed under the License is distributed on an
- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- KIND, either express or implied. See the License for the
- specific language governing permissions and limitations
- under the License.
--->
-
-<reports>
-
-
- <before>
-
- <sql>drop table if exists md5_multiples_tmp_a</sql>
- <sql>create table md5_multiples_tmp_a (MD5, cnt)
- as
- select md5, count(1) as cnt
- from profiles_a
- where md5 is not null
- group by md5
- having count(1) > 1
- order by cnt desc
- </sql>
-
- <sql>drop table if exists md5_multiples_tmp_b</sql>
- <sql>create table md5_multiples_tmp_b (MD5, cnt)
- as
- select md5, count(1) cnt
- from profiles_b
- where md5 is not null
- group by md5
- having count(1) > 1
- order by cnt desc
- </sql>
- <!-- build mime indexes -->
-
- <sql>create index if not exists pa_m_idx
- on profiles_a (mime_id);
- </sql>
-
- <sql>
- create index if not exists pb_m_idx
- on profiles_b (mime_id);
- </sql>
-
- <!-- build exceptions comparison table -->
- <sql>drop table if exists exceptions_compared</sql>
- <sql>
- create table exceptions_compared (
- mime_id_a integer,
- mime_id_b integer,
- total integer,
- exc_cnt_a integer,
- exc_cnt_b integer,
- exc_prcnt_a float,
- exc_prcnt_b float,
- notes varchar(12)
- );
- </sql>
- <sql>
- insert into exceptions_compared (
- select ma.mime_id, mb.mime_id, count(1) as total, 0, 0, 0.0, 0.0, ''
- from profiles_a pa
- join profiles_b pb on pa.id=pb.id
- join mimes ma on pa.mime_id = ma.mime_id
- join mimes mb on pb.mime_id = mb.mime_id
- group by ma.mime_id, mb.mime_id
- order by total desc );
- </sql>
-
- <sql>
- update exceptions_compared ec set
- exc_cnt_a = (
- select count(1) as cnt
- from exceptions_a ea
- join profiles_a pa on ea.id=pa.id
- join profiles_b pb on pb.id=pa.id
- join mimes ma on pa.mime_id=ma.mime_id
- join mimes mb on pb.mime_id=mb.mime_id
- where ma.mime_id= ec.mime_id_a and mb.mime_id=ec.mime_id_b
- group by ma.mime_id, mb.mime_id);
- </sql>
- <sql>
- update exceptions_compared ec set
- exc_cnt_b = (
- select count(1) as cnt
- from exceptions_b eb
- join profiles_b pb on eb.id=pb.id
- join profiles_a pa on pa.id=pb.id
- join mimes ma on pa.mime_id=ma.mime_id
- join mimes mb on pb.mime_id=mb.mime_id
- where ma.mime_id= ec.mime_id_a and mb.mime_id=ec.mime_id_b
- group by mb.mime_id, ma.mime_id);
- </sql>
- <sql>
- update exceptions_compared
- set exc_prcnt_a = cast(exc_cnt_a as decimal)/cast(total as decimal)
- where total > 0;
- </sql>
- <sql>
- update exceptions_compared
- set exc_prcnt_b = cast(exc_cnt_b as decimal)/cast(total as decimal)
- where total > 0;
- </sql>
-
- <sql>
- update exceptions_compared
- set notes = 'YAY!'
- where total > 100 and (exc_prcnt_a-exc_prcnt_b) > 0.10;
- </sql>
- <sql>
- update exceptions_compared
- set notes = 'YIKES!'
- where total > 100 and (exc_prcnt_b-exc_prcnt_a) > 0.10;
- </sql>
-
- <!-- build the token counts comparison table -->
- <sql>drop table if exists token_counts_compared</sql>
- <sql>
- create table token_counts_compared
- (mime_id_a integer,
- mime_id_b integer,
- num_tokens_a bigint default 0,
- num_tokens_b bigint default 0,
- num_alphabetic_tokens_a bigint default 0,
- num_alphabetic_tokens_b bigint default 0,
- num_common_tokens_a bigint default 0,
- num_common_tokens_b bigint default 0
- );
- </sql>
- <sql>
- insert into token_counts_compared (mime_id_a, mime_id_b)
- select ma.mime_id, mb.mime_id
- from profiles_a a
- join profiles_b b on a.id=b.id
- join mimes ma on ma.mime_id=a.mime_id
- join mimes mb on mb.mime_id=b.mime_id
- group by ma.mime_id, mb.mime_id
-
- </sql>
-
- <sql>
- update token_counts_compared tcc set num_tokens_a=(
- select sum(num_tokens) as cnt from profiles_a pa
- join profiles_b pb on pa.id=pb.id
- join contents_a c on c.id = pa.id
- where pb.mime_id= tcc.mime_id_b
- and pa.mime_id=tcc.mime_id_a
- group by mime_id_a, mime_id_b
- );
- </sql>
-
- <sql>
- update token_counts_compared tcc set num_tokens_b=(
- select sum(num_tokens) as cnt from profiles_b pb
- join profiles_a pa on pa.id=pb.id
- join contents_b c on c.id = pb.id
- where pb.mime_id= tcc.mime_id_b
- and pa.mime_id=tcc.mime_id_a
- group by mime_id_a, mime_id_b
- );
- </sql>
-
- <sql>
- update token_counts_compared tcc set num_alphabetic_tokens_a=(
- select sum(num_alphabetic_tokens) as cnt from profiles_a pa
- join profiles_b pb on pa.id=pb.id
- join contents_a c on c.id = pa.id
- where pb.mime_id= tcc.mime_id_b
- and pa.mime_id=tcc.mime_id_a
- group by mime_id_a, mime_id_b
- );
- </sql>
-
- <sql>
- update token_counts_compared tcc set num_alphabetic_tokens_b=(
- select sum(num_alphabetic_tokens) as cnt from profiles_b pb
- join profiles_a pa on pb.id=pa.id
- join contents_b c on c.id = pb.id
- where pb.mime_id= tcc.mime_id_b
- and pa.mime_id=tcc.mime_id_a
- group by mime_id_a, mime_id_b
- );
- </sql>
-
- <sql>
- update token_counts_compared tcc set num_common_tokens_a=(
- select sum(num_common_tokens) as cnt from profiles_a pa
- join profiles_b pb on pa.id=pb.id
- join contents_a c on c.id = pa.id
- where pb.mime_id= tcc.mime_id_b
- and pa.mime_id=tcc.mime_id_a
- group by mime_id_a, mime_id_b
- );
- </sql>
-
- <sql>
- update token_counts_compared tcc set num_common_tokens_b=(
- select sum(num_common_tokens) as cnt from profiles_b pb
- join profiles_a pa on pa.id=pb.id
- join contents_b c on c.id = pb.id
- where pb.mime_id= tcc.mime_id_b
- and pa.mime_id=tcc.mime_id_a
- group by mime_id_a, mime_id_b
- );
- </sql>
-
- <sql>drop table if exists tags_by_mime</sql>
- <sql>create table tags_by_mime (
- mime_id_a integer,
- mime_id_b integer,
- tags_a_a integer,
- tags_b_a integer,
- tags_div_a integer,
- tags_i_a integer,
- tags_img_a integer,
- tags_li_a integer,
- tags_ol_a integer,
- tags_p_a integer,
- tags_table_a integer,
- tags_td_a integer,
- tags_title_a integer,
- tags_tr_a integer,
- tags_u_a integer,
- tags_ul_a integer,
- tags_a_b integer,
- tags_b_b integer,
- tags_div_b integer,
- tags_i_b integer,
- tags_img_b integer,
- tags_li_b integer,
- tags_ol_b integer,
- tags_p_b integer,
- tags_table_b integer,
- tags_td_b integer,
- tags_title_b integer,
- tags_tr_b integer,
- tags_u_b integer,
- tags_ul_b integer
- );
- </sql>
- <sql>
- insert into tags_by_mime (mime_id_a, mime_id_b)
- select ma.mime_id, mb.mime_id
- from profiles_a a
- join profiles_b b on a.id=b.id
- join mimes ma on ma.mime_id=a.mime_id
- join mimes mb on mb.mime_id=b.mime_id
- group by ma.mime_id, mb.mime_id
- </sql>
- <sql>
- update tags_by_mime tbm set tags_a_a=(
- select sum(ta.tags_a) as cnt from tags_a ta
- join tags_b tb on ta.id=tb.id
- join profiles_a pa on pa.id=ta.id
- join profiles_b pb on pa.id=pb.id
- where pb.mime_id= tbm.mime_id_b
- and pa.mime_id=tbm.mime_id_a
- and ta.tags_parse_exception=false
- and tb.tags_parse_exception=false
- group by mime_id_a, mime_id_b
- );
- </sql>
- <sql>
- update tags_by_mime tbm set tags_b_a=(
- select sum(ta.tags_b) as cnt from tags_a ta
- join tags_b tb on ta.id=tb.id
- join profiles_a pa on pa.id=ta.id
- join profiles_b pb on pa.id=pb.id
- where pb.mime_id= tbm.mime_id_b
- and pa.mime_id=tbm.mime_id_a
- and ta.tags_parse_exception=false
- and tb.tags_parse_exception=false
- group by mime_id_a, mime_id_b
- );
- </sql>
- <sql>
- update tags_by_mime tbm set tags_div_a=(
- select sum(ta.tags_div) as cnt from tags_a ta
- join tags_b tb on ta.id=tb.id
- join profiles_a pa on pa.id=ta.id
- join profiles_b pb on pa.id=pb.id
- where pb.mime_id= tbm.mime_id_b
- and pa.mime_id=tbm.mime_id_a
- and ta.tags_parse_exception=false
- and tb.tags_parse_exception=false
- group by mime_id_a, mime_id_b
- );
- </sql>
- <sql>
- update tags_by_mime tbm set tags_i_a=(
- select sum(ta.tags_i) as cnt from tags_a ta
- join tags_b tb on ta.id=tb.id
- join profiles_a pa on pa.id=ta.id
- join profiles_b pb on pa.id=pb.id
- where pb.mime_id= tbm.mime_id_b
- and pa.mime_id=tbm.mime_id_a
- and ta.tags_parse_exception=false
- and tb.tags_parse_exception=false
- group by mime_id_a, mime_id_b
- );
- </sql>
- <sql>
- update tags_by_mime tbm set tags_img_a=(
- select sum(ta.tags_img) as cnt from tags_a ta
- join tags_b tb on ta.id=tb.id
- join profiles_a pa on pa.id=ta.id
- join profiles_b pb on pa.id=pb.id
- where pb.mime_id= tbm.mime_id_b
- and pa.mime_id=tbm.mime_id_a
- and ta.tags_parse_exception=false
- and tb.tags_parse_exception=false
- group by mime_id_a, mime_id_b
- );
- </sql>
- <sql>
- update tags_by_mime tbm set tags_li_a=(
- select sum(ta.tags_li) as cnt from tags_a ta
- join tags_b tb on ta.id=tb.id
- join profiles_a pa on pa.id=ta.id
- join profiles_b pb on pa.id=pb.id
- where pb.mime_id= tbm.mime_id_b
- and pa.mime_id=tbm.mime_id_a
- and ta.tags_parse_exception=false
- and tb.tags_parse_exception=false
- group by mime_id_a, mime_id_b
- );
- </sql>
- <sql>
- update tags_by_mime tbm set tags_ol_a=(
- select sum(ta.tags_ol) as cnt from tags_a ta
- join tags_b tb on ta.id=tb.id
- join profiles_a pa on pa.id=ta.id
- join profiles_b pb on pa.id=pb.id
- where pb.mime_id= tbm.mime_id_b
- and pa.mime_id=tbm.mime_id_a
- and ta.tags_parse_exception=false
- and tb.tags_parse_exception=false
- group by mime_id_a, mime_id_b
- );
- </sql>
- <sql>
- update tags_by_mime tbm set tags_p_a=(
- select sum(ta.tags_p) as cnt from tags_a ta
- join tags_b tb on ta.id=tb.id
- join profiles_a pa on pa.id=ta.id
- join profiles_b pb on pa.id=pb.id
- where pb.mime_id= tbm.mime_id_b
- and pa.mime_id=tbm.mime_id_a
- and ta.tags_parse_exception=false
- and tb.tags_parse_exception=false
- group by mime_id_a, mime_id_b
- );
- </sql>
- <sql>
- update tags_by_mime tbm set tags_table_a=(
- select sum(ta.tags_table) as cnt from tags_a ta
- join tags_b tb on ta.id=tb.id
- join profiles_a pa on pa.id=ta.id
- join profiles_b pb on pa.id=pb.id
- where pb.mime_id= tbm.mime_id_b
- and pa.mime_id=tbm.mime_id_a
- and ta.tags_parse_exception=false
- and tb.tags_parse_exception=false
- group by mime_id_a, mime_id_b
- );
- </sql>
- <sql>
- update tags_by_mime tbm set tags_td_a=(
- select sum(ta.tags_td) as cnt from tags_a ta
- join tags_b tb on ta.id=tb.id
- join profiles_a pa on pa.id=ta.id
- join profiles_b pb on pa.id=pb.id
- where pb.mime_id= tbm.mime_id_b
- and pa.mime_id=tbm.mime_id_a
- and ta.tags_parse_exception=false
- and tb.tags_parse_exception=false
- group by mime_id_a, mime_id_b
- );
- </sql>
- <sql>
- update tags_by_mime tbm set tags_title_a=(
- select sum(ta.tags_title) as cnt from tags_a ta
- join tags_b tb on ta.id=tb.id
- join profiles_a pa on pa.id=ta.id
- join profiles_b pb on pa.id=pb.id
- where pb.mime_id= tbm.mime_id_b
- and pa.mime_id=tbm.mime_id_a
- and ta.tags_parse_exception=false
- and tb.tags_parse_exception=false
- group by mime_id_a, mime_id_b
- );
- </sql>
- <sql>
- update tags_by_mime tbm set tags_tr_a=(
- select sum(ta.tags_tr) as cnt from tags_a ta
- join tags_b tb on ta.id=tb.id
- join profiles_a pa on pa.id=ta.id
- join profiles_b pb on pa.id=pb.id
- where pb.mime_id= tbm.mime_id_b
- and pa.mime_id=tbm.mime_id_a
- and ta.tags_parse_exception=false
- and tb.tags_parse_exception=false
- group by mime_id_a, mime_id_b
- );
- </sql>
- <sql>
- update tags_by_mime tbm set tags_u_a=(
- select sum(ta.tags_u) as cnt from tags_a ta
- join tags_b tb on ta.id=tb.id
- join profiles_a pa on pa.id=ta.id
- join profiles_b pb on pa.id=pb.id
- where pb.mime_id= tbm.mime_id_b
- and pa.mime_id=tbm.mime_id_a
- and ta.tags_parse_exception=false
- and tb.tags_parse_exception=false
- group by mime_id_a, mime_id_b
- );
- </sql>
- <sql>
- update tags_by_mime tbm set tags_ul_a=(
- select sum(ta.tags_ul) as cnt from tags_a ta
- join tags_b tb on ta.id=tb.id
- join profiles_a pa on pa.id=ta.id
- join profiles_b pb on pa.id=pb.id
- where pb.mime_id= tbm.mime_id_b
- and pa.mime_id=tbm.mime_id_a
- and ta.tags_parse_exception=false
- and tb.tags_parse_exception=false
- group by mime_id_a, mime_id_b
- );
- </sql>
- <!-- now update tags_b counts -->
- <sql>
- update tags_by_mime tbm set tags_a_b=(
- select sum(tb.tags_a) as cnt from tags_b tb
- join tags_a ta on ta.id=tb.id
- join profiles_a pa on pa.id=ta.id
- join profiles_b pb on pa.id=pb.id
- where pb.mime_id= tbm.mime_id_b
- and pa.mime_id=tbm.mime_id_a
- and ta.tags_parse_exception=false
- and tb.tags_parse_exception=false
- group by mime_id_a, mime_id_b
- );
- </sql>
- <sql>
- update tags_by_mime tbm set tags_b_b=(
- select sum(tb.tags_b) as cnt from tags_b tb
- join tags_a ta on ta.id=tb.id
- join profiles_a pa on pa.id=ta.id
- join profiles_b pb on pa.id=pb.id
- where pb.mime_id= tbm.mime_id_b
- and pa.mime_id=tbm.mime_id_a
- and ta.tags_parse_exception=false
- and tb.tags_parse_exception=false
- group by mime_id_a, mime_id_b
- );
- </sql>
- <sql>
- update tags_by_mime tbm set tags_div_b=(
- select sum(tb.tags_div) as cnt from tags_b tb
- join tags_a ta on tb.id=ta.id
- join profiles_a pa on pa.id=ta.id
- join profiles_b pb on pa.id=pb.id
- where pb.mime_id= tbm.mime_id_b
- and pa.mime_id=tbm.mime_id_a
- and ta.tags_parse_exception=false
- and tb.tags_parse_exception=false
- group by mime_id_a, mime_id_b
- );
- </sql>
- <sql>
- update tags_by_mime tbm set tags_i_b=(
- select sum(tb.tags_i) as cnt from tags_b tb
- join tags_a ta on tb.id=ta.id
- join profiles_a pa on pa.id=ta.id
- join profiles_b pb on pa.id=pb.id
- where pb.mime_id= tbm.mime_id_b
- and pa.mime_id=tbm.mime_id_a
- and ta.tags_parse_exception=false
- and tb.tags_parse_exception=false
- group by mime_id_a, mime_id_b
- );
- </sql>
- <sql>
- update tags_by_mime tbm set tags_img_b=(
- select sum(tb.tags_img) as cnt from tags_b tb
- join tags_a ta on tb.id=ta.id
- join profiles_a pa on pa.id=ta.id
- join profiles_b pb on pa.id=pb.id
- where pb.mime_id= tbm.mime_id_b
- and pa.mime_id=tbm.mime_id_a
- and ta.tags_parse_exception=false
- and tb.tags_parse_exception=false
- group by mime_id_a, mime_id_b
- );
- </sql>
- <sql>
- update tags_by_mime tbm set tags_li_b=(
- select sum(tb.tags_li) as cnt from tags_b tb
- join tags_a ta on ta.id=tb.id
- join profiles_a pa on pa.id=ta.id
- join profiles_b pb on pa.id=pb.id
- where pb.mime_id= tbm.mime_id_b
- and pa.mime_id=tbm.mime_id_a
- and ta.tags_parse_exception=false
- and tb.tags_parse_exception=false
- group by mime_id_a, mime_id_b
- );
- </sql>
- <sql>
- update tags_by_mime tbm set tags_ol_b=(
- select sum(tb.tags_ol) as cnt from tags_b tb
- join tags_a ta on tb.id=ta.id
- join profiles_a pa on pa.id=ta.id
- join profiles_b pb on pa.id=pb.id
- where pb.mime_id= tbm.mime_id_b
- and pa.mime_id=tbm.mime_id_a
- and ta.tags_parse_exception=false
- and tb.tags_parse_exception=false
- group by mime_id_a, mime_id_b
- );
- </sql>
- <sql>
- update tags_by_mime tbm set tags_p_b=(
- select sum(tb.tags_p) as cnt from tags_b tb
- join tags_a ta on tb.id=ta.id
- join profiles_a pa on pa.id=ta.id
- join profiles_b pb on pa.id=pb.id
- where pb.mime_id= tbm.mime_id_b
- and pa.mime_id=tbm.mime_id_a
- and ta.tags_parse_exception=false
- and tb.tags_parse_exception=false
- group by mime_id_a, mime_id_b
- );
- </sql>
- <sql>
- update tags_by_mime tbm set tags_table_b=(
- select sum(tb.tags_table) as cnt from tags_b tb
- join tags_a ta on tb.id=ta.id
- join profiles_a pa on pa.id=ta.id
- join profiles_b pb on pa.id=pb.id
- where pb.mime_id= tbm.mime_id_b
- and pa.mime_id=tbm.mime_id_a
- and ta.tags_parse_exception=false
- and tb.tags_parse_exception=false
- group by mime_id_a, mime_id_b
- );
- </sql>
- <sql>
- update tags_by_mime tbm set tags_td_b=(
- select sum(tb.tags_td) as cnt from tags_b tb
- join tags_a ta on tb.id=ta.id
- join profiles_a pa on pa.id=ta.id
- join profiles_b pb on pa.id=pb.id
- where pb.mime_id= tbm.mime_id_b
- and pa.mime_id=tbm.mime_id_a
- and ta.tags_parse_exception=false
- and tb.tags_parse_exception=false
- group by mime_id_a, mime_id_b
- );
- </sql>
- <sql>
- update tags_by_mime tbm set tags_title_b=(
- select sum(tb.tags_title) as cnt from tags_b tb
- join tags_a ta on tb.id=ta.id
- join profiles_a pa on pa.id=ta.id
- join profiles_b pb on pa.id=pb.id
- where pb.mime_id= tbm.mime_id_b
- and pa.mime_id=tbm.mime_id_a
- and ta.tags_parse_exception=false
- and tb.tags_parse_exception=false
- group by mime_id_a, mime_id_b
- );
- </sql>
- <sql>
- update tags_by_mime tbm set tags_tr_b=(
- select sum(tb.tags_tr) as cnt from tags_b tb
- join tags_a ta on tb.id=ta.id
- join profiles_a pa on pa.id=ta.id
- join profiles_b pb on pa.id=pb.id
- where pb.mime_id= tbm.mime_id_b
- and pa.mime_id=tbm.mime_id_a
- and ta.tags_parse_exception=false
- and tb.tags_parse_exception=false
- group by mime_id_a, mime_id_b
- );
- </sql>
- <sql>
- update tags_by_mime tbm set tags_u_b=(
- select sum(tb.tags_u) as cnt from tags_b tb
- join tags_a ta on tb.id=ta.id
- join profiles_a pa on pa.id=ta.id
- join profiles_b pb on pa.id=pb.id
- where pb.mime_id= tbm.mime_id_b
- and pa.mime_id=tbm.mime_id_a
- and ta.tags_parse_exception=false
- and tb.tags_parse_exception=false
- group by mime_id_a, mime_id_b
- );
- </sql>
- <sql>
- update tags_by_mime tbm set tags_ul_b=(
- select sum(tb.tags_ul) as cnt from tags_b tb
- join tags_a ta on ta.id=tb.id
- join profiles_a pa on pa.id=ta.id
- join profiles_b pb on pa.id=pb.id
- where pb.mime_id= tbm.mime_id_b
- and pa.mime_id=tbm.mime_id_a
- and ta.tags_parse_exception=false
- and tb.tags_parse_exception=false
- group by mime_id_a, mime_id_b
- );
- </sql>
- <sql>drop table if exists tag_exceptions_by_mime</sql>
- <sql>create table tag_exceptions_by_mime (
- mime_id_a integer,
- mime_id_b integer,
- tag_exceptions_a integer,
- tag_exceptions_b integer)
- </sql>
- <sql>
- insert into tag_exceptions_by_mime (mime_id_a, mime_id_b,
- tag_exceptions_a, tag_exceptions_b)
- select ma.mime_id, mb.mime_id,0,0
- from profiles_a a
- join profiles_b b on a.id=b.id
- join mimes ma on ma.mime_id=a.mime_id
- join mimes mb on mb.mime_id=b.mime_id
- group by ma.mime_id, mb.mime_id
- </sql>
- <sql>
- update tag_exceptions_by_mime tebm set tag_exceptions_a=(
- select count(1) as cnt from tags_a ta
- join profiles_a pa on pa.id=ta.id
- join profiles_b pb on pa.id=pb.id
- where pb.mime_id= tebm.mime_id_b
- and pa.mime_id=tebm.mime_id_a
- and ta.tags_parse_exception=true
- group by mime_id_a, mime_id_b
- );
- </sql>
- <sql>
- update tag_exceptions_by_mime tebm set tag_exceptions_b=(
- select count(1) as cnt from tags_b tb
- join profiles_a pa on pa.id=tb.id
- join profiles_b pb on pa.id=pb.id
- where pb.mime_id= tebm.mime_id_b
- and pa.mime_id=tebm.mime_id_a
- and tb.tags_parse_exception=true
- group by mime_id_a, mime_id_b
- );
- </sql>
- <sql>
- drop table if exists parse_time_compared;
- </sql>
- <sql>
- create table parse_time_compared (
- mime_id_a integer,
- mime_id_b integer,
- total_a bigint,
- total_b bigint,
- prcnt_increase double precision
- );
- </sql>
- <sql>
- insert into parse_time_compared (mime_id_a, mime_id_b,
- total_a, total_b, prcnt_increase)
- select ma.mime_id, mb.mime_id,0,0,0.0
- from profiles_a a
- join profiles_b b on a.id=b.id
- join mimes ma on ma.mime_id=a.mime_id
- join mimes mb on mb.mime_id=b.mime_id
- group by ma.mime_id, mb.mime_id
- </sql>
- <sql>
- update parse_time_compared ptc set total_a=(
- select sum(pa.elapsed_time_millis) as total_a from profiles_a pa
- join profiles_b pb on pa.id=pb.id
- where pa.mime_id= ptc.mime_id_a
- and pb.mime_id=ptc.mime_id_b
- group by mime_id_a, mime_id_b)
- </sql>
- <sql>
- update parse_time_compared ptc set total_b=(
- select sum(pb.elapsed_time_millis) as total_b from profiles_b pb
- join profiles_a pa on pa.id=pb.id
- where pa.mime_id= ptc.mime_id_a
- and pb.mime_id=ptc.mime_id_b
- group by mime_id_a, mime_id_b)
- </sql>
- <sql>
- update parse_time_compared ptc set prcnt_increase=(100.0 *
- cast(total_b as decimal)/cast(total_a as decimal))
- where total_a > 0;
- </sql>
- </before>
-
- <!-- MIMES -->
- <report reportName="All Mimes In A"
- reportFilename="mimes/all_mimes_A.xlsx"
- format="xlsx"
- includeSql="true">
-
- <sql>
- select mime_string, count(1) cnt from
- profiles_a p
- join mimes m on m.mime_id = p.mime_id
- group by mime_string
- order by cnt desc
- </sql>
- </report>
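
Each <report> element in this file is a select that the reporting code
executes against the eval database and writes to the named xlsx. A
minimal JDBC sketch of the same query outside the reporting harness,
assuming an H2 driver on the classpath (the db URL is illustrative):

    import java.sql.Connection;
    import java.sql.DriverManager;
    import java.sql.ResultSet;
    import java.sql.Statement;

    public class ReportDemo {
        public static void main(String[] args) throws Exception {
            try (Connection conn = DriverManager.getConnection(
                        "jdbc:h2:/tmp/tika-eval-db");
                 Statement st = conn.createStatement();
                 ResultSet rs = st.executeQuery(
                        "select mime_string, count(1) cnt from profiles_a p "
                        + "join mimes m on m.mime_id = p.mime_id "
                        + "group by mime_string order by cnt desc")) {
                while (rs.next()) {
                    System.out.println(rs.getString(1) + "\t" + rs.getLong(2));
                }
            }
        }
    }
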
-
- <report reportName="All Mimes In B"
- reportFilename="mimes/all_mimes_B.xlsx"
- format="xlsx"
- includeSql="true">
-
- <sql>
- select mime_string, count(1) cnt from
- profiles_b p
- join mimes m on m.mime_id = p.mime_id
- group by mime_string
- order by cnt desc
- </sql>
- </report>
- <report reportName="Container Mimes In A"
- reportFilename="mimes/container_mimes_A.xlsx"
- format="xlsx"
- includeSql="true">
-
- <sql>
- select mime_string, count(1) cnt from
- profiles_a p
- join mimes m on m.mime_id = p.mime_id
- where is_embedded=false
- group by mime_string
- order by cnt desc
- </sql>
- </report>
-
- <report reportName="Container Mimes In B"
- reportFilename="mimes/container_mimes_B.xlsx"
- format="xlsx"
- includeSql="true">
-
- <sql>
- select mime_string, count(1) cnt from
- profiles_b p
- join mimes m on m.mime_id = p.mime_id
- where is_embedded=false
- group by mime_string
- order by cnt desc
- </sql>
- </report>
- <report reportName="Embedded Mimes In A"
- reportFilename="mimes/embedded_mimes_A.xlsx"
- format="xlsx"
- includeSql="true">
-
- <sql>
- select mime_string, count(1) cnt from
- profiles_a p
- join mimes m on m.mime_id = p.mime_id
- where is_embedded=true
- group by mime_string
- order by cnt desc
- </sql>
- </report>
-
- <report reportName="Embedded Mimes In B"
- reportFilename="mimes/embedded_mimes_B.xlsx"
- format="xlsx"
- includeSql="true">
-
- <sql>
- select mime_string, count(1) cnt from
- profiles_b p
- join mimes m on m.mime_id = p.mime_id
- where is_embedded=true
- group by mime_string
- order by cnt desc
- </sql>
- </report>
- <report reportName="Mime Differences A -> B"
- reportFilename="mimes/mime_diffs_A_to_B.xlsx"
- format="xlsx"
- includeSql="true">
-
- <sql>
- select concat(ma.mime_string, ' -> ', mb.mime_string) as
- MIME_A_TO_MIME_B, count(1) as COUNT
- from profiles_a a
- join profiles_b b on a.id=b.id
- join mimes ma on ma.mime_id=a.mime_id
- join mimes mb on mb.mime_id=b.mime_id
- where a.mime_id <> b.mime_id
- group by MIME_A_TO_MIME_B
- order by COUNT DESC
- </sql>
- </report>
-
- <report reportName="Mime Differences A -> B Details"
- reportFilename="mimes/mime_diffs_A_to_B_details.xlsx"
- format="xlsx"
- includeSql="true">
-
- <sql>
- select concat(ma.mime_string, ' -> ', mb.mime_string) as
- MIME_A_TO_MIME_B,
- file_path,
- c.length as CONTAINER_LENGTH,
- a.file_name
- from profiles_a a
- join profiles_b b on a.id=b.id
- join mimes ma on ma.mime_id=a.mime_id
- join mimes mb on mb.mime_id=b.mime_id
- join containers c on a.container_id=c.container_id
- where a.mime_id <> b.mime_id
- order by MIME_A_TO_MIME_B
- </sql>
- </report>
-
-
- <!-- Exceptions -->
- <report reportName="AllExceptionsByMimeA"
- reportFilename="exceptions/exceptions_by_mime_A.xlsx"
- format="xlsx"
- includeSql="true">
-
- <sql>
- select mime_string, count(1) cnt from
- exceptions_a e
- join profiles_a p on p.id=e.id
- join mimes m on m.mime_id = p.mime_id
- group by mime_string
- order by cnt desc
- </sql>
- </report>
- <report reportName="AllExceptionsByMimeB"
- reportFilename="exceptions/exceptions_by_mime_B.xlsx"
- format="xlsx"
- includeSql="true">
-
- <sql>
- select mime_string, count(1) cnt from
- exceptions_b e
- join profiles_b p on p.id=e.id
- join mimes m on m.mime_id = p.mime_id
- group by mime_string
- order by cnt desc
- </sql>
- </report>
-
- <report reportName="ContainerExceptionsByMimeA"
- reportFilename="exceptions/container_exceptions_by_mime_A.xlsx"
- format="xlsx"
- includeSql="true">
-
- <sql>
- select mime_string, count(1) cnt from
- exceptions_a e
- join profiles_a p on p.id=e.id
- join mimes m on m.mime_id = p.mime_id
- where is_embedded=false
- and parse_exception_id=0
- group by mime_string
- order by cnt desc
- </sql>
- </report>
-
- <report reportName="ContainerExceptionsByMimeB"
- reportFilename="exceptions/container_exceptions_by_mime_B.xlsx"
- format="xlsx"
- includeSql="true">
-
- <sql>
- select mime_string, count(1) cnt from
- exceptions_b e
- join profiles_b p on p.id=e.id
- join mimes m on m.mime_id = p.mime_id
- where is_embedded=false
- and parse_exception_id=0
- group by mime_string
- order by cnt desc
- </sql>
- </report>
- <report reportName="AllExceptionsByMimeByTypeA"
- reportFilename="exceptions/exceptions_by_mime_by_type_A.xlsx"
- format="xlsx"
- includeSql="true">
-
- <sql>
- select mime_string as MIME_TYPE,
- parse_exception_description as EXCEPTION_TYPE, count(1) as COUNT
- from exceptions_a e
- join profiles_a p on p.id=e.id
- join containers c on p.container_id=c.container_id
- join mimes m on m.mime_id=p.mime_id
- join ref_parse_exception_types r on
- r.parse_exception_id=e.parse_exception_id
- group by m.mime_string, parse_exception_description
- order by MIME_TYPE, EXCEPTION_TYPE
- </sql>
- </report>
-
- <report reportName="AllExceptionsByMimeByTypeB"
- reportFilename="exceptions/exceptions_by_mime_by_type_B.xlsx"
- format="xlsx"
- includeSql="true">
-
- <sql>
- select mime_string as MIME_TYPE,
- parse_exception_description as EXCEPTION_TYPE, count(1) as COUNT
- from exceptions_b e
- join profiles_b p on p.id=e.id
- join containers c on p.container_id=c.container_id
- join mimes m on m.mime_id=p.mime_id
- join ref_parse_exception_types r on
- r.parse_exception_id=e.parse_exception_id
- group by m.mime_string, parse_exception_description
- order by MIME_TYPE, EXCEPTION_TYPE
- </sql>
- </report>
-
- <report reportName="TextLostFromACausedByNewExceptionsInB"
- reportFilename="exceptions/text_lost_from_A_caused_by_new_exceptions_in_B.xlsx"
- format="xlsx"
- includeSql="true">
-
- <sql>
- select file_path as FILE_PATH,
- c.length as CONTAINER_LENGTH,
- ca.NUM_TOKENS as NUM_TOKENS_A,
- cb.NUM_TOKENS as NUM_TOKENS_B,
- ca.NUM_UNIQUE_TOKENS as NUM_UNIQUE_TOKENS_A, cb.NUM_UNIQUE_TOKENS
- as NUM_UNIQUE_TOKENS_B,
- ca.common_tokens_lang as COMMON_TOKENS_LANG_A,
- ca.num_common_tokens as NUM_COMMON_TOKENS_A,
- cb.common_tokens_lang as COMMON_TOKENS_LANG_B,
- cb.num_common_tokens as NUM_COMMON_TOKENS_B,
- ca.top_n_tokens as TOP_N_TOKENS_A, cb.top_n_tokens as TOP_N_TOKENS_B,
- eb.ORIG_STACK_TRACE as ORIG_STACK_TRACE_B
- from contents_a ca
- join profiles_a pa on ca.id = pa.id
- join containers c on pa.container_id=c.container_id
- left join contents_b cb on ca.id=cb.id
- left join exceptions_b eb on ca.id = eb.id
- left join exceptions_a ea on ca.id = ea.id
- where eb.orig_stack_trace is not null
- and ea.orig_stack_trace is null
- order by ca.num_common_tokens - coalesce(cb.num_common_tokens,0) desc
- </sql>
- </report>
-
- <report reportName="FixedExceptionsInBByMimeType"
- reportFilename="exceptions/fixed_exceptions_in_B_by_mime.xlsx"
- format="xlsx"
- includeSql="true">
-
- <sql>
- select
- ma.mime_string as MIME_TYPE_A,
- mb.mime_string as MIME_TYPE_B,
- count(1) as COUNT
- from exceptions_a ea
- left join exceptions_b eb on ea.id = eb.id
- join profiles_a pa on pa.id=ea.id
- join profiles_b pb on pa.id=pb.id
- join containers c on pa.container_id=c.container_id
- join mimes ma on ma.mime_id=pa.mime_id
- join mimes mb on mb.mime_id=pb.mime_id
- where eb.id is null
- and ea.parse_exception_id=0
- group by mime_type_a, mime_type_b
- </sql>
- </report>
-
- <report reportName="FixedExceptionsInByDetails"
- reportFilename="exceptions/fixed_exceptions_in_B_details.xlsx"
- format="xlsx"
- includeSql="true">
- <sql>
- select
- file_path,
- c.length as CONTAINER_LENGTH,
- ma.mime_string as MIME_TYPE_A,
- mb.mime_string as MIME_TYPE_B,
- pa.file_name, pa.is_embedded
- from exceptions_a ea
- left join exceptions_b eb on ea.id = eb.id
- join profiles_a pa on pa.id=ea.id
- join profiles_b pb on pb.id=pa.id --this ensures that files were actually processed in both runs
- join containers c on pa.container_id=c.container_id
- join mimes ma on ma.mime_id=pa.mime_id
- join mimes mb on mb.mime_id=pb.mime_id
- where eb.id is null
- and ea.parse_exception_id=0
- order by mime_type_a, mime_type_b
- </sql>
- </report>
- <report reportName="ContentsOfFixedExceptionsInB"
- reportFilename="exceptions/contents_of_fixed_exceptions_in_B.xlsx"
- format="xlsx"
- includeSql="true">
-
- <sql>
- select file_path,
- c.length as CONTAINER_LENGTH,
- ma.mime_string as MIME_TYPE_A,
- mb.mime_string as MIME_TYPE_B,
- CONTENT_LENGTH,
- NUM_TOKENS, NUM_UNIQUE_TOKENS,
- TOP_N_TOKENS, LANG_ID_1,TOKEN_LENGTH_MEAN, TOKEN_LENGTH_STD_DEV
- from exceptions_a ea
- left join exceptions_b eb on ea.id = eb.id
- join profiles_a pa on pa.id=ea.id
- join profiles_b pb on pa.id=pb.id
- join contents_b cb on cb.id=ea.id
- join containers c on pa.container_id=c.container_id
- join mimes ma on ma.mime_id=pa.mime_id
- join mimes mb on mb.mime_id=pb.mime_id
- where eb.id is null
- and ea.parse_exception_id=0
- </sql>
- </report>
-
- <report reportName="NewExceptionsByMimeType"
- reportFilename="exceptions/new_exceptions_in_B_by_mime.xlsx"
- format="xlsx"
- includeSql="true">
-
- <sql>
- select ma.mime_string as MIME_TYPE_A, mb.mime_string as MIME_TYPE_B, count(1) as COUNT
- from exceptions_b eb
- left join exceptions_a ea on ea.id = eb.id
- join profiles_a pa on pa.id=eb.id
- join profiles_b pb on pb.id=pa.id
- join containers c on pa.container_id=c.container_id
- join mimes ma on ma.mime_id=pa.mime_id
- join mimes mb on mb.mime_id=pb.mime_id
- where ea.id is null
- and eb.parse_exception_id=0
- group by ma.mime_string, mb.mime_string
- order by COUNT desc
- </sql>
- </report>
-
- <report reportName="NewExceptionsInBByMimeTypeByStackTrace"
- reportFilename="exceptions/new_exceptions_in_B_by_mime_by_stack_trace.xlsx"
- format="xlsx"
- includeSql="true">
-
- <sql>
- select
- ma.MIME_STRING as MIME_TYPE_A,
- mb.MIME_STRING as MIME_TYPE_B,
- eb.sort_stack_trace, count(1) as COUNT
- from exceptions_b eb
- left join exceptions_a ea on ea.id = eb.id
- join profiles_a pa on pa.id=eb.id
- join profiles_b pb on pb.id=eb.id
- join mimes ma on ma.mime_id=pa.mime_id
- join mimes mb on mb.mime_id=pb.mime_id
- where ea.id is null
- and eb.parse_exception_id=0
- group by MIME_TYPE_A, MIME_TYPE_B, eb.sort_stack_trace
- order by MIME_TYPE_A asc, MIME_TYPE_B asc, COUNT desc
- </sql>
- </report>
-
- <report reportName="NewExceptionsInBDetails"
- reportFilename="exceptions/new_exceptions_in_B_details.xlsx"
- format="xlsx"
- includeSql="true">
-
- <sql>
- select file_path,
- c.length as CONTAINER_LENGTH,
- ma.mime_string as MIME_TYPE_A,
- mb.mime_string as MIME_TYPE_B,
- eb.orig_stack_trace, eb.sort_stack_trace
- from exceptions_b eb
- left join exceptions_a ea on ea.id = eb.id
- join profiles_a pa on pa.id=eb.id
- join profiles_b pb on pb.id=eb.id
- join containers c on pa.container_id=c.container_id
- join mimes ma on ma.mime_id=pa.mime_id
- join mimes mb on mb.mime_id=pb.mime_id
- where ea.id is null
- and eb.parse_exception_id=0
- order by MIME_TYPE_A asc, MIME_TYPE_B asc, eb.ORIG_STACK_TRACE
- </sql>
- </report>
-
- <report reportName="StackTracesByMimeInA"
- reportFilename="exceptions/stack_traces_by_mime_A.xlsx"
- format="xlsx"
- includeSql="true">
-
- <sql>
- select MIME_STRING as MIME_TYPE, e.sort_stack_trace, count(1) as COUNT
- from exceptions_a e
- join profiles_a p on p.id=e.id
- join mimes m on m.mime_id=p.mime_id
- where e.parse_exception_id=0
- group by MIME_TYPE, e.sort_stack_trace
- order by MIME_TYPE asc, COUNT desc
- </sql>
- </report>
-
- <report reportName="AllStackTracesInA"
- reportFilename="exceptions/stack_traces_A.xlsx"
- format="xlsx"
- includeSql="true">
-
- <sql>
- select file_path,
- c.length as CONTAINER_LENGTH,
- mime_string as MIME_TYPE,
- orig_stack_trace, sort_stack_trace
- from exceptions_a e
- join profiles_a p on p.id=e.id
- join containers c on p.container_id=c.container_id
- join mimes m on m.mime_id=p.mime_id
- where e.parse_exception_id=0
- order by MIME_TYPE asc, sort_stack_trace, orig_stack_trace,
- CONTAINER_LENGTH asc
- </sql>
- </report>
- <report reportName="AllStackTracesInB"
- reportFilename="exceptions/stack_traces_B.xlsx"
- format="xlsx"
- includeSql="true">
-
- <sql>
- select file_path,
- c.length as CONTAINER_LENGTH,
- mime_string as MIME_TYPE,
- orig_stack_trace, sort_stack_trace
- from exceptions_b e
- join profiles_b p on p.id=e.id
- join containers c on p.container_id=c.container_id
- join mimes m on m.mime_id=p.mime_id
- where e.parse_exception_id=0
- order by MIME_TYPE asc, sort_stack_trace, orig_stack_trace,
- CONTAINER_LENGTH asc
- </sql>
- </report>
-
- <report reportName="StackTracesByMimeInB"
- reportFilename="exceptions/stack_traces_by_mime_B.xlsx"
- format="xlsx"
- includeSql="true">
-
- <sql>
- select MIME_STRING as MIME_TYPE, e.sort_stack_trace, count(1) as COUNT
- from exceptions_b e
- join profiles_b p on p.id=e.id
- join mimes m on m.mime_id=p.mime_id
- where e.parse_exception_id=0
- group by MIME_TYPE, e.sort_stack_trace
- order by MIME_TYPE asc, COUNT desc
- </sql>
- </report>
- <report reportName="extractExceptionsA"
- reportFilename="exceptions/extract_exceptions_a.xlsx"
- format="xlsx"
- includeSql="true">
- <sql>
- select file_path, extract_exception_description
- from extract_exceptions_a e
- join ref_extract_exception_types t
- on e.extract_exception_id=t.extract_exception_id
- </sql>
- </report>
- <report reportName="extractExceptionsB"
- reportFilename="exceptions/extract_exceptions_b.xlsx"
- format="xlsx"
- includeSql="true">
- <sql>
- select file_path, extract_exception_description
- from extract_exceptions_b e
- join ref_extract_exception_types t
- on e.extract_exception_id=t.extract_exception_id
- </sql>
- </report>
- <report reportName="parseExceptionTypesA"
- reportFilename="exceptions/overall_exception_types_a.xlsx"
- format="xlsx"
- includeSql="true">
- <sql>
- select parse_exception_description, count(1)
- from exceptions_a e
- join ref_parse_exception_types t on
- t.parse_exception_id=e.parse_exception_id
- group by t.parse_exception_description
- </sql>
- </report>
- <report reportName="parseExceptionTypesB"
- reportFilename="exceptions/overall_exception_types_b.xlsx"
- format="xlsx"
- includeSql="true">
- <sql>
- select parse_exception_description, count(1)
- from exceptions_b e
- join ref_parse_exception_types t on
- t.parse_exception_id=e.parse_exception_id
- group by t.parse_exception_description
- </sql>
- </report>
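
Both parseExceptionTypes reports resolve the numeric parse_exception_id through the ref_parse_exception_types lookup table, and that same table explains the parse_exception_id=0 filters used throughout the exception reports above. The mapping can be listed directly with an illustrative one-off query (not one of the configured reports):

    select parse_exception_id, parse_exception_description
    from ref_parse_exception_types
    order by parse_exception_id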
-
- <report reportName="contentDiffsWExceptions"
- reportFilename="content/content_diffs_with_exceptions.xlsx"
- format="xlsx"
- includeSql="true">
-
- <sql>
- select file_path,
- c.length as CONTAINER_LENGTH,
- ma.mime_string as MIME_STRING_A,
- mb.mime_string as MIME_STRING_B,
- ca.num_unique_tokens as NUM_UNIQUE_TOKENS_A,
- cb.num_unique_tokens as NUM_UNIQUE_TOKENS_B,
- ca.num_tokens as NUM_TOKENS_A,
- cb.num_tokens as NUM_TOKENS_B,
- ca.common_tokens_lang as COMMON_TOKENS_LANG_A,
- ca.num_common_tokens as NUM_COMMON_TOKENS_A,
- cb.common_tokens_lang as COMMON_TOKENS_LANG_B,
- cb.num_common_tokens as NUM_COMMON_TOKENS_B,
- coalesce(cb.num_common_tokens,0)-
- coalesce(ca.num_common_tokens, 0) as NUM_COMMON_TOKENS_DIFF_IN_B,
- ca.top_n_tokens as TOP_N_TOKENS_A,
- cb.top_n_tokens as TOP_N_TOKENS_B,
- ca.unicode_char_blocks as UNICODE_CHAR_BLOCKS_A,
- cb.unicode_char_blocks as UNICODE_CHAR_BLOCKS_B,
- top_10_unique_token_diffs_a,
- top_10_unique_token_diffs_b,
- top_10_more_in_a, top_10_more_in_b, dice_coefficient, overlap,
- ref_ea.parse_exception_description as EXCEPTION_A,
- ref_eb.parse_exception_description as EXCEPTION_B
- from content_comparisons cc
- join contents_a ca on ca.id=cc.id
- left join contents_b cb on cb.id=cc.id
- join profiles_a pa on pa.id = cc.id
- join profiles_b pb on pb.id=cc.id
- join containers c on c.container_id=pa.container_id
- join mimes ma on ma.mime_id=pa.mime_id
- join mimes mb on mb.mime_id=pb.mime_id
- left join exceptions_a ea on ea.id=cc.id
- left join exceptions_b eb on eb.id=cc.id
- left join ref_parse_exception_types ref_ea on ref_ea.parse_exception_id=ea.parse_exception_id
- left join ref_parse_exception_types ref_eb on ref_eb.parse_exception_id=eb.parse_exception_id
- where (overlap < 0.95 or abs(ca.NUM_TOKENS-cb.NUM_TOKENS) >30)
- and (ea.parse_exception_id is null or
- ea.parse_exception_id <> 2)
- and (eb.parse_exception_id is null or
- eb.parse_exception_id <> 2)
- order by ma.mime_string, overlap asc
- limit 100000
- </sql>
- </report>
- <report reportName="contentDiffsNoExceptions"
- reportFilename="content/content_diffs_no_exceptions.xlsx"
- format="xlsx"
- includeSql="true">
-
- <sql>
- select file_path,
- c.length as CONTAINER_LENGTH,
- ma.mime_string as MIME_STRING_A,
- mb.mime_string as MIME_STRING_B,
- ca.NUM_UNIQUE_TOKENS as NUM_UNIQUE_TOKENS_A,
- cb.NUM_UNIQUE_TOKENS as NUM_UNIQUE_TOKENS_B,
- ca.NUM_TOKENS as NUM_TOKENS_A,
- cb.NUM_TOKENS as NUM_TOKENS_B,
- ca.common_tokens_lang as COMMON_TOKENS_LANG_A,
- ca.num_common_tokens as NUM_COMMON_TOKENS_A,
- cb.common_tokens_lang as COMMON_TOKENS_LANG_B,
- cb.num_common_tokens as NUM_COMMON_TOKENS_B,
- coalesce(cb.num_common_tokens,0)-
- coalesce(ca.num_common_tokens, 0) as NUM_COMMON_TOKENS_DIFF_IN_B,
- ca.top_n_tokens as TOP_N_TOKENS_A,
- cb.top_n_tokens as TOP_N_TOKENS_B,
- ca.unicode_char_blocks as UNICODE_CHAR_BLOCKS_A,
- cb.unicode_char_blocks as UNICODE_CHAR_BLOCKS_B,
- top_10_unique_token_diffs_a,
- top_10_unique_token_diffs_b,
- top_10_more_in_a, top_10_more_in_b, dice_coefficient, overlap
- from content_comparisons cc
- join contents_a ca on ca.id=cc.id
- join contents_b cb on cb.id=cc.id
- join profiles_a pa on pa.id = cc.id
- join profiles_b pb on pb.id=cc.id
- join containers c on c.container_id=pa.container_id
- join mimes ma on ma.mime_id=pa.mime_id
- join mimes mb on mb.mime_id=pb.mime_id
- left join exceptions_a ea on ea.id=cc.id
- left join exceptions_b eb on eb.id=cc.id
- where (overlap < 0.95 or abs(ca.NUM_TOKENS-cb.NUM_TOKENS) >30)
- and (ea.parse_exception_id is null)
- and (eb.parse_exception_id is null)
- order by ma.mime_string, overlap asc
- limit 100000
- </sql>
- </report>
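
The two content-diff reports share one selection filter: token overlap below 0.95, or an absolute token-count difference above 30, capped at 100,000 rows. Tuning the sensitivity only requires editing that where clause; as a rough sketch, counting how many comparisons a tighter threshold would surface (threshold values here are illustrative):

    select count(1)
    from content_comparisons cc
    join contents_a ca on ca.id = cc.id
    join contents_b cb on cb.id = cc.id
    where overlap < 0.80
    or abs(ca.NUM_TOKENS - cb.NUM_TOKENS) > 100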
-
- <report reportName="CommonTokenComparisonsByMimeType"
- reportFilename="content/common_token_comparisons_by_mime.xlsx"
- format="xlsx"
- includeSql="true">
-
- <sql>
- select ma.mime_string as MIME_STRING_A, mb.mime_string as MIME_STRING_B,
- num_tokens_a, num_tokens_b,
- num_alphabetic_tokens_a, num_alphabetic_tokens_b,
- num_common_tokens_a, num_common_tokens_b,
- coalesce(num_common_tokens_b, 0)-coalesce(num_common_tokens_a, 0) as change_in_common_tokens_b
- from token_counts_compared tcc
- join mimes ma on tcc.mime_id_a = ma.mime_id
- join mimes mb on tcc.mime_id_b = mb.mime_id
- order by change_in_common_tokens_b desc
- </sql>
- </report>
- <report reportName="PageCountDiffs"
- reportFilename="content/page_count_diffs.xlsx"
- format="xlsx"
- includeSql="true">
-
- <sql>
- select file_path,
- c.length as CONTAINER_LENGTH,
- ma.mime_string as MIME_STRING_A,
- mb.mime_string as MIME_STRING_B,
- pa.num_pages as NUM_PAGES_A,
- pb.num_pages as NUM_PAGES_B,
- (pb.num_pages-pa.num_pages) as DIFF_NUM_PAGES_IN_B
- from profiles_a pa
- join profiles_b pb on pa.id = pb.id
- join containers c on pa.container_id=c.container_id
- join mimes ma on ma.mime_id=pa.mime_id
- join mimes mb on mb.mime_id=pb.mime_id
- where pa.num_pages is not null
- and pb.num_pages is not null
- and pa.num_pages <> pb.num_pages
- order by DIFF_NUM_PAGES_IN_B asc
- limit 10000;
- </sql>
- </report>
-
-
- <report reportName="ExceptionComparisonsByMimeType"
- reportFilename="exceptions/exceptions_compared_by_mime_type.xlsx"
- format="xlsx"
- includeSql="true">
-
- <sql>
- select ma.mime_string as mime_string_a, mb.mime_string as mime_string_b,
- total, exc_cnt_a,
- exc_cnt_b,
- exc_prcnt_a,
- exc_prcnt_b, notes
-
- from exceptions_compared e
- join mimes ma on ma.mime_id=e.mime_id_a
- join mimes mb on mb.mime_id=e.mime_id_b
- order by (exc_prcnt_b-exc_prcnt_a) desc, total desc;
- </sql>
- </report>
- <!-- <report reportName="MD5 Duplicate Counts A"
- reportFilename="md5/md5_duplicate_counts_A.xlsx"
- format="xlsx"
- includeSql="true">
- <sql>
- select md5, count(1) cnt
- from profiles_a
- group by md5
- having cnt > 2
- order by cnt desc
- </sql>
- </report>
-
- <report reportName="MD5 Duplicate Counts B"
- reportFilename="md5/md5_duplicate_counts_B.xlsx"
- format="xlsx"
- includeSql="true">
-
- <sql>
- select md5, count(1) cnt
- from profiles_b
- group by md5
- having cnt > 2
- order by cnt desc
- </sql>
- </report>
-
- <report reportName="MD5 Duplicates A"
- reportFilename="md5/md5_duplicates_A.xlsx"
- format="xlsx"
- includeSql="true">
-
- <sql>
- select file_path, file_name, is_embedded, content_length, NUM_TOKENS, p.md5
- from md5_multiples_tmp_a t
- join profiles_a p on p.md5 = t.md5
- join containers c on p.container_id = c.container_id
- join contents_a cb on p.id=cb.id
- order by t.cnt desc
- </sql>
- </report>
-
- <report reportName="MD5 Duplicates B"
- reportFilename="md5/md5_duplicates_B.xlsx"
- format="xlsx"
- includeSql="true">
-
- <sql>
- select file_path, file_name, is_embedded, content_length, NUM_TOKENS, p.md5
- from md5_multiples_tmp_b t
- join profiles_b p on p.md5 = t.md5
- join containers c on p.container_id = c.container_id
- join contents_b cb on p.id=cb.id
- order by t.cnt desc
- </sql>
- </report>
- -->
-
- <report reportName="Attachment Diffs no Exceptions"
- reportFilename="attachments/attachment_diffs_no_exceptions.xlsx"
- format="xlsx"
- includeSql="true">
-
- <sql>
- select file_path,
- c.length as CONTAINER_LENGTH,
- ma.mime_string as MIME_STRING_A,
- mb.mime_string as MIME_STRING_B,
- pa.num_attachments as NUM_ATTACHMENTS_A,
- pb.num_attachments as NUM_ATTACHMENTS_B,
- pb.num_attachments-pa.num_attachments as NUM_ATTACHMENTS_DIFF_IN_B
- from profiles_a pa
- join profiles_b pb on pa.id= pb.id
- join containers c on pa.container_id=c.container_id
- join mimes ma on pa.mime_id=ma.mime_id
- join mimes mb on pb.mime_id=mb.mime_id
- left join exceptions_a ea on ea.id=pa.id
- left join exceptions_b eb on eb.id=pb.id
- where pa.is_embedded=false and
- ea.parse_exception_id is null and
- eb.parse_exception_id is null
- and pa.num_attachments <> pb.num_attachments
- order by ma.mime_string, pb.num_attachments-pa.num_attachments
- limit 100000;
- </sql>
- </report>
-
- <report reportName="Attachment Diffs with exceptions"
- reportFilename="attachments/attachment_diffs_with_exceptions.xlsx"
- format="xlsx"
- includeSql="true">
-
- <sql>
- select file_path,
- c.length as CONTAINER_LENGTH,
- ma.mime_string as MIME_STRING_A,
- mb.mime_string as MIME_STRING_B,
- pa.num_attachments as NUM_ATTACHMENTS_A,
- pb.num_attachments as NUM_ATTACHMENTS_B,
- pb.num_attachments-pa.num_attachments as NUM_ATTACHMENTS_DIFF_IN_B,
- refea.parse_exception_description as PARSE_EXCEPTION_A,
- refeb.parse_exception_description as PARSE_EXCEPTION_B
- from profiles_a pa
- join profiles_b pb on pa.id= pb.id
- join containers c on pa.container_id=c.container_id
- join mimes ma on pa.mime_id=ma.mime_id
- join mimes mb on pb.mime_id=mb.mime_id
- left join exceptions_a ea on ea.id=pa.id
- left join exceptions_b eb on eb.id=pb.id
- left join ref_parse_exception_types refea on ea.parse_exception_id=refea.parse_exception_id
- left join ref_parse_exception_types refeb on eb.parse_exception_id=refeb.parse_exception_id
- where pa.is_embedded=false
- and pa.num_attachments <> pb.num_attachments
- order by ma.mime_string, pb.num_attachments-pa.num_attachments
- limit 100000;
- </sql>
- </report>
-
- <report reportName="Files missing in B by Mime"
- reportFilename="attachments/all_files_missing_in_B_by_mime.xlsx"
- format="xlsx"
- includeSql="true">
-
- <sql>
- select mime_string, count(1) as cnt
- from profiles_a pa
- left join profiles_b pb on pa.id=pb.id
- join mimes m on pa.mime_id=m.mime_id
- where pb.id is null
- group by mime_string
- order by cnt desc
- </sql>
- </report>
- <report reportName="Container files missing in B by Mime"
- reportFilename="attachments/container_files_missing_in_B_by_mime.xlsx"
- format="xlsx"
- includeSql="true">
-
- <sql>
- select mime_string, count(1) as cnt
- from profiles_a pa
- left join profiles_b pb on pa.id=pb.id
- join mimes m on pa.mime_id=m.mime_id
- where pb.id is null and pa.is_embedded=false
- group by mime_string
- order by cnt desc
- </sql>
- </report>
- <report reportName="Embedded files missing in B by Mime"
- reportFilename="attachments/embedded_files_missing_in_B_by_mime.xlsx"
- format="xlsx"
- includeSql="true">
-
- <sql>
- select mime_string, count(1) as cnt
- from profiles_a pa
- left join profiles_b pb on pa.id=pb.id
- join mimes m on pa.mime_id=m.mime_id
- where pb.id is null and pa.is_embedded=true
- group by mime_string
- order by cnt desc
- </sql>
- </report>
- <report reportName="All files missing in A by Mime"
- reportFilename="attachments/all_files_missing_in_A_by_mime.xlsx"
- format="xlsx"
- includeSql="true">
-
- <sql>
- select mime_string, count(1) as cnt
- from profiles_b pb
- left join profiles_a pa on pb.id=pa.id
- join mimes m on pb.mime_id=m.mime_id
- where pa.id is null
- group by mime_string
- order by cnt desc
- </sql>
- </report>
- <report reportName="Container files missing in A by Mime"
- reportFilename="attachments/container_files_missing_in_A_by_mime.xlsx"
- format="xlsx"
- includeSql="true">
-
- <sql>
- select mime_string, count(1) as cnt
- from profiles_b pb
- left join profiles_a pa on pb.id=pa.id
- join mimes m on pb.mime_id=m.mime_id
- where pa.id is null and pb.is_embedded=false
- group by mime_string
- order by cnt desc
- </sql>
- </report>
- <report reportName="Embedded files missing in A by Mime"
- reportFilename="attachments/embedded_files_missing_in_A_by_mime.xlsx"
- format="xlsx"
- includeSql="true">
-
- <sql>
- select mime_string, count(1) as cnt
- from profiles_b pb
- left join profiles_a pa on pb.id=pa.id
- join mimes m on pb.mime_id=m.mime_id
- where pa.id is null and pb.is_embedded=true
- group by mime_string
- order by cnt desc
- </sql>
- </report>
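
The six "missing" reports above slice the same pair of anti-joins by mime type and by embedded status. A compact overall tally can serve as a sanity check before digging into the per-mime breakdowns (illustrative query, not one of the configured reports):

    select 'missing_in_B' as which, count(1) as cnt
    from profiles_a pa
    left join profiles_b pb on pa.id = pb.id
    where pb.id is null
    union all
    select 'missing_in_A', count(1)
    from profiles_b pb
    left join profiles_a pa on pb.id = pa.id
    where pa.id is null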
- <!-- metadata values -->
- <report reportName="Metadata Value Diffs"
- reportFilename="metadata/metadata_value_count_diffs.xlsx"
- format="xlsx"
- includeSql="true">
-
- <sql>
- select file_path,
- ma.mime_string as mime_string_a,
- mb.mime_string as mime_string_b,
- pa.num_metadata_values as num_metadata_values_a,
- pb.num_metadata_values as num_metadata_values_b,
- ea.parse_exception_id as parse_ex_id_a,
- eb.parse_exception_id as parse_ex_id_b
- from profiles_a pa
- join profiles_b pb on pa.id= pb.id
- join containers c on pa.container_id=c.container_id
- join mimes ma on pa.mime_id=ma.mime_id
- join mimes mb on pb.mime_id=mb.mime_id
- left join exceptions_a ea on ea.id=pa.id
- left join exceptions_b eb on eb.id=pb.id
- where
- ea.parse_exception_id is null and
- eb.parse_exception_id is null
- and pa.num_metadata_values <> pb.num_metadata_values
- order by ma.mime_string,
- pb.num_metadata_values-pa.num_metadata_values
- limit 100000
- </sql>
- </report>
- <report reportName="Tag Count Diffs By Mime"
- reportFilename="tags/tag_count_diffs_by_mime.xlsx"
- format="xlsx"
- includeSql="true">
- <sql>
- select ma.mime_string as mime_string_a,
- mb.mime_string as mime_string_b,
- tags_a_a,
- tags_a_b,
- tags_b_a,
- tags_b_b,
- tags_div_a,
- tags_div_b,
- tags_i_a,
- tags_i_b,
- tags_li_a,
- tags_li_b,
- tags_ol_a,
- tags_ol_b,
- tags_p_a,
- tags_p_b,
- tags_table_a,
- tags_table_b,
- tags_td_a,
- tags_td_b,
- tags_title_a,
- tags_title_b,
- tags_tr_a,
- tags_tr_b,
- tags_u_a,
- tags_u_b,
- tags_ul_a,
- tags_ul_b
- from
- tags_by_mime tbm
- join mimes ma on tbm.mime_id_a=ma.mime_id
- join mimes mb on tbm.mime_id_b=mb.mime_id
- limit 100000
- </sql>
-
- </report>
- <report reportName="Tag Exceptions By Mime"
- reportFilename="tags/tag_exceptions_by_mime.xlsx"
- format="xlsx"
- includeSql="true">
- <sql>
- select ma.mime_string as mime_string_a,
- mb.mime_string as mime_string_b,
- tag_exceptions_a,
- tag_exceptions_b,
- (tag_exceptions_b-tag_exceptions_a) as diff_tag_exceptions_in_b
- from tag_exceptions_by_mime tebm
- join mimes ma on tebm.mime_id_a=ma.mime_id
- join mimes mb on tebm.mime_id_b=mb.mime_id
- order by diff_tag_exceptions_in_b desc
- </sql>
- </report>
- <report reportName="Tag Exceptions Details A"
- reportFilename="tags/tag_exceptions_details_a.xlsx"
- format="xlsx"
- includeSql="true">
- <sql>
- select c.file_path, pa.file_name, mime_string, is_embedded
- from tags_a ta
- join profiles_a pa on ta.id=pa.id
- join containers c on pa.container_id=c.container_id
- join mimes m on pa.mime_id=m.mime_id
- where ta.tags_parse_exception=true
- order by m.mime_string
- limit 20000
- </sql>
- </report>
- <report reportName="Tag Exceptions Details B"
- reportFilename="tags/tag_exceptions_details_b.xlsx"
- format="xlsx"
- includeSql="true">
- <sql>
- select c.file_path, pb.file_name, mime_string, is_embedded
- from tags_b tb
- join profiles_b pb on tb.id=pb.id
- join containers c on pb.container_id=c.container_id
- join mimes m on pb.mime_id=m.mime_id
- where tb.tags_parse_exception=true
- order by m.mime_string
- limit 20000
- </sql>
- </report>
-
- <report reportName="Parse Time (Millis) Compared"
- reportFilename="parse_times/parse_time_millis_by_mime_compared.xlsx"
- format="xlsx"
- includeSql="true">
- <sql>
- select ma.mime_string as MIME_STRING_A,
- mb.mime_string as MIME_STRING_B,
- total_a as TOTAL_MILLIS_A, total_b as TOTAL_MILLIS_B,
- prcnt_increase as PERCENT_INCREASE
- from parse_time_compared ptc
- join mimes ma on ptc.mime_id_a=ma.mime_id
- join mimes mb on ptc.mime_id_b=mb.mime_id
- where TOTAL_A > 1000 AND TOTAL_B > 1000 -- only compare mimes whose total parse time exceeds one second in both runs
- order by prcnt_increase desc
- </sql>
- </report>
- <report reportName="Parse Time (Millis) Details"
- reportFilename="parse_times/parse_time_millis_details.xlsx"
- format="xlsx"
- includeSql="true">
- <sql>
- select file_path, c.length as CONTAINER_LENGTH,
- ma.mime_string as MIME_STRING_A,
- mb.mime_string as MIME_STRING_B,
- pa.elapsed_time_millis as TOTAL_MILLIS_A,
- pb.elapsed_time_millis as TOTAL_MILLIS_B,
- (pb.elapsed_time_millis-pa.elapsed_time_millis) as DIFF_MILLIS
- from profiles_a pa
- join profiles_b pb on pa.id=pb.id
- join mimes ma on ma.mime_id=pa.mime_id
- join mimes mb on mb.mime_id=pb.mime_id
- join containers c on pa.container_id=c.container_id
- order by DIFF_MILLIS desc
- limit 20000;
- </sql>
- </report>
- <after>
- <sql>drop table if exists md5_multiples_tmp_a</sql>
- <sql>drop table if exists md5_multiples_tmp_b</sql>
- </after>
-</reports>
\ No newline at end of file
diff --git a/tika-eval/src/main/resources/comparison-reports.xml b/tika-eval/src/main/resources/comparison-reports.xml
deleted file mode 100644
index e23ec5e..0000000
--- a/tika-eval/src/main/resources/comparison-reports.xml
+++ /dev/null
@@ -1,1753 +0,0 @@
-<?xml version="1.0" encoding="UTF-8" standalone="no" ?>
-
-<!--
- Licensed to the Apache Software Foundation (ASF) under one
- or more contributor license agreements. See the NOTICE file
- distributed with this work for additional information
- regarding copyright ownership. The ASF licenses this file
- to you under the Apache License, Version 2.0 (the
- "License"); you may not use this file except in compliance
- with the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing,
- software distributed under the License is distributed on an
- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- KIND, either express or implied. See the License for the
- specific language governing permissions and limitations
- under the License.
--->
-
-<reports>
-
-
- <before>
-
- <sql>drop table if exists md5_multiples_tmp_a</sql>
- <sql>create table md5_multiples_tmp_a (MD5 char(32), cnt int)
- as
- select md5, count(1) cnt
- from profiles_a
- where md5 is not null
- group by md5
- having cnt > 1
- order by cnt desc
- </sql>
-
- <sql>drop table if exists md5_multiples_tmp_b</sql>
- <sql>create table md5_multiples_tmp_b (MD5 char(32), cnt int)
- as
- select md5, count(1) cnt
- from profiles_b
- where md5 is not null
- group by md5
- having cnt > 1
- order by cnt desc
- </sql>
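
These two temp tables hold every MD5 digest that occurs more than once in a run, along with its count; they back the commented-out duplicate reports and are dropped again in the file's <after> block (as in the -pg variant above). A quick look at the heaviest duplicates (illustrative):

    select md5, cnt
    from md5_multiples_tmp_a
    order by cnt desc
    limit 20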
- <!-- build mime indexes -->
-
- <sql>create index if not exists pa_m_idx
- on profiles_a (mime_id);
- </sql>
-
- <sql>
- create index if not exists pb_m_idx
- on profiles_b (mime_id);
- </sql>
-
- <!-- build exceptions comparison table -->
- <sql>drop table if exists exceptions_compared</sql>
- <sql>
- create table exceptions_compared (
- mime_id_a integer,
- mime_id_b integer,
- total integer,
- exc_cnt_a integer,
- exc_cnt_b integer,
- exc_prcnt_a float,
- exc_prcnt_b float,
- notes varchar(12)
- );
- </sql>
- <sql>
- insert into exceptions_compared (
- select ma.mime_id, mb.mime_id, count(1) as total, 0, 0, 0.0, 0.0, ''
- from profiles_a pa
- join profiles_b pb on pa.id=pb.id
- join mimes ma on pa.mime_id = ma.mime_id
- join mimes mb on pb.mime_id = mb.mime_id
- group by ma.mime_id, mb.mime_id
- order by total desc );
- </sql>
-
- <sql>
- update exceptions_compared ec set
- exc_cnt_a = (
- select count(1) as cnt
- from exceptions_a ea
- join profiles_a pa on ea.id=pa.id
- join profiles_b pb on pb.id=pa.id
- join mimes ma on pa.mime_id=ma.mime_id
- join mimes mb on pb.mime_id=mb.mime_id
- where ma.mime_id= ec.mime_id_a and mb.mime_id=ec.mime_id_b
- group by ma.mime_id, mb.mime_id);
- </sql>
- <sql>
- update exceptions_compared ec set
- exc_cnt_b = (
- select count(1) as cnt
- from exceptions_b eb
- join profiles_b pb on eb.id=pb.id
- join profiles_a pa on pa.id=pb.id
- join mimes ma on pa.mime_id=ma.mime_id
- join mimes mb on pb.mime_id=mb.mime_id
- where ma.mime_id= ec.mime_id_a and mb.mime_id=ec.mime_id_b
- group by mb.mime_id, ma.mime_id);
- </sql>
- <sql>
- update exceptions_compared
- set exc_prcnt_a = cast(exc_cnt_a as decimal)/cast(total as decimal)
- where total > 0;
- </sql>
- <sql>
- update exceptions_compared
- set exc_prcnt_b = cast(exc_cnt_b as decimal)/cast(total as decimal)
- where total > 0;
- </sql>
-
- <sql>
- update exceptions_compared
- set notes = 'YAY!'
- where total > 100 and (exc_prcnt_a-exc_prcnt_b) > 0.10;
- </sql>
- <sql>
- update exceptions_compared
- set notes = 'YIKES!'
- where total > 100 and (exc_prcnt_b-exc_prcnt_a) > 0.10;
- </sql>
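
The notes column flags mime pairs with at least 100 files whose exception rate moved by more than ten percentage points in either direction. A further band for borderline movers could be added the same way (the threshold and label here are illustrative):

    update exceptions_compared
    set notes = 'CHECK'
    where total > 100
    and abs(exc_prcnt_b - exc_prcnt_a) between 0.05 and 0.10;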
-
- <!-- build tmp common words table -->
- <sql>drop table if exists token_counts_compared</sql>
- <sql>
- create table token_counts_compared
- (mime_id_a integer,
- mime_id_b integer,
- num_tokens_a long default 0,
- num_tokens_b long default 0,
- num_alphabetic_tokens_a long default 0,
- num_alphabetic_tokens_b long default 0,
- num_common_tokens_a long default 0,
- num_common_tokens_b long default 0
- );
- </sql>
- <sql>
- insert into token_counts_compared (mime_id_a, mime_id_b)
- select ma.mime_id, mb.mime_id
- from profiles_a a
- join profiles_b b on a.id=b.id
- join mimes ma on ma.mime_id=a.mime_id
- join mimes mb on mb.mime_id=b.mime_id
- group by ma.mime_id, mb.mime_id
-
- </sql>
-
- <sql>
- update token_counts_compared tcc set num_tokens_a=(
- select sum(num_tokens) as cnt from profiles_a pa
- join profiles_b pb on pa.id=pb.id
- join contents_a c on c.id = pa.id
- where pb.mime_id= tcc.mime_id_b
- and pa.mime_id=tcc.mime_id_a
- group by mime_id_a, mime_id_b
- );
- </sql>
-
- <sql>
- update token_counts_compared tcc set num_tokens_b=(
- select sum(num_tokens) as cnt from profiles_b pb
- join profiles_a pa on pa.id=pb.id
- join contents_b c on c.id = pb.id
- where pb.mime_id= tcc.mime_id_b
- and pa.mime_id=tcc.mime_id_a
- group by mime_id_a, mime_id_b
- );
- </sql>
-
- <sql>
- update token_counts_compared tcc set num_alphabetic_tokens_a=(
- select sum(num_alphabetic_tokens) as cnt from profiles_a pa
- join profiles_b pb on pa.id=pb.id
- join contents_a c on c.id = pa.id
- where pb.mime_id= tcc.mime_id_b
- and pa.mime_id=tcc.mime_id_a
- group by mime_id_a, mime_id_b
- );
- </sql>
-
- <sql>
- update token_counts_compared tcc set num_alphabetic_tokens_b=(
- select sum(num_alphabetic_tokens) as cnt from profiles_b pb
- join profiles_a pa on pb.id=pa.id
- join contents_b c on c.id = pb.id
- where pb.mime_id= tcc.mime_id_b
- and pa.mime_id=tcc.mime_id_a
- group by mime_id_a, mime_id_b
- );
- </sql>
-
- <sql>
- update token_counts_compared tcc set num_common_tokens_a=(
- select sum(num_common_tokens) as cnt from profiles_a pa
- join profiles_b pb on pa.id=pb.id
- join contents_a c on c.id = pa.id
- where pb.mime_id= tcc.mime_id_b
- and pa.mime_id=tcc.mime_id_a
- group by mime_id_a, mime_id_b
- );
- </sql>
-
- <sql>
- update token_counts_compared tcc set num_common_tokens_b=(
- select sum(num_common_tokens) as cnt from profiles_b pb
- join profiles_a pa on pa.id=pb.id
- join contents_b c on c.id = pb.id
- where pb.mime_id= tcc.mime_id_b
- and pa.mime_id=tcc.mime_id_a
- group by mime_id_a, mime_id_b
- );
- </sql>
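
Each count column above is filled by a correlated update keyed on the (mime_id_a, mime_id_b) pair. One subtlety: when the subquery matches no rows, the update stores NULL rather than leaving the declared default of 0, which is why the CommonTokenComparisonsByMimeType report wraps these columns in coalesce. The gaps could also be closed in place (illustrative):

    update token_counts_compared
    set num_common_tokens_a = coalesce(num_common_tokens_a, 0),
    num_common_tokens_b = coalesce(num_common_tokens_b, 0)
    where num_common_tokens_a is null
    or num_common_tokens_b is null;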
-
- <sql>drop table if exists tags_by_mime</sql>
- <sql>create table tags_by_mime (
- mime_id_a integer,
- mime_id_b integer,
- tags_a_a integer,
- tags_b_a integer,
- tags_div_a integer,
- tags_i_a integer,
- tags_img_a integer,
- tags_li_a integer,
- tags_ol_a integer,
- tags_p_a integer,
- tags_table_a integer,
- tags_td_a integer,
- tags_title_a integer,
- tags_tr_a integer,
- tags_u_a integer,
- tags_ul_a integer,
- tags_a_b integer,
- tags_b_b integer,
- tags_div_b integer,
- tags_i_b integer,
- tags_img_b integer,
- tags_li_b integer,
- tags_ol_b integer,
- tags_p_b integer,
- tags_table_b integer,
- tags_td_b integer,
- tags_title_b integer,
- tags_tr_b integer,
- tags_u_b integer,
- tags_ul_b integer
- );
- </sql>
- <sql>
- insert into tags_by_mime (mime_id_a, mime_id_b)
- select ma.mime_id, mb.mime_id
- from profiles_a a
- join profiles_b b on a.id=b.id
- join mimes ma on ma.mime_id=a.mime_id
- join mimes mb on mb.mime_id=b.mime_id
- group by ma.mime_id, mb.mime_id
- </sql>
- <sql>
- update tags_by_mime tbm set tags_a_a=(
- select sum(ta.tags_a) as cnt from tags_a ta
- join tags_b tb on ta.id=tb.id
- join profiles_a pa on pa.id=ta.id
- join profiles_b pb on pa.id=pb.id
- where pb.mime_id= tbm.mime_id_b
- and pa.mime_id=tbm.mime_id_a
- and ta.tags_parse_exception=false
- and tb.tags_parse_exception=false
- group by mime_id_a, mime_id_b
- );
- </sql>
- <sql>
- update tags_by_mime tbm set tags_b_a=(
- select sum(ta.tags_b) as cnt from tags_a ta
- join tags_b tb on ta.id=tb.id
- join profiles_a pa on pa.id=ta.id
- join profiles_b pb on pa.id=pb.id
- where pb.mime_id= tbm.mime_id_b
- and pa.mime_id=tbm.mime_id_a
- and ta.tags_parse_exception=false
- and tb.tags_parse_exception=false
- group by mime_id_a, mime_id_b
- );
- </sql>
- <sql>
- update tags_by_mime tbm set tags_div_a=(
- select sum(ta.tags_div) as cnt from tags_a ta
- join tags_b tb on ta.id=tb.id
- join profiles_a pa on pa.id=ta.id
- join profiles_b pb on pa.id=pb.id
- where pb.mime_id= tbm.mime_id_b
- and pa.mime_id=tbm.mime_id_a
- and ta.tags_parse_exception=false
- and tb.tags_parse_exception=false
- group by mime_id_a, mime_id_b
- );
- </sql>
- <sql>
- update tags_by_mime tbm set tags_i_a=(
- select sum(ta.tags_i) as cnt from tags_a ta
- join tags_b tb on ta.id=tb.id
- join profiles_a pa on pa.id=ta.id
- join profiles_b pb on pa.id=pb.id
- where pb.mime_id= tbm.mime_id_b
- and pa.mime_id=tbm.mime_id_a
- and ta.tags_parse_exception=false
- and tb.tags_parse_exception=false
- group by mime_id_a, mime_id_b
- );
- </sql>
- <sql>
- update tags_by_mime tbm set tags_img_a=(
- select sum(ta.tags_img) as cnt from tags_a ta
- join tags_b tb on ta.id=tb.id
- join profiles_a pa on pa.id=ta.id
- join profiles_b pb on pa.id=pb.id
- where pb.mime_id= tbm.mime_id_b
- and pa.mime_id=tbm.mime_id_a
- and ta.tags_parse_exception=false
- and tb.tags_parse_exception=false
- group by mime_id_a, mime_id_b
- );
- </sql>
- <sql>
- update tags_by_mime tbm set tags_li_a=(
- select sum(ta.tags_li) as cnt from tags_a ta
- join tags_b tb on ta.id=tb.id
- join profiles_a pa on pa.id=ta.id
- join profiles_b pb on pa.id=pb.id
- where pb.mime_id= tbm.mime_id_b
- and pa.mime_id=tbm.mime_id_a
- and ta.tags_parse_exception=false
- and tb.tags_parse_exception=false
- group by mime_id_a, mime_id_b
- );
- </sql>
- <sql>
- update tags_by_mime tbm set tags_ol_a=(
- select sum(ta.tags_ol) as cnt from tags_a ta
- join tags_b tb on ta.id=tb.id
- join profiles_a pa on pa.id=ta.id
- join profiles_b pb on pa.id=pb.id
- where pb.mime_id= tbm.mime_id_b
- and pa.mime_id=tbm.mime_id_a
- and ta.tags_parse_exception=false
- and tb.tags_parse_exception=false
- group by mime_id_a, mime_id_b
- );
- </sql>
- <sql>
- update tags_by_mime tbm set tags_p_a=(
- select sum(ta.tags_p) as cnt from tags_a ta
- join tags_b tb on ta.id=tb.id
- join profiles_a pa on pa.id=ta.id
- join profiles_b pb on pa.id=pb.id
- where pb.mime_id= tbm.mime_id_b
- and pa.mime_id=tbm.mime_id_a
- and ta.tags_parse_exception=false
- and tb.tags_parse_exception=false
- group by mime_id_a, mime_id_b
- );
- </sql>
- <sql>
- update tags_by_mime tbm set tags_table_a=(
- select sum(ta.tags_table) as cnt from tags_a ta
- join tags_b tb on ta.id=tb.id
- join profiles_a pa on pa.id=ta.id
- join profiles_b pb on pa.id=pb.id
- where pb.mime_id= tbm.mime_id_b
- and pa.mime_id=tbm.mime_id_a
- and ta.tags_parse_exception=false
- and tb.tags_parse_exception=false
- group by mime_id_a, mime_id_b
- );
- </sql>
- <sql>
- update tags_by_mime tbm set tags_td_a=(
- select sum(ta.tags_td) as cnt from tags_a ta
- join tags_b tb on ta.id=tb.id
- join profiles_a pa on pa.id=ta.id
- join profiles_b pb on pa.id=pb.id
- where pb.mime_id= tbm.mime_id_b
- and pa.mime_id=tbm.mime_id_a
- and ta.tags_parse_exception=false
- and tb.tags_parse_exception=false
- group by mime_id_a, mime_id_b
- );
- </sql>
- <sql>
- update tags_by_mime tbm set tags_title_a=(
- select sum(ta.tags_title) as cnt from tags_a ta
- join tags_b tb on ta.id=tb.id
- join profiles_a pa on pa.id=ta.id
- join profiles_b pb on pa.id=pb.id
- where pb.mime_id= tbm.mime_id_b
- and pa.mime_id=tbm.mime_id_a
- and ta.tags_parse_exception=false
- and tb.tags_parse_exception=false
- group by mime_id_a, mime_id_b
- );
- </sql>
- <sql>
- update tags_by_mime tbm set tags_tr_a=(
- select sum(ta.tags_tr) as cnt from tags_a ta
- join tags_b tb on ta.id=tb.id
- join profiles_a pa on pa.id=ta.id
- join profiles_b pb on pa.id=pb.id
- where pb.mime_id= tbm.mime_id_b
- and pa.mime_id=tbm.mime_id_a
- and ta.tags_parse_exception=false
- and tb.tags_parse_exception=false
- group by mime_id_a, mime_id_b
- );
- </sql>
- <sql>
- update tags_by_mime tbm set tags_u_a=(
- select sum(ta.tags_u) as cnt from tags_a ta
- join tags_b tb on ta.id=tb.id
- join profiles_a pa on pa.id=ta.id
- join profiles_b pb on pa.id=pb.id
- where pb.mime_id= tbm.mime_id_b
- and pa.mime_id=tbm.mime_id_a
- and ta.tags_parse_exception=false
- and tb.tags_parse_exception=false
- group by mime_id_a, mime_id_b
- );
- </sql>
- <sql>
- update tags_by_mime tbm set tags_ul_a=(
- select sum(ta.tags_ul) as cnt from tags_a ta
- join tags_b tb on ta.id=tb.id
- join profiles_a pa on pa.id=ta.id
- join profiles_b pb on pa.id=pb.id
- where pb.mime_id= tbm.mime_id_b
- and pa.mime_id=tbm.mime_id_a
- and ta.tags_parse_exception=false
- and tb.tags_parse_exception=false
- group by mime_id_a, mime_id_b
- );
- </sql>
- <!-- now update tags_b counts -->
- <sql>
- update tags_by_mime tbm set tags_a_b=(
- select sum(tb.tags_a) as cnt from tags_b tb
- join tags_a ta on ta.id=tb.id
- join profiles_a pa on pa.id=ta.id
- join profiles_b pb on pa.id=pb.id
- where pb.mime_id= tbm.mime_id_b
- and pa.mime_id=tbm.mime_id_a
- and ta.tags_parse_exception=false
- and tb.tags_parse_exception=false
- group by mime_id_a, mime_id_b
- );
- </sql>
- <sql>
- update tags_by_mime tbm set tags_b_b=(
- select sum(tb.tags_b) as cnt from tags_b tb
- join tags_a ta on ta.id=tb.id
- join profiles_a pa on pa.id=ta.id
- join profiles_b pb on pa.id=pb.id
- where pb.mime_id= tbm.mime_id_b
- and pa.mime_id=tbm.mime_id_a
- and ta.tags_parse_exception=false
- and tb.tags_parse_exception=false
- group by mime_id_a, mime_id_b
- );
- </sql>
- <sql>
- update tags_by_mime tbm set tags_div_b=(
- select sum(tb.tags_div) as cnt from tags_b tb
- join tags_a ta on tb.id=ta.id
- join profiles_a pa on pa.id=ta.id
- join profiles_b pb on pa.id=pb.id
- where pb.mime_id= tbm.mime_id_b
- and pa.mime_id=tbm.mime_id_a
- and ta.tags_parse_exception=false
- and tb.tags_parse_exception=false
- group by mime_id_a, mime_id_b
- );
- </sql>
- <sql>
- update tags_by_mime tbm set tags_i_b=(
- select sum(tb.tags_i) as cnt from tags_b tb
- join tags_a ta on tb.id=ta.id
- join profiles_a pa on pa.id=ta.id
- join profiles_b pb on pa.id=pb.id
- where pb.mime_id= tbm.mime_id_b
- and pa.mime_id=tbm.mime_id_a
- and ta.tags_parse_exception=false
- and tb.tags_parse_exception=false
- group by mime_id_a, mime_id_b
- );
- </sql>
- <sql>
- update tags_by_mime tbm set tags_img_b=(
- select sum(tb.tags_img) as cnt from tags_b tb
- join tags_a ta on tb.id=ta.id
- join profiles_a pa on pa.id=ta.id
- join profiles_b pb on pa.id=pb.id
- where pb.mime_id= tbm.mime_id_b
- and pa.mime_id=tbm.mime_id_a
- and ta.tags_parse_exception=false
- and tb.tags_parse_exception=false
- group by mime_id_a, mime_id_b
- );
- </sql>
- <sql>
- update tags_by_mime tbm set tags_li_b=(
- select sum(tb.tags_li) as cnt from tags_b tb
- join tags_a ta on ta.id=tb.id
- join profiles_a pa on pa.id=ta.id
- join profiles_b pb on pa.id=pb.id
- where pb.mime_id= tbm.mime_id_b
- and pa.mime_id=tbm.mime_id_a
- and ta.tags_parse_exception=false
- and tb.tags_parse_exception=false
- group by mime_id_a, mime_id_b
- );
- </sql>
- <sql>
- update tags_by_mime tbm set tags_ol_b=(
- select sum(tb.tags_ol) as cnt from tags_b tb
- join tags_a ta on tb.id=ta.id
- join profiles_a pa on pa.id=ta.id
- join profiles_b pb on pa.id=pb.id
- where pb.mime_id= tbm.mime_id_b
- and pa.mime_id=tbm.mime_id_a
- and ta.tags_parse_exception=false
- and tb.tags_parse_exception=false
- group by mime_id_a, mime_id_b
- );
- </sql>
- <sql>
- update tags_by_mime tbm set tags_p_b=(
- select sum(tb.tags_p) as cnt from tags_b tb
- join tags_a ta on tb.id=ta.id
- join profiles_a pa on pa.id=ta.id
- join profiles_b pb on pa.id=pb.id
- where pb.mime_id= tbm.mime_id_b
- and pa.mime_id=tbm.mime_id_a
- and ta.tags_parse_exception=false
- and tb.tags_parse_exception=false
- group by mime_id_a, mime_id_b
- );
- </sql>
- <sql>
- update tags_by_mime tbm set tags_table_b=(
- select sum(tb.tags_table) as cnt from tags_b tb
- join tags_a ta on tb.id=ta.id
- join profiles_a pa on pa.id=ta.id
- join profiles_b pb on pa.id=pb.id
- where pb.mime_id= tbm.mime_id_b
- and pa.mime_id=tbm.mime_id_a
- and ta.tags_parse_exception=false
- and tb.tags_parse_exception=false
- group by mime_id_a, mime_id_b
- );
- </sql>
- <sql>
- update tags_by_mime tbm set tags_td_b=(
- select sum(tb.tags_td) as cnt from tags_b tb
- join tags_a ta on tb.id=ta.id
- join profiles_a pa on pa.id=ta.id
- join profiles_b pb on pa.id=pb.id
- where pb.mime_id= tbm.mime_id_b
- and pa.mime_id=tbm.mime_id_a
- and ta.tags_parse_exception=false
- and tb.tags_parse_exception=false
- group by mime_id_a, mime_id_b
- );
- </sql>
- <sql>
- update tags_by_mime tbm set tags_title_b=(
- select sum(tb.tags_title) as cnt from tags_b tb
- join tags_a ta on tb.id=ta.id
- join profiles_a pa on pa.id=ta.id
- join profiles_b pb on pa.id=pb.id
- where pb.mime_id= tbm.mime_id_b
- and pa.mime_id=tbm.mime_id_a
- and ta.tags_parse_exception=false
- and tb.tags_parse_exception=false
- group by mime_id_a, mime_id_b
- );
- </sql>
- <sql>
- update tags_by_mime tbm set tags_tr_b=(
- select sum(tb.tags_tr) as cnt from tags_b tb
- join tags_a ta on tb.id=ta.id
- join profiles_a pa on pa.id=ta.id
- join profiles_b pb on pa.id=pb.id
- where pb.mime_id= tbm.mime_id_b
- and pa.mime_id=tbm.mime_id_a
- and ta.tags_parse_exception=false
- and tb.tags_parse_exception=false
- group by mime_id_a, mime_id_b
- );
- </sql>
- <sql>
- update tags_by_mime tbm set tags_u_b=(
- select sum(tb.tags_u) as cnt from tags_b tb
- join tags_a ta on tb.id=ta.id
- join profiles_a pa on pa.id=ta.id
- join profiles_b pb on pa.id=pb.id
- where pb.mime_id= tbm.mime_id_b
- and pa.mime_id=tbm.mime_id_a
- and ta.tags_parse_exception=false
- and tb.tags_parse_exception=false
- group by mime_id_a, mime_id_b
- );
- </sql>
- <sql>
- update tags_by_mime tbm set tags_ul_b=(
- select sum(tb.tags_ul) as cnt from tags_b tb
- join tags_a ta on ta.id=tb.id
- join profiles_a pa on pa.id=ta.id
- join profiles_b pb on pa.id=pb.id
- where pb.mime_id= tbm.mime_id_b
- and pa.mime_id=tbm.mime_id_a
- and ta.tags_parse_exception=false
- and tb.tags_parse_exception=false
- group by mime_id_a, mime_id_b
- );
- </sql>
- <sql>drop table if exists tag_exceptions_by_mime</sql>
- <sql>create table tag_exceptions_by_mime (
- mime_id_a integer,
- mime_id_b integer,
- tag_exceptions_a integer,
- tag_exceptions_b integer)
- </sql>
- <sql>
- insert into tag_exceptions_by_mime (mime_id_a, mime_id_b,
- tag_exceptions_a, tag_exceptions_b)
- select ma.mime_id, mb.mime_id,0,0
- from profiles_a a
- join profiles_b b on a.id=b.id
- join mimes ma on ma.mime_id=a.mime_id
- join mimes mb on mb.mime_id=b.mime_id
- group by ma.mime_id, mb.mime_id
- </sql>
- <sql>
- update tag_exceptions_by_mime tebm set tag_exceptions_a=(
- select count(1) as cnt from tags_a ta
- join profiles_a pa on pa.id=ta.id
- join profiles_b pb on pa.id=pb.id
- where pb.mime_id= tebm.mime_id_b
- and pa.mime_id=tebm.mime_id_a
- and ta.tags_parse_exception=true
- group by mime_id_a, mime_id_b
- );
- </sql>
- <sql>
- update tag_exceptions_by_mime tebm set tag_exceptions_b=(
- select count(1) as cnt from tags_b tb
- join profiles_a pa on pa.id=tb.id
- join profiles_b pb on pa.id=pb.id
- where pb.mime_id= tebm.mime_id_b
- and pa.mime_id=tebm.mime_id_a
- and tb.tags_parse_exception=true
- group by mime_id_a, mime_id_b
- );
- </sql>
- <sql>
- drop table if exists parse_time_compared;
- </sql>
- <sql>
- create table parse_time_compared (
- mime_id_a integer,
- mime_id_b integer,
- total_a bigint,
- total_b bigint,
- prcnt_increase double
- );
- </sql>
- <sql>
- insert into parse_time_compared (mime_id_a, mime_id_b,
- total_a, total_b, prcnt_increase)
- select ma.mime_id, mb.mime_id,0,0,0.0
- from profiles_a a
- join profiles_b b on a.id=b.id
- join mimes ma on ma.mime_id=a.mime_id
- join mimes mb on mb.mime_id=b.mime_id
- group by ma.mime_id, mb.mime_id
- </sql>
- <sql>
- update parse_time_compared ptc set total_a=(
- select sum(pa.elapsed_time_millis) as total_a from profiles_a pa
- join profiles_b pb on pa.id=pb.id
- where pa.mime_id= ptc.mime_id_a
- and pb.mime_id=ptc.mime_id_b
- group by mime_id_a, mime_id_b)
- </sql>
- <sql>
- update parse_time_compared ptc set total_b=(
- select sum(pb.elapsed_time_millis) as total_b from profiles_b pb
- join profiles_a pa on pa.id=pb.id
- where pa.mime_id= ptc.mime_id_a
... 6754 lines suppressed ...