You are viewing a plain text version of this content. The canonical (hyperlinked) version is available in the mailing-list archive.
Posted to commits@tika.apache.org by ta...@apache.org on 2020/04/14 14:57:48 UTC
[tika] branch master updated: add optional postgres dialect for
comparison reports; improve initialization of ref tables in tika-eval
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/master by this push:
new e479995 add optional postgres dialect for comparison reports; improve initialization of ref tables in tika-eval
e479995 is described below
commit e47999588e65b28408182092f46f5ff32bfc0aa8
Author: tallison <ta...@apache.org>
AuthorDate: Tue Apr 14 10:57:25 2020 -0400
add optional postgres dialect for comparison reports;
improve initialization of ref tables in tika-eval
---
.../tika/eval/batch/EvalConsumerBuilder.java | 37 +++++++++-------
...rison-reports.xml => comparison-reports-pg.xml} | 49 +++++++++++-----------
.../src/main/resources/comparison-reports.xml | 3 +-
3 files changed, 50 insertions(+), 39 deletions(-)
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/batch/EvalConsumerBuilder.java b/tika-eval/src/main/java/org/apache/tika/eval/batch/EvalConsumerBuilder.java
index 694b05e..b50d4a1 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/batch/EvalConsumerBuilder.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/batch/EvalConsumerBuilder.java
@@ -19,12 +19,11 @@ package org.apache.tika.eval.batch;
import java.io.IOException;
import java.sql.Connection;
+import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.HashMap;
import java.util.List;
-import java.util.Locale;
import java.util.Map;
-import java.util.Set;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.atomic.AtomicInteger;
@@ -41,9 +40,13 @@ import org.apache.tika.eval.io.ExtractReader;
import org.apache.tika.eval.io.ExtractReaderException;
import org.apache.tika.eval.io.IDBWriter;
import org.apache.tika.util.PropsUtil;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
public abstract class EvalConsumerBuilder {
- private AtomicInteger count = new AtomicInteger(0);
+
+ private static final Logger LOG = LoggerFactory.getLogger(EvalConsumerBuilder.class);
+
protected ArrayBlockingQueue<FileResource> queue;
Map<String, String> localAttrs;
JDBCUtil dbUtil;
@@ -75,7 +78,7 @@ public abstract class EvalConsumerBuilder {
//step 3. create mime buffer
this.mimeBuffer = new MimeBuffer(dbUtil.getConnection(), TikaConfig.getDefaultConfig());
- //step 4. populate the reference tabless
+ //step 4. populate the reference tables
populateRefTables();
return mimeBuffer;
@@ -100,21 +103,27 @@ public abstract class EvalConsumerBuilder {
protected abstract void addErrorLogTablePairs(DBConsumersManager manager);
public void populateRefTables() throws IOException, SQLException {
- //test for one ref table. If it exists, don't populate ref tables
- //TODO: test one at a time
- boolean tableExists = false;
+ boolean refTablesPopulated = true;
try (Connection connection = dbUtil.getConnection()) {
- Set<String> tables = dbUtil.getTables(connection);
- if (tables.contains(
- AbstractProfiler.REF_PARSE_ERROR_TYPES.getName().toLowerCase(Locale.US)
- )) {
- tableExists = true;
+ for (TableInfo tableInfo : getRefTableInfos()) {
+ int rows = 0;
+ try (ResultSet rs = connection.createStatement().executeQuery("select * from "+
+ tableInfo.getName())) {
+ while (rs.next()) {
+ rows++;
+ }
+ }
+ if (rows == 0) {
+ refTablesPopulated = false;
+ break;
+ }
+
}
} catch (SQLException e) {
//swallow
}
-
- if (tableExists) {
+ if (refTablesPopulated) {
+ LOG.info("ref tables are already populated");
return;
}
diff --git a/tika-eval/src/main/resources/comparison-reports.xml b/tika-eval/src/main/resources/comparison-reports-pg.xml
similarity index 97%
copy from tika-eval/src/main/resources/comparison-reports.xml
copy to tika-eval/src/main/resources/comparison-reports-pg.xml
index e84454a..5bcf88e 100644
--- a/tika-eval/src/main/resources/comparison-reports.xml
+++ b/tika-eval/src/main/resources/comparison-reports-pg.xml
@@ -25,24 +25,24 @@
<before>
<sql>drop table if exists md5_multiples_tmp_a</sql>
- <sql>create table md5_multiples_tmp_a (MD5 char(32), cnt int)
+ <sql>create table md5_multiples_tmp_a (MD5, cnt)
as
- select md5, count(1) cnt
+ select md5, count(1) as cnt
from profiles_a
where md5 is not null
group by md5
- having cnt > 1
+ having count(1) > 1
order by cnt desc
</sql>
<sql>drop table if exists md5_multiples_tmp_b</sql>
- <sql>create table md5_multiples_tmp_b (MD5 char(32), cnt int)
+ <sql>create table md5_multiples_tmp_b (MD5, cnt)
as
select md5, count(1) cnt
from profiles_b
where md5 is not null
group by md5
- having cnt > 1
+ having count(1) > 1
order by cnt desc
</sql>
<!-- build mime indexes -->
@@ -133,12 +133,12 @@
create table token_counts_compared
(mime_id_a integer,
mime_id_b integer,
- num_tokens_a long default 0,
- num_tokens_b long default 0,
- num_alphabetic_tokens_a long default 0,
- num_alphabetic_tokens_b long default 0,
- num_common_tokens_a long default 0,
- num_common_tokens_b long default 0
+ num_tokens_a bigint default 0,
+ num_tokens_b bigint default 0,
+ num_alphabetic_tokens_a bigint default 0,
+ num_alphabetic_tokens_b bigint default 0,
+ num_common_tokens_a bigint default 0,
+ num_common_tokens_b bigint default 0
);
</sql>
<sql>
@@ -674,7 +674,7 @@
mime_id_b integer,
total_a bigint,
total_b bigint,
- prcnt_increase double
+ prcnt_increase double precision
);
</sql>
<sql>
@@ -704,7 +704,7 @@
group by mime_id_a, mime_id_b)
</sql>
<sql>
- update parse_time_compared ptc set prcnt_increase=(
+ update parse_time_compared ptc set prcnt_increase=(100.0 *
cast(total_b as decimal)/cast(total_a as decimal))
where total_a > 0;
</sql>
@@ -913,7 +913,7 @@
join mimes m on m.mime_id=p.mime_id
join ref_parse_exception_types r on
r.parse_exception_id=e.parse_exception_id
- group by p.mime_id, parse_exception_description
+ group by m.mime_string, parse_exception_description
order by MIME_TYPE, EXCEPTION_TYPE
</sql>
</report>
@@ -932,7 +932,7 @@
join mimes m on m.mime_id=p.mime_id
join ref_parse_exception_types r on
r.parse_exception_id=e.parse_exception_id
- group by p.mime_id, parse_exception_description
+ group by m.mime_string, parse_exception_description
order by MIME_TYPE, EXCEPTION_TYPE
</sql>
</report>
@@ -963,7 +963,7 @@
left join exceptions_a ea on ca.id = ea.id
where eb.orig_stack_trace is not null
and ea.orig_stack_trace is null
- order by ca.num_common_tokens - ifnull(cb.num_common_tokens,0) desc
+ order by ca.num_common_tokens - coalesce(cb.num_common_tokens,0) desc
</sql>
</report>
@@ -1004,7 +1004,7 @@
from exceptions_a ea
left join exceptions_b eb on ea.id = eb.id
join profiles_a pa on pa.id=ea.id
- join profiles_b pb on pb.id=pa.id //this ensures that files were actually processed in both runs
+ join profiles_b pb on pb.id=pa.id --this ensures that files were actually processed in both runs
join containers c on pa.container_id=c.container_id
join mimes ma on ma.mime_id=pa.mime_id
join mimes mb on mb.mime_id=pb.mime_id
@@ -1211,7 +1211,7 @@
from exceptions_a e
join ref_parse_exception_types t on
t.parse_exception_id=e.parse_exception_id
- group by e.parse_exception_id
+ group by t.parse_exception_description
</sql>
</report>
<report reportName="parseExceptionTypesB"
@@ -1223,7 +1223,7 @@
from exceptions_b e
join ref_parse_exception_types t on
t.parse_exception_id=e.parse_exception_id
- group by e.parse_exception_id
+ group by t.parse_exception_description
</sql>
</report>
@@ -1245,8 +1245,8 @@
ca.num_common_tokens as NUM_COMMON_TOKENS_A,
cb.common_tokens_lang as COMMON_TOKENS_LANG_B,
cb.num_common_tokens as NUM_COMMON_TOKENS_B,
- ifnull(cb.num_common_tokens,0)-
- ifnull(ca.num_common_tokens, 0) as NUM_COMMON_TOKENS_DIFF_IN_B,
+ coalesce(cb.num_common_tokens,0)-
+ coalesce(ca.num_common_tokens, 0) as NUM_COMMON_TOKENS_DIFF_IN_B,
ca.top_n_tokens as TOP_N_TOKENS_A,
cb.top_n_tokens as TOP_N_TOKENS_B,
ca.unicode_char_blocks as UNICODE_CHAR_BLOCKS_A,
@@ -1295,8 +1295,8 @@
ca.num_common_tokens as NUM_COMMON_TOKENS_A,
cb.common_tokens_lang as COMMON_TOKENS_LANG_B,
cb.num_common_tokens as NUM_COMMON_TOKENS_B,
- ifnull(cb.num_common_tokens,0)-
- ifnull(ca.num_common_tokens, 0) as NUM_COMMON_TOKENS_DIFF_IN_B,
+ coalesce(cb.num_common_tokens,0)-
+ coalesce(ca.num_common_tokens, 0) as NUM_COMMON_TOKENS_DIFF_IN_B,
ca.top_n_tokens as TOP_N_TOKENS_A,
cb.top_n_tokens as TOP_N_TOKENS_B,
ca.unicode_char_blocks as UNICODE_CHAR_BLOCKS_A,
@@ -1332,7 +1332,7 @@
num_tokens_a, num_tokens_b,
num_alphabetic_tokens_a, num_alphabetic_tokens_b,
num_common_tokens_a, num_common_tokens_b,
- ifnull(num_common_tokens_b, 0)-ifnull(num_common_tokens_a, 0) as change_in_common_tokens_b
+ coalesce(num_common_tokens_b, 0)-coalesce(num_common_tokens_a, 0) as change_in_common_tokens_b
from token_counts_compared tcc
join mimes ma on tcc.mime_id_a = ma.mime_id
join mimes mb on tcc.mime_id_b = mb.mime_id
@@ -1722,6 +1722,7 @@
from parse_time_compared ptc
join mimes ma on ptc.mime_id_a=ma.mime_id
join mimes mb on ptc.mime_id_b=mb.mime_id
+ where TOTAL_A > 1000 AND TOTAL_B > 1000 -- only show comparisons if > a second
order by prcnt_increase desc
</sql>
</report>
diff --git a/tika-eval/src/main/resources/comparison-reports.xml b/tika-eval/src/main/resources/comparison-reports.xml
index e84454a..e23ec5e 100644
--- a/tika-eval/src/main/resources/comparison-reports.xml
+++ b/tika-eval/src/main/resources/comparison-reports.xml
@@ -704,7 +704,7 @@
group by mime_id_a, mime_id_b)
</sql>
<sql>
- update parse_time_compared ptc set prcnt_increase=(
+ update parse_time_compared ptc set prcnt_increase=(100.0 *
cast(total_b as decimal)/cast(total_a as decimal))
where total_a > 0;
</sql>
@@ -1722,6 +1722,7 @@
from parse_time_compared ptc
join mimes ma on ptc.mime_id_a=ma.mime_id
join mimes mb on ptc.mime_id_b=mb.mime_id
+ where TOTAL_A > 1000 AND TOTAL_B > 1000 -- only show comparisons if > a second
order by prcnt_increase desc
</sql>
</report>