You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2018/09/25 16:21:33 UTC
[tika] branch master updated: TIKA-2736 -- improve reports for comparisons
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/master by this push:
new e8bcfce TIKA-2736 -- improve reports for comparisons
e8bcfce is described below
commit e8bcfced7acd0f3f3f5d253c529fe4f96277b2d6
Author: TALLISON <ta...@apache.org>
AuthorDate: Tue Sep 25 12:21:20 2018 -0400
TIKA-2736 -- improve reports for comparisons
---
.../src/main/resources/comparison-reports.xml | 110 +++++++++++----------
1 file changed, 56 insertions(+), 54 deletions(-)
diff --git a/tika-eval/src/main/resources/comparison-reports.xml b/tika-eval/src/main/resources/comparison-reports.xml
index 06b5c28..bba7f01 100644
--- a/tika-eval/src/main/resources/comparison-reports.xml
+++ b/tika-eval/src/main/resources/comparison-reports.xml
@@ -59,73 +59,72 @@
<!-- build exceptions comparison table -->
<sql>drop table if exists exceptions_compared</sql>
<sql>
- create table exceptions_compared
- (mime_id_a integer, mime_id_b integer,
- exceptions_a integer default 0,
- total_a integer default 0,
- percent_exceptions_a double default 0.0,
- exceptions_b integer default 0,
- total_b integer default 0,
- percent_exceptions_b double default 0.0);
+ create table exceptions_compared (
+ mime_id_a integer,
+ mime_id_b integer,
+ total integer,
+ exc_cnt_a integer,
+ exc_cnt_b integer,
+ exc_prcnt_a float,
+ exc_prcnt_b float,
+ notes varchar(12)
+ );
</sql>
<sql>
- insert into exceptions_compared (mime_id_a, mime_id_b)
- select ma.mime_id, mb.mime_id
- from profiles_a a
- join profiles_b b on a.id=b.id
- join mimes ma on ma.mime_id=a.mime_id
- join mimes mb on mb.mime_id=b.mime_id
+ insert into exceptions_compared (
+ select ma.mime_id, mb.mime_id, count(1) as total, 0, 0, 0.0, 0.0, ''
+ from profiles_a pa
+ join profiles_b pb on pa.id=pb.id
+ join mimes ma on pa.mime_id = ma.mime_id
+ join mimes mb on pb.mime_id = mb.mime_id
group by ma.mime_id, mb.mime_id
+ order by total desc );
</sql>
<sql>
- update exceptions_compared ec set total_a=(
- select count(1) as cnt from profiles_a pa
- join profiles_b pb on pa.id=pb.id
- where pa.mime_id= ec.mime_id_a
- and pb.mime_id=ec.mime_id_b
- group by pa.mime_id, pb.mime_id);
+ update exceptions_compared ec set
+ exc_cnt_a = (
+ select count(1) as cnt
+ from exceptions_a ea
+ join profiles_a pa on ea.id=pa.id
+ join profiles_b pb on pb.id=pa.id
+ join mimes ma on pa.mime_id=ma.mime_id
+ join mimes mb on pb.mime_id=mb.mime_id
+ where ma.mime_id= ec.mime_id_a and mb.mime_id=ec.mime_id_b
+ group by ma.mime_id, mb.mime_id);
</sql>
<sql>
- update exceptions_compared ec set total_b=(
- select count(1) as cnt from profiles_b pb
+ update exceptions_compared ec set
+ exc_cnt_b = (
+ select count(1) as cnt
+ from exceptions_b eb
+ join profiles_b pb on eb.id=pb.id
join profiles_a pa on pa.id=pb.id
- where pa.mime_id= ec.mime_id_a
- and pb.mime_id=ec.mime_id_b
- group by pb.mime_id, pa.mime_id);
+ join mimes ma on pa.mime_id=ma.mime_id
+ join mimes mb on pb.mime_id=mb.mime_id
+ where ma.mime_id= ec.mime_id_a and mb.mime_id=ec.mime_id_b
+ group by mb.mime_id, ma.mime_id);
</sql>
<sql>
- update exceptions_compared ec set exceptions_a=
- ( select count(1) as cnt from exceptions_a ea
- join profiles_a pa on pa.id=ea.id
- join profiles_b pb on pb.id=pa.id
- where pa.mime_id= ec.mime_id_a
- and pb.mime_id=ec.mime_id_b
- and parse_exception_id=0
- group by pa.mime_id, pb.mime_id);
+ update exceptions_compared
+ set exc_prcnt_a = cast(exc_cnt_a as decimal)/cast(total as decimal)
+ where total > 0;
</sql>
<sql>
- update exceptions_compared ec set exceptions_b=
- ( select count(1) as cnt from exceptions_b eb
- join profiles_b pb on pb.id=eb.id
- join profiles_a pa on pa.id=pb.id
- where pa.mime_id= ec.mime_id_a
- and pb.mime_id=ec.mime_id_b
- and parse_exception_id=0
- group by pb.mime_id, pa.mime_id);
+ update exceptions_compared
+ set exc_prcnt_b = cast(exc_cnt_b as decimal)/cast(total as decimal)
+ where total > 0;
</sql>
<sql>
update exceptions_compared
- set percent_exceptions_a =
- (cast (exceptions_a as decimal))/(cast (total_a as decimal))
- where total_a > 0
+ set notes = 'YAY!'
+ where total > 100 and (exc_prcnt_a-exc_prcnt_b) > 0.10;
</sql>
<sql>
update exceptions_compared
- set percent_exceptions_b =
- (cast (exceptions_b as decimal))/(cast (total_b as decimal))
- where total_b > 0
+ set notes = 'YIKES!'
+ where total > 100 and (exc_prcnt_b-exc_prcnt_a) > 0.10;
</sql>
<!-- build tmp common words table -->
@@ -860,13 +859,16 @@
includeSql="true">
<sql>
- select ma.mime_string, mb.mime_string, exceptions_a,
- total_a, percent_exceptions_a,
- exceptions_b, total_b, percent_exceptions_b
- from exceptions_compared c
- join mimes ma on ma.mime_id=c.mime_id_a
- join mimes mb on mb.mime_id=c.mime_id_b
- order by percent_exceptions_b desc, total_b desc;
+ select ma.mime_string as mime_string_a, mb.mime_string as mime_string_b,
+ total, exc_cnt_a,
+ exc_cnt_b,
+ exc_prcnt_a,
+ exc_prcnt_b, notes
+
+ from exceptions_compared e
+ join mimes ma on ma.mime_id=e.mime_id_a
+ join mimes mb on mb.mime_id=e.mime_id_b
+ order by (exc_prcnt_b-exc_prcnt_a) desc, total desc;
</sql>
</report>
<!-- <report reportName="MD5 Duplicate Counts A"