You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2018/09/25 16:21:33 UTC

[tika] branch master updated: TIKA-2736 -- improve reports for comparisons

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/master by this push:
     new e8bcfce  TIKA-2736 -- improve reports for comparisons
e8bcfce is described below

commit e8bcfced7acd0f3f3f5d253c529fe4f96277b2d6
Author: TALLISON <ta...@apache.org>
AuthorDate: Tue Sep 25 12:21:20 2018 -0400

    TIKA-2736 -- improve reports for comparisons
---
 .../src/main/resources/comparison-reports.xml      | 110 +++++++++++----------
 1 file changed, 56 insertions(+), 54 deletions(-)

diff --git a/tika-eval/src/main/resources/comparison-reports.xml b/tika-eval/src/main/resources/comparison-reports.xml
index 06b5c28..bba7f01 100644
--- a/tika-eval/src/main/resources/comparison-reports.xml
+++ b/tika-eval/src/main/resources/comparison-reports.xml
@@ -59,73 +59,72 @@
         <!-- build exceptions comparison table -->
         <sql>drop table if exists exceptions_compared</sql>
         <sql>
-            create table exceptions_compared
-            (mime_id_a integer, mime_id_b integer,
-            exceptions_a integer default 0,
-            total_a integer default 0,
-            percent_exceptions_a double default 0.0,
-            exceptions_b integer default 0,
-            total_b integer default 0,
-            percent_exceptions_b double default 0.0);
+            create table exceptions_compared (
+            mime_id_a integer,
+            mime_id_b integer,
+            total integer,
+            exc_cnt_a integer,
+            exc_cnt_b integer,
+            exc_prcnt_a float,
+            exc_prcnt_b float,
+            notes varchar(12)
+            );
         </sql>
         <sql>
-            insert into exceptions_compared (mime_id_a, mime_id_b)
-            select ma.mime_id, mb.mime_id
-            from profiles_a a
-            join profiles_b b on a.id=b.id
-            join mimes ma on ma.mime_id=a.mime_id
-            join mimes mb on mb.mime_id=b.mime_id
+            insert into exceptions_compared (
+            select ma.mime_id, mb.mime_id, count(1) as total, 0, 0, 0.0, 0.0, ''
+            from profiles_a pa
+            join profiles_b pb on pa.id=pb.id
+            join mimes ma on pa.mime_id = ma.mime_id
+            join mimes mb on pb.mime_id = mb.mime_id
             group by ma.mime_id, mb.mime_id
+            order by total desc );
         </sql>
 
         <sql>
-            update exceptions_compared ec set total_a=(
-            select count(1) as cnt from profiles_a pa
-            join profiles_b pb on pa.id=pb.id
-            where pa.mime_id= ec.mime_id_a
-            and pb.mime_id=ec.mime_id_b
-            group by pa.mime_id, pb.mime_id);
+            update exceptions_compared ec set
+            exc_cnt_a = (
+            select count(1) as cnt
+            from exceptions_a ea
+            join profiles_a pa on ea.id=pa.id
+            join profiles_b pb on pb.id=pa.id
+            join mimes ma on pa.mime_id=ma.mime_id
+            join mimes mb on pb.mime_id=mb.mime_id
+            where ma.mime_id= ec.mime_id_a and mb.mime_id=ec.mime_id_b
+            group by ma.mime_id, mb.mime_id);
         </sql>
         <sql>
-            update exceptions_compared ec set total_b=(
-            select count(1) as cnt from profiles_b pb
+            update exceptions_compared ec set
+            exc_cnt_b = (
+            select count(1) as cnt
+            from exceptions_b eb
+            join profiles_b pb on eb.id=pb.id
             join profiles_a pa on pa.id=pb.id
-            where pa.mime_id= ec.mime_id_a
-            and pb.mime_id=ec.mime_id_b
-            group by pb.mime_id, pa.mime_id);
+            join mimes ma on pa.mime_id=ma.mime_id
+            join mimes mb on pb.mime_id=mb.mime_id
+            where ma.mime_id= ec.mime_id_a and mb.mime_id=ec.mime_id_b
+            group by mb.mime_id, ma.mime_id);
         </sql>
         <sql>
-            update exceptions_compared ec set exceptions_a=
-            ( select count(1) as cnt from exceptions_a ea
-            join profiles_a pa on pa.id=ea.id
-            join profiles_b pb on pb.id=pa.id
-            where pa.mime_id= ec.mime_id_a
-            and pb.mime_id=ec.mime_id_b
-            and parse_exception_id=0
-            group by pa.mime_id, pb.mime_id);
+            update exceptions_compared
+            set exc_prcnt_a = cast(exc_cnt_a as decimal)/cast(total as decimal)
+            where total > 0;
         </sql>
         <sql>
-            update exceptions_compared ec set exceptions_b=
-            ( select count(1) as cnt from exceptions_b eb
-            join profiles_b pb on pb.id=eb.id
-            join profiles_a pa on pa.id=pb.id
-            where pa.mime_id= ec.mime_id_a
-            and pb.mime_id=ec.mime_id_b
-            and parse_exception_id=0
-            group by pb.mime_id, pa.mime_id);
+            update exceptions_compared
+            set exc_prcnt_b = cast(exc_cnt_b as decimal)/cast(total as decimal)
+            where total > 0;
         </sql>
 
         <sql>
             update exceptions_compared
-            set percent_exceptions_a =
-            (cast (exceptions_a as decimal))/(cast (total_a as decimal))
-            where total_a &gt; 0
+            set notes = 'YAY!'
+            where total > 100 and (exc_prcnt_a-exc_prcnt_b) > 0.10;
         </sql>
         <sql>
             update exceptions_compared
-            set percent_exceptions_b =
-            (cast (exceptions_b as decimal))/(cast (total_b as decimal))
-            where total_b &gt; 0
+            set notes = 'YIKES!'
+            where total > 100 and (exc_prcnt_b-exc_prcnt_a) > 0.10;
         </sql>
 
         <!-- build tmp common words table -->
@@ -860,13 +859,16 @@
             includeSql="true">
 
         <sql>
-            select ma.mime_string, mb.mime_string, exceptions_a,
-            total_a, percent_exceptions_a,
-            exceptions_b, total_b, percent_exceptions_b
-            from exceptions_compared c
-            join mimes ma on ma.mime_id=c.mime_id_a
-            join mimes mb on mb.mime_id=c.mime_id_b
-            order by percent_exceptions_b desc, total_b desc;
+            select ma.mime_string as mime_string_a, mb.mime_string as mime_string_b,
+            total, exc_cnt_a,
+            exc_cnt_b,
+            exc_prcnt_a,
+            exc_prcnt_b, notes
+
+            from exceptions_compared e
+            join mimes ma on ma.mime_id=e.mime_id_a
+            join mimes mb on mb.mime_id=e.mime_id_b
+            order by (exc_prcnt_b-exc_prcnt_a) desc, total desc;
         </sql>
     </report>
     <!--    <report reportName="MD5 Duplicate Counts A"