You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2017/04/06 17:58:15 UTC

[tika] branch 2.x updated: TIKA-2318

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch 2.x
in repository https://gitbox.apache.org/repos/asf/tika.git

The following commit(s) were added to refs/heads/2.x by this push:
       new  96a8ddd   TIKA-2318
96a8ddd is described below

commit 96a8ddd84891d4773d8043df5e8b41ce07de9515
Author: tballison <ta...@mitre.org>
AuthorDate: Thu Apr 6 13:58:07 2017 -0400

    TIKA-2318
---
 .../src/main/resources/comparison-reports.xml      | 103 +++++++++++++++------
 1 file changed, 74 insertions(+), 29 deletions(-)

diff --git a/tika-eval/src/main/resources/comparison-reports.xml b/tika-eval/src/main/resources/comparison-reports.xml
index b447335..e59d474 100644
--- a/tika-eval/src/main/resources/comparison-reports.xml
+++ b/tika-eval/src/main/resources/comparison-reports.xml
@@ -134,71 +134,90 @@
         <sql>drop table if exists token_counts_compared</sql>
         <sql>
             create table token_counts_compared
-            (mime_type_id integer primary key,
+            (mime_type_id_a integer,
+            mime_type_id_b integer,
             num_tokens_a integer default 0,
-            num_alphabetic_tokens_a integer default 0,
-            num_common_tokens_a integer default 0,
             num_tokens_b integer default 0,
+            num_alphabetic_tokens_a integer default 0,
             num_alphabetic_tokens_b integer default 0,
+            num_common_tokens_a integer default 0,
             num_common_tokens_b integer default 0,
             );
         </sql>
         <sql>
-            insert into token_counts_compared (mime_type_id)
-            select mime_type_id from mimes;
+            insert into token_counts_compared (mime_type_id_a, mime_type_id_b)
+            select ma.mime_type_id, mb.mime_type_id
+            from profiles_a a
+            join profiles_b b on a.id=b.id
+            join mimes ma on ma.mime_type_id=a.mime_type_id
+            join mimes mb on mb.mime_type_id=b.mime_type_id
+            group by ma.mime_type_id, mb.mime_type_id
+
         </sql>
 
         <sql>
             update token_counts_compared tcc set num_tokens_a=(
-            select sum(num_tokens) as cnt from profiles_a p
-            join contents_a c on c.id = p.id
-            where p.mime_type_id= tcc.mime_type_id
-            group by mime_type_id
+            select sum(num_tokens) as cnt from profiles_a pa
+            join profiles_b pb on pa.id=pb.id
+            join contents_a c on c.id = pa.id
+            where pb.mime_type_id= tcc.mime_type_id_b
+            and pa.mime_type_id=tcc.mime_type_id_a
+            group by mime_type_id_a, mime_type_id_b
             );
         </sql>
 
         <sql>
             update token_counts_compared tcc set num_tokens_b=(
-            select sum(num_tokens) as cnt from profiles_b p
-            join contents_a c on c.id = p.id
-            where p.mime_type_id= tcc.mime_type_id
-            group by mime_type_id
+            select sum(num_tokens) as cnt from profiles_b pb
+            join profiles_a pa on pa.id=pb.id
+            join contents_b c on c.id = pb.id
+            where pb.mime_type_id= tcc.mime_type_id_b
+            and pa.mime_type_id=tcc.mime_type_id_a
+            group by mime_type_id_a, mime_type_id_b
             );
         </sql>
 
         <sql>
             update token_counts_compared tcc set num_alphabetic_tokens_a=(
-            select sum(num_alphabetic_tokens) as cnt from profiles_a p
-            join contents_a c on c.id = p.id
-            where p.mime_type_id= tcc.mime_type_id
-            group by mime_type_id
+            select sum(num_alphabetic_tokens) as cnt from profiles_a pa
+            join profiles_b pb on pa.id=pb.id
+            join contents_a c on c.id = pa.id
+            where pb.mime_type_id= tcc.mime_type_id_b
+            and pa.mime_type_id=tcc.mime_type_id_a
+            group by mime_type_id_a, mime_type_id_b
             );
         </sql>
 
         <sql>
             update token_counts_compared tcc set num_alphabetic_tokens_b=(
-            select sum(num_alphabetic_tokens) as cnt from profiles_b p
-            join contents_b c on c.id = p.id
-            where p.mime_type_id= tcc.mime_type_id
-            group by mime_type_id
+            select sum(num_alphabetic_tokens) as cnt from profiles_b pb
+            join profiles_a pa on pb.id=pa.id
+            join contents_b c on c.id = pb.id
+            where pb.mime_type_id= tcc.mime_type_id_b
+            and pa.mime_type_id=tcc.mime_type_id_a
+            group by mime_type_id_a, mime_type_id_b
             );
         </sql>
 
         <sql>
             update token_counts_compared tcc set num_common_tokens_a=(
-            select sum(num_common_tokens) as cnt from profiles_a p
-            join contents_a c on c.id = p.id
-            where p.mime_type_id= tcc.mime_type_id
-            group by mime_type_id
+            select sum(num_common_tokens) as cnt from profiles_a pa
+            join profiles_b pb on pa.id=pb.id
+            join contents_a c on c.id = pa.id
+            where pb.mime_type_id= tcc.mime_type_id_b
+            and pa.mime_type_id=tcc.mime_type_id_a
+            group by mime_type_id_a, mime_type_id_b
             );
         </sql>
 
         <sql>
             update token_counts_compared tcc set num_common_tokens_b=(
-            select sum(num_common_tokens) as cnt from profiles_b p
-            join contents_b c on c.id = p.id
-            where p.mime_type_id= tcc.mime_type_id
-            group by mime_type_id
+            select sum(num_common_tokens) as cnt from profiles_b pb
+            join profiles_a pa on pa.id=pb.id
+            join contents_b c on c.id = pb.id
+            where pb.mime_type_id= tcc.mime_type_id_b
+            and pa.mime_type_id=tcc.mime_type_id_a
+            group by mime_type_id_a, mime_type_id_b
             );
         </sql>
 
@@ -326,6 +345,8 @@
         </sql>
     </report>
 
+
+    <!-- Exceptions -->
     <report reportName="AllExceptionsByMimeA"
             reportFilename="exceptions/exceptions_by_mime_A.xlsx"
             format="xlsx"
@@ -702,6 +723,8 @@
             ca.num_common_tokens as NUM_COMMON_TOKENS_A,
             cb.common_tokens_lang as COMMON_TOKENS_LANG_B,
             cb.num_common_tokens as NUM_COMMON_TOKENS_B,
+            ifnull(cb.num_common_tokens,0)-
+            ifnull(ca.num_common_tokens, 0) as NUM_COMMON_TOKENS_DIFF_IN_B,
             ca.top_n_tokens as TOP_N_TOKENS_A,
             cb.top_n_tokens as TOP_N_TOKENS_B,
             ca.unicode_char_blocks as UNICODE_CHAR_BLOCKS_A,
@@ -745,6 +768,8 @@
             ca.num_common_tokens as NUM_COMMON_TOKENS_A,
             cb.common_tokens_lang as COMMON_TOKENS_LANG_B,
             cb.num_common_tokens as NUM_COMMON_TOKENS_B,
+            ifnull(cb.num_common_tokens,0)-
+            ifnull(ca.num_common_tokens, 0) as NUM_COMMON_TOKENS_DIFF_IN_B,
             ca.top_n_tokens as TOP_N_TOKENS_A,
             cb.top_n_tokens as TOP_N_TOKENS_B,
             ca.unicode_char_blocks as UNICODE_CHAR_BLOCKS_A,
@@ -769,6 +794,26 @@
             limit 100000
         </sql>
     </report>
+    <!-- Token totals per (mime A, mime B) pair from token_counts_compared; rows with the largest gain in B's common tokens come first. -->
+    <report reportName="CommonTokenComparisonsByMimeType"
+            reportFilename="content/common_token_comparisons_by_mime.xlsx"
+            format="xlsx"
+            includeSql="true">
+
+        <sql>
+            select ma.mime_string as MIME_STRING_A, mb.mime_string as MIME_STRING_B,
+            num_tokens_a, num_tokens_b,
+            num_alphabetic_tokens_a, num_alphabetic_tokens_b,
+            num_common_tokens_a, num_common_tokens_b,
+            ifnull(num_common_tokens_b, 0)-ifnull(num_common_tokens_a, 0) as change_in_common_tokens_b
+            from token_counts_compared tcc
+            join mimes ma on tcc.mime_type_id_a = ma.mime_type_id
+            join mimes mb on tcc.mime_type_id_b = mb.mime_type_id
+            order by change_in_common_tokens_b desc
+        </sql>
+    </report>
+
+
     <report reportName="ExceptionComparisonsByMimeType"
             reportFilename="exceptions/exceptions_compared_by_mime_type.xlsx"
             format="xlsx"

-- 
To stop receiving notification emails like this one, please contact
commits@tika.apache.org.