You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2018/12/14 16:37:36 UTC

[tika] 01/02: TIKA-2798 -- improve reporting for attachment diffs

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 398bcd8566d3028a9554a459f5c49a51fb45528f
Author: TALLISON <ta...@apache.org>
AuthorDate: Fri Dec 14 11:16:18 2018 -0500

    TIKA-2798 -- improve reporting for attachment diffs
---
 .../src/main/resources/comparison-reports.xml      | 40 +++++++++++++++++++---
 1 file changed, 35 insertions(+), 5 deletions(-)

diff --git a/tika-eval/src/main/resources/comparison-reports.xml b/tika-eval/src/main/resources/comparison-reports.xml
index bba7f01..10fd9e3 100644
--- a/tika-eval/src/main/resources/comparison-reports.xml
+++ b/tika-eval/src/main/resources/comparison-reports.xml
@@ -929,8 +929,8 @@
         </report>
     -->
 
-    <report reportName="Attachment Diffs"
-            reportFilename="attachments/attachment_diffs.xlsx"
+    <report reportName="Attachment Diffs no Exceptions"
+            reportFilename="attachments/attachment_diffs_no_exceptions.xlsx"
             format="xlsx"
             includeSql="true">
 
@@ -941,8 +941,7 @@
             mb.mime_string as MIME_STRING_B,
             pa.num_attachments as NUM_ATTACHMENTS_A,
             pb.num_attachments as NUM_ATTACHMENTS_B,
-            ea.parse_exception_id as EXCEPTION_ID_A,
-            eb.parse_exception_id as EXCEPTION_ID_B
+            pb.num_attachments-pa.num_attachments as NUM_ATTACHMENTS_DIFF_IN_B
             from profiles_a pa
             join profiles_b pb on pa.id= pb.id
             join containers c on pa.container_id=c.container_id
@@ -955,7 +954,38 @@
             eb.parse_exception_id is null
             and pa.num_attachments &lt;&gt; pb.num_attachments
             order by ma.mime_string, pb.num_attachments-pa.num_attachments
-            limit 1000;
+            limit 10000;
+        </sql>
+    </report>
+
+    <report reportName="Attachment Diffs with exceptions"
+            reportFilename="attachments/attachment_diffs_with_exceptions.xlsx"
+            format="xlsx"
+            includeSql="true">
+
+        <sql>
+            select file_path,
+            c.length as CONTAINER_LENGTH,
+            ma.mime_string as MIME_STRING_A,
+            mb.mime_string as MIME_STRING_B,
+            pa.num_attachments as NUM_ATTACHMENTS_A,
+            pb.num_attachments as NUM_ATTACHMENTS_B,
+            pb.num_attachments-pa.num_attachments as NUM_ATTACHMENTS_DIFF_IN_B,
+            refea.parse_exception_description as PARSE_EXCEPTION_A,
+            refeb.parse_exception_description as PARSE_EXCEPTION_B
+            from profiles_a pa
+            join profiles_b pb on pa.id= pb.id
+            join containers c on pa.container_id=c.container_id
+            join mimes ma on pa.mime_id=ma.mime_id
+            join mimes mb on pb.mime_id=mb.mime_id
+            left join exceptions_a ea on ea.id=pa.id
+            left join exceptions_b eb on eb.id=pb.id
+            left join ref_parse_exception_types refea on ea.parse_exception_id=refea.parse_exception_id
+            left join ref_parse_exception_types refeb on eb.parse_exception_id=refeb.parse_exception_id
+            where pa.is_embedded=false
+            and pa.num_attachments &lt;&gt; pb.num_attachments
+            order by ma.mime_string, pb.num_attachments-pa.num_attachments
+            limit 10000;
         </sql>
     </report>