You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2019/04/11 21:09:01 UTC

[tika] branch master updated: TIKA-2852 -- add reports for missing files/attachments by mime

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/master by this push:
     new 5156751  TIKA-2852 -- add reports for missing files/attachments by mime
5156751 is described below

commit 5156751a572f2abbea186987e70e0652ab695454
Author: TALLISON <ta...@apache.org>
AuthorDate: Thu Apr 11 17:08:47 2019 -0400

    TIKA-2852 -- add reports for missing files/attachments by mime
---
 .../src/main/resources/comparison-reports.xml      | 90 ++++++++++++++++++++++
 1 file changed, 90 insertions(+)

diff --git a/tika-eval/src/main/resources/comparison-reports.xml b/tika-eval/src/main/resources/comparison-reports.xml
index 9896832..c0f96aa 100644
--- a/tika-eval/src/main/resources/comparison-reports.xml
+++ b/tika-eval/src/main/resources/comparison-reports.xml
@@ -1459,6 +1459,96 @@
         </sql>
     </report>
 
+    <report reportName="All files missing in B by Mime"
+            reportFilename="attachments/all_files_missing_in_B_by_mime.xlsx"
+            format="xlsx"
+            includeSql="true">
+        <!-- Per mime_string: count of profiles_a rows with no profiles_b row of the same id (left join, pb.id is null); largest count first. -->
+        <sql>
+            select mime_string, count(1) as cnt
+            from profiles_a pa
+            left join profiles_b pb on pa.id=pb.id
+            join mimes m on pa.mime_id=m.mime_id
+            where pb.id is null
+            group by mime_string
+            order by cnt desc
+        </sql>
+    </report>
+    <report reportName="Container files missing in B by Mime"
+            reportFilename="attachments/container_files_missing_in_B_by_mime.xlsx"
+            format="xlsx"
+            includeSql="true">
+        <!-- Per mime_string: count of profiles_a rows with is_embedded=false and no profiles_b row of the same id; largest count first. -->
+        <sql>
+            select mime_string, count(1) as cnt
+            from profiles_a pa
+            left join profiles_b pb on pa.id=pb.id
+            join mimes m on pa.mime_id=m.mime_id
+            where pb.id is null and pa.is_embedded=false
+            group by mime_string
+            order by cnt desc
+        </sql>
+    </report>
+    <report reportName="Embedded files missing in B by Mime"
+            reportFilename="attachments/embedded_files_missing_in_B_by_mime.xlsx"
+            format="xlsx"
+            includeSql="true">
+        <!-- Per mime_string: count of profiles_a rows with is_embedded=true and no profiles_b row of the same id; largest count first. -->
+        <sql>
+            select mime_string, count(1) as cnt
+            from profiles_a pa
+            left join profiles_b pb on pa.id=pb.id
+            join mimes m on pa.mime_id=m.mime_id
+            where pb.id is null and pa.is_embedded=true
+            group by mime_string
+            order by cnt desc
+        </sql>
+    </report>
+    <report reportName="All files missing in A by Mime"
+            reportFilename="attachments/all_files_missing_in_A_by_mime.xlsx"
+            format="xlsx"
+            includeSql="true">
+        <!-- Per mime_string: count of profiles_b rows with no profiles_a row of the same id (left join, pa.id is null); largest count first. -->
+        <sql>
+            select mime_string, count(1) as cnt
+            from profiles_b pb
+            left join profiles_a pa on pb.id=pa.id
+            join mimes m on pb.mime_id=m.mime_id
+            where pa.id is null
+            group by mime_string
+            order by cnt desc
+        </sql>
+    </report>
+    <report reportName="Container files missing in A by Mime"
+            reportFilename="attachments/container_files_missing_in_A_by_mime.xlsx"
+            format="xlsx"
+            includeSql="true">
+        <!-- Per mime_string: count of profiles_b rows with is_embedded=false and no profiles_a row of the same id; largest count first. -->
+        <sql>
+            select mime_string, count(1) as cnt
+            from profiles_b pb
+            left join profiles_a pa on pb.id=pa.id
+            join mimes m on pb.mime_id=m.mime_id
+            where pa.id is null and pb.is_embedded=false
+            group by mime_string
+            order by cnt desc
+        </sql>
+    </report>
+    <report reportName="Embedded files missing in A by Mime"
+            reportFilename="attachments/embedded_files_missing_in_A_by_mime.xlsx"
+            format="xlsx"
+            includeSql="true">
+        <!-- Per mime_string: count of profiles_b rows with is_embedded=true and no profiles_a row of the same id; largest count first. -->
+        <sql>
+            select mime_string, count(1) as cnt
+            from profiles_b pb
+            left join profiles_a pa on pb.id=pa.id
+            join mimes m on pb.mime_id=m.mime_id
+            where pa.id is null and pb.is_embedded=true
+            group by mime_string
+            order by cnt desc
+        </sql>
+    </report>
     <!-- metadata values -->
     <report reportName="Metadata Value Diffs"
             reportFilename="metadata/metadata_value_count_diffs.xlsx"