You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2018/12/14 16:37:36 UTC
[tika] 01/02: TIKA-2798 -- improve reporting for attachment diffs
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git
commit 398bcd8566d3028a9554a459f5c49a51fb45528f
Author: TALLISON <ta...@apache.org>
AuthorDate: Fri Dec 14 11:16:18 2018 -0500
TIKA-2798 -- improve reporting for attachment diffs
---
.../src/main/resources/comparison-reports.xml | 40 +++++++++++++++++++---
1 file changed, 35 insertions(+), 5 deletions(-)
diff --git a/tika-eval/src/main/resources/comparison-reports.xml b/tika-eval/src/main/resources/comparison-reports.xml
index bba7f01..10fd9e3 100644
--- a/tika-eval/src/main/resources/comparison-reports.xml
+++ b/tika-eval/src/main/resources/comparison-reports.xml
@@ -929,8 +929,8 @@
</report>
-->
- <report reportName="Attachment Diffs"
- reportFilename="attachments/attachment_diffs.xlsx"
+ <report reportName="Attachment Diffs no Exceptions"
+ reportFilename="attachments/attachment_diffs_no_exceptions.xlsx"
format="xlsx"
includeSql="true">
@@ -941,8 +941,7 @@
mb.mime_string as MIME_STRING_B,
pa.num_attachments as NUM_ATTACHMENTS_A,
pb.num_attachments as NUM_ATTACHMENTS_B,
- ea.parse_exception_id as EXCEPTION_ID_A,
- eb.parse_exception_id as EXCEPTION_ID_B
+ pb.num_attachments-pa.num_attachments as NUM_ATTACHMENTS_DIFF_IN_B
from profiles_a pa
join profiles_b pb on pa.id= pb.id
join containers c on pa.container_id=c.container_id
@@ -955,7 +954,38 @@
eb.parse_exception_id is null
and pa.num_attachments <> pb.num_attachments
order by ma.mime_string, pb.num_attachments-pa.num_attachments
- limit 1000;
+ limit 10000;
+ </sql>
+ </report>
+
+ <report reportName="Attachment Diffs with exceptions"
+ reportFilename="attachments/attachment_diffs_with_exceptions.xlsx"
+ format="xlsx"
+ includeSql="true">
+
+ <sql>
+ select file_path,
+ c.length as CONTAINER_LENGTH,
+ ma.mime_string as MIME_STRING_A,
+ mb.mime_string as MIME_STRING_B,
+ pa.num_attachments as NUM_ATTACHMENTS_A,
+ pb.num_attachments as NUM_ATTACHMENTS_B,
+ pb.num_attachments-pa.num_attachments as NUM_ATTACHMENTS_DIFF_IN_B,
+ refea.parse_exception_description as PARSE_EXCEPTION_A,
+ refeb.parse_exception_description as PARSE_EXCEPTION_B
+ from profiles_a pa
+ join profiles_b pb on pa.id= pb.id
+ join containers c on pa.container_id=c.container_id
+ join mimes ma on pa.mime_id=ma.mime_id
+ join mimes mb on pb.mime_id=mb.mime_id
+ left join exceptions_a ea on ea.id=pa.id
+ left join exceptions_b eb on eb.id=pb.id
+ left join ref_parse_exception_types refea on ea.parse_exception_id=refea.parse_exception_id
+ left join ref_parse_exception_types refeb on eb.parse_exception_id=refeb.parse_exception_id
+ where pa.is_embedded=false
+ and pa.num_attachments <> pb.num_attachments
+ order by ma.mime_string, pb.num_attachments-pa.num_attachments
+ limit 10000;
</sql>
</report>