You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2019/01/07 19:55:18 UTC

[tika] 01/02: TIKA-2809 -- add reports for tags; and add "b" tag.

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git

commit b80c5174a1be94e28fa4051c85d53f22343c776d
Author: TALLISON <ta...@apache.org>
AuthorDate: Mon Jan 7 14:35:32 2019 -0500

    TIKA-2809 -- add reports for tags; and add "b" tag.
---
 CHANGES.txt                                        |   2 +
 .../org/apache/tika/eval/AbstractProfiler.java     |   1 +
 .../java/org/apache/tika/eval/ExtractProfiler.java |   1 +
 .../main/java/org/apache/tika/eval/db/Cols.java    |   1 +
 .../src/main/resources/comparison-reports.xml      | 533 ++++++++++++++++++++-
 tika-eval/src/main/resources/profile-reports.xml   |  61 +++
 6 files changed, 598 insertions(+), 1 deletion(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index 48107a1..938e0ea 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -7,6 +7,8 @@ Release 2.0.0 - ???
 
 
 Release 1.21 - ????
+   * Add reports for tags in tika-eval (TIKA-2809).
+
    * Extract text from SDT element within textboxes in .docx files (TIKA-2807).
 
    * Try to handle truncated OOXML files more robustly (TIKA-2765).
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java b/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java
index 307c54e..e15dc13 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java
@@ -122,6 +122,7 @@ public abstract class AbstractProfiler extends FileResourceConsumer {
         //simplify this mess
         Map<String, Cols> tmp = new HashMap<>();
         tmp.put("A", Cols.TAGS_A);
+        tmp.put("B", Cols.TAGS_B);
         tmp.put("DIV", Cols.TAGS_DIV);
         tmp.put("I", Cols.TAGS_I);
         tmp.put("IMG", Cols.TAGS_IMG);
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/ExtractProfiler.java b/tika-eval/src/main/java/org/apache/tika/eval/ExtractProfiler.java
index d1b1ac6..1ab67ae 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/ExtractProfiler.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/ExtractProfiler.java
@@ -158,6 +158,7 @@ public class ExtractProfiler extends AbstractProfiler {
     public static TableInfo TAGS_TABLE = new TableInfo("tags",
             new ColInfo(Cols.ID, Types.INTEGER, "PRIMARY KEY"),
             new ColInfo(Cols.TAGS_A, Types.INTEGER),
+            new ColInfo(Cols.TAGS_B, Types.INTEGER),
             new ColInfo(Cols.TAGS_DIV, Types.INTEGER),
             new ColInfo(Cols.TAGS_I, Types.INTEGER),
             new ColInfo(Cols.TAGS_IMG, Types.INTEGER),
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/db/Cols.java b/tika-eval/src/main/java/org/apache/tika/eval/db/Cols.java
index db8c1d0..f3b212c 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/db/Cols.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/db/Cols.java
@@ -94,6 +94,7 @@ public enum Cols {
     
     //structure tags
     TAGS_A,
+    TAGS_B,
     TAGS_DIV,
     TAGS_I,
     TAGS_IMG,
diff --git a/tika-eval/src/main/resources/comparison-reports.xml b/tika-eval/src/main/resources/comparison-reports.xml
index eaf3bb6..7c20ffd 100644
--- a/tika-eval/src/main/resources/comparison-reports.xml
+++ b/tika-eval/src/main/resources/comparison-reports.xml
@@ -218,6 +218,453 @@
             );
         </sql>
 
+        <sql>drop table if exists tags_by_mime</sql>
+        <sql>create table tags_by_mime (
+                mime_id_a integer,
+                mime_id_b integer,
+                tags_a_a integer,
+                tags_b_a integer,
+                tags_div_a integer,
+                tags_i_a integer,
+                tags_img_a integer,
+                tags_li_a integer,
+                tags_ol_a integer,
+                tags_p_a integer,
+                tags_table_a integer,
+                tags_td_a integer,
+                tags_title_a integer,
+                tags_tr_a integer,
+                tags_u_a integer,
+                tags_ul_a integer,
+                tags_a_b integer,
+                tags_b_b integer,
+                tags_div_b integer,
+                tags_i_b integer,
+                tags_img_b integer,
+                tags_li_b integer,
+                tags_ol_b integer,
+                tags_p_b integer,
+                tags_table_b integer,
+                tags_td_b integer,
+                tags_title_b integer,
+                tags_tr_b integer,
+                tags_u_b integer,
+                tags_ul_b integer
+            );
+        </sql>
+        <sql>
+            insert into tags_by_mime (mime_id_a, mime_id_b)
+            select ma.mime_id, mb.mime_id
+            from profiles_a a
+            join profiles_b b on a.id=b.id
+            join mimes ma on ma.mime_id=a.mime_id
+            join mimes mb on mb.mime_id=b.mime_id
+            group by ma.mime_id, mb.mime_id
+        </sql>
+        <sql>
+            update tags_by_mime tbm set tags_a_a=(
+            select sum(ta.tags_a) as cnt from tags_a ta
+            join tags_b tb on ta.id=tb.id
+            join profiles_a pa on pa.id=ta.id
+            join profiles_b pb on pa.id=pb.id
+            where pb.mime_id= tbm.mime_id_b
+            and pa.mime_id=tbm.mime_id_a
+            and ta.tags_parse_exception=false
+            and tb.tags_parse_exception=false
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+        <sql>
+            update tags_by_mime tbm set tags_b_a=(
+            select sum(ta.tags_b) as cnt from tags_a ta
+            join tags_b tb on ta.id=tb.id
+            join profiles_a pa on pa.id=ta.id
+            join profiles_b pb on pa.id=pb.id
+            where pb.mime_id= tbm.mime_id_b
+            and pa.mime_id=tbm.mime_id_a
+            and ta.tags_parse_exception=false
+            and tb.tags_parse_exception=false
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+        <sql>
+            update tags_by_mime tbm set tags_div_a=(
+            select sum(ta.tags_div) as cnt from tags_a ta
+            join tags_b tb on ta.id=tb.id
+            join profiles_a pa on pa.id=ta.id
+            join profiles_b pb on pa.id=pb.id
+            where pb.mime_id= tbm.mime_id_b
+            and pa.mime_id=tbm.mime_id_a
+            and ta.tags_parse_exception=false
+            and tb.tags_parse_exception=false
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+        <sql>
+            update tags_by_mime tbm set tags_i_a=(
+            select sum(ta.tags_i) as cnt from tags_a ta
+            join tags_b tb on ta.id=tb.id
+            join profiles_a pa on pa.id=ta.id
+            join profiles_b pb on pa.id=pb.id
+            where pb.mime_id= tbm.mime_id_b
+            and pa.mime_id=tbm.mime_id_a
+            and ta.tags_parse_exception=false
+            and tb.tags_parse_exception=false
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+        <sql>
+            update tags_by_mime tbm set tags_img_a=(
+            select sum(ta.tags_img) as cnt from tags_a ta
+            join tags_b tb on ta.id=tb.id
+            join profiles_a pa on pa.id=ta.id
+            join profiles_b pb on pa.id=pb.id
+            where pb.mime_id= tbm.mime_id_b
+            and pa.mime_id=tbm.mime_id_a
+            and ta.tags_parse_exception=false
+            and tb.tags_parse_exception=false
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+        <sql>
+            update tags_by_mime tbm set tags_li_a=(
+            select sum(ta.tags_li) as cnt from tags_a ta
+            join tags_b tb on ta.id=tb.id
+            join profiles_a pa on pa.id=ta.id
+            join profiles_b pb on pa.id=pb.id
+            where pb.mime_id= tbm.mime_id_b
+            and pa.mime_id=tbm.mime_id_a
+            and ta.tags_parse_exception=false
+            and tb.tags_parse_exception=false
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+        <sql>
+            update tags_by_mime tbm set tags_ol_a=(
+            select sum(ta.tags_ol) as cnt from tags_a ta
+            join tags_b tb on ta.id=tb.id
+            join profiles_a pa on pa.id=ta.id
+            join profiles_b pb on pa.id=pb.id
+            where pb.mime_id= tbm.mime_id_b
+            and pa.mime_id=tbm.mime_id_a
+            and ta.tags_parse_exception=false
+            and tb.tags_parse_exception=false
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+        <sql>
+            update tags_by_mime tbm set tags_p_a=(
+            select sum(ta.tags_p) as cnt from tags_a ta
+            join tags_b tb on ta.id=tb.id
+            join profiles_a pa on pa.id=ta.id
+            join profiles_b pb on pa.id=pb.id
+            where pb.mime_id= tbm.mime_id_b
+            and pa.mime_id=tbm.mime_id_a
+            and ta.tags_parse_exception=false
+            and tb.tags_parse_exception=false
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+        <sql>
+            update tags_by_mime tbm set tags_table_a=(
+            select sum(ta.tags_table) as cnt from tags_a ta
+            join tags_b tb on ta.id=tb.id
+            join profiles_a pa on pa.id=ta.id
+            join profiles_b pb on pa.id=pb.id
+            where pb.mime_id= tbm.mime_id_b
+            and pa.mime_id=tbm.mime_id_a
+            and ta.tags_parse_exception=false
+            and tb.tags_parse_exception=false
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+        <sql>
+            update tags_by_mime tbm set tags_td_a=(
+            select sum(ta.tags_td) as cnt from tags_a ta
+            join tags_b tb on ta.id=tb.id
+            join profiles_a pa on pa.id=ta.id
+            join profiles_b pb on pa.id=pb.id
+            where pb.mime_id= tbm.mime_id_b
+            and pa.mime_id=tbm.mime_id_a
+            and ta.tags_parse_exception=false
+            and tb.tags_parse_exception=false
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+        <sql>
+            update tags_by_mime tbm set tags_title_a=(
+            select sum(ta.tags_title) as cnt from tags_a ta
+            join tags_b tb on ta.id=tb.id
+            join profiles_a pa on pa.id=ta.id
+            join profiles_b pb on pa.id=pb.id
+            where pb.mime_id= tbm.mime_id_b
+            and pa.mime_id=tbm.mime_id_a
+            and ta.tags_parse_exception=false
+            and tb.tags_parse_exception=false
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+        <sql>
+            update tags_by_mime tbm set tags_tr_a=(
+            select sum(ta.tags_tr) as cnt from tags_a ta
+            join tags_b tb on ta.id=tb.id
+            join profiles_a pa on pa.id=ta.id
+            join profiles_b pb on pa.id=pb.id
+            where pb.mime_id= tbm.mime_id_b
+            and pa.mime_id=tbm.mime_id_a
+            and ta.tags_parse_exception=false
+            and tb.tags_parse_exception=false
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+        <sql>
+            update tags_by_mime tbm set tags_u_a=(
+            select sum(ta.tags_u) as cnt from tags_a ta
+            join tags_b tb on ta.id=tb.id
+            join profiles_a pa on pa.id=ta.id
+            join profiles_b pb on pa.id=pb.id
+            where pb.mime_id= tbm.mime_id_b
+            and pa.mime_id=tbm.mime_id_a
+            and ta.tags_parse_exception=false
+            and tb.tags_parse_exception=false
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+        <sql>
+            update tags_by_mime tbm set tags_ul_a=(
+            select sum(ta.tags_ul) as cnt from tags_a ta
+            join tags_b tb on ta.id=tb.id
+            join profiles_a pa on pa.id=ta.id
+            join profiles_b pb on pa.id=pb.id
+            where pb.mime_id= tbm.mime_id_b
+            and pa.mime_id=tbm.mime_id_a
+            and ta.tags_parse_exception=false
+            and tb.tags_parse_exception=false
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+        <!-- now update tags_b counts -->
+        <sql>
+            update tags_by_mime tbm set tags_a_b=(
+            select sum(tb.tags_a) as cnt from tags_b tb
+            join tags_a ta on ta.id=tb.id
+            join profiles_a pa on pa.id=ta.id
+            join profiles_b pb on pa.id=pb.id
+            where pb.mime_id= tbm.mime_id_b
+            and pa.mime_id=tbm.mime_id_a
+            and ta.tags_parse_exception=false
+            and tb.tags_parse_exception=false
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+        <sql>
+            update tags_by_mime tbm set tags_b_b=(
+            select sum(tb.tags_b) as cnt from tags_b tb
+            join tags_a ta on ta.id=tb.id
+            join profiles_a pa on pa.id=ta.id
+            join profiles_b pb on pa.id=pb.id
+            where pb.mime_id= tbm.mime_id_b
+            and pa.mime_id=tbm.mime_id_a
+            and ta.tags_parse_exception=false
+            and tb.tags_parse_exception=false
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+        <sql>
+            update tags_by_mime tbm set tags_div_b=(
+            select sum(tb.tags_div) as cnt from tags_b tb
+            join tags_a ta on tb.id=ta.id
+            join profiles_a pa on pa.id=ta.id
+            join profiles_b pb on pa.id=pb.id
+            where pb.mime_id= tbm.mime_id_b
+            and pa.mime_id=tbm.mime_id_a
+            and ta.tags_parse_exception=false
+            and tb.tags_parse_exception=false
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+        <sql>
+            update tags_by_mime tbm set tags_i_b=(
+            select sum(tb.tags_i) as cnt from tags_b tb
+            join tags_a ta on tb.id=ta.id
+            join profiles_a pa on pa.id=ta.id
+            join profiles_b pb on pa.id=pb.id
+            where pb.mime_id= tbm.mime_id_b
+            and pa.mime_id=tbm.mime_id_a
+            and ta.tags_parse_exception=false
+            and tb.tags_parse_exception=false
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+        <sql>
+            update tags_by_mime tbm set tags_img_b=(
+            select sum(tb.tags_img) as cnt from tags_b tb
+            join tags_a ta on tb.id=ta.id
+            join profiles_a pa on pa.id=ta.id
+            join profiles_b pb on pa.id=pb.id
+            where pb.mime_id= tbm.mime_id_b
+            and pa.mime_id=tbm.mime_id_a
+            and ta.tags_parse_exception=false
+            and tb.tags_parse_exception=false
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+        <sql>
+            update tags_by_mime tbm set tags_li_b=(
+            select sum(tb.tags_li) as cnt from tags_b tb
+            join tags_a ta on ta.id=tb.id
+            join profiles_a pa on pa.id=ta.id
+            join profiles_b pb on pa.id=pb.id
+            where pb.mime_id= tbm.mime_id_b
+            and pa.mime_id=tbm.mime_id_a
+            and ta.tags_parse_exception=false
+            and tb.tags_parse_exception=false
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+        <sql>
+            update tags_by_mime tbm set tags_ol_b=(
+            select sum(tb.tags_ol) as cnt from tags_b tb
+            join tags_a ta on tb.id=ta.id
+            join profiles_a pa on pa.id=ta.id
+            join profiles_b pb on pa.id=pb.id
+            where pb.mime_id= tbm.mime_id_b
+            and pa.mime_id=tbm.mime_id_a
+            and ta.tags_parse_exception=false
+            and tb.tags_parse_exception=false
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+        <sql>
+            update tags_by_mime tbm set tags_p_b=(
+            select sum(tb.tags_p) as cnt from tags_b tb
+            join tags_a ta on tb.id=ta.id
+            join profiles_a pa on pa.id=ta.id
+            join profiles_b pb on pa.id=pb.id
+            where pb.mime_id= tbm.mime_id_b
+            and pa.mime_id=tbm.mime_id_a
+            and ta.tags_parse_exception=false
+            and tb.tags_parse_exception=false
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+        <sql>
+            update tags_by_mime tbm set tags_table_b=(
+            select sum(tb.tags_table) as cnt from tags_b tb
+            join tags_a ta on tb.id=ta.id
+            join profiles_a pa on pa.id=ta.id
+            join profiles_b pb on pa.id=pb.id
+            where pb.mime_id= tbm.mime_id_b
+            and pa.mime_id=tbm.mime_id_a
+            and ta.tags_parse_exception=false
+            and tb.tags_parse_exception=false
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+        <sql>
+            update tags_by_mime tbm set tags_td_b=(
+            select sum(tb.tags_td) as cnt from tags_b tb
+            join tags_a ta on tb.id=ta.id
+            join profiles_a pa on pa.id=ta.id
+            join profiles_b pb on pa.id=pb.id
+            where pb.mime_id= tbm.mime_id_b
+            and pa.mime_id=tbm.mime_id_a
+            and ta.tags_parse_exception=false
+            and tb.tags_parse_exception=false
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+        <sql>
+            update tags_by_mime tbm set tags_title_b=(
+            select sum(tb.tags_title) as cnt from tags_b tb
+            join tags_a ta on tb.id=ta.id
+            join profiles_a pa on pa.id=ta.id
+            join profiles_b pb on pa.id=pb.id
+            where pb.mime_id= tbm.mime_id_b
+            and pa.mime_id=tbm.mime_id_a
+            and ta.tags_parse_exception=false
+            and tb.tags_parse_exception=false
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+        <sql>
+            update tags_by_mime tbm set tags_tr_b=(
+            select sum(tb.tags_tr) as cnt from tags_b tb
+            join tags_a ta on tb.id=ta.id
+            join profiles_a pa on pa.id=ta.id
+            join profiles_b pb on pa.id=pb.id
+            where pb.mime_id= tbm.mime_id_b
+            and pa.mime_id=tbm.mime_id_a
+            and ta.tags_parse_exception=false
+            and tb.tags_parse_exception=false
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+        <sql>
+            update tags_by_mime tbm set tags_u_b=(
+            select sum(tb.tags_u) as cnt from tags_b tb
+            join tags_a ta on tb.id=ta.id
+            join profiles_a pa on pa.id=ta.id
+            join profiles_b pb on pa.id=pb.id
+            where pb.mime_id= tbm.mime_id_b
+            and pa.mime_id=tbm.mime_id_a
+            and ta.tags_parse_exception=false
+            and tb.tags_parse_exception=false
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+        <sql>
+            update tags_by_mime tbm set tags_ul_b=(
+            select sum(tb.tags_ul) as cnt from tags_b tb
+            join tags_a ta on ta.id=tb.id
+            join profiles_a pa on pa.id=ta.id
+            join profiles_b pb on pa.id=pb.id
+            where pb.mime_id= tbm.mime_id_b
+            and pa.mime_id=tbm.mime_id_a
+            and ta.tags_parse_exception=false
+            and tb.tags_parse_exception=false
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+        <sql>drop table if exists tag_exceptions_by_mime</sql>
+        <sql>create table tag_exceptions_by_mime (
+            mime_id_a integer,
+            mime_id_b integer,
+            tag_exceptions_a integer,
+            tag_exceptions_b integer)
+        </sql>
+        <sql>
+            insert into tag_exceptions_by_mime (mime_id_a, mime_id_b,
+                tag_exceptions_a, tag_exceptions_b)
+            select ma.mime_id, mb.mime_id,0,0
+            from profiles_a a
+            join profiles_b b on a.id=b.id
+            join mimes ma on ma.mime_id=a.mime_id
+            join mimes mb on mb.mime_id=b.mime_id
+            group by ma.mime_id, mb.mime_id
+        </sql>
+        <sql>
+            update tag_exceptions_by_mime tebm set tag_exceptions_a=coalesce((
+            select count(1) as cnt from tags_a ta
+            join profiles_a pa on pa.id=ta.id
+            join profiles_b pb on pa.id=pb.id
+            where pb.mime_id= tebm.mime_id_b
+            and pa.mime_id=tebm.mime_id_a
+            and ta.tags_parse_exception=true
+            group by mime_id_a, mime_id_b
+            ), 0);
+        </sql> <!-- Counts tag parse exceptions in A per mime pair. The grouped subquery returns no row when nothing matches, so coalesce keeps the count at 0 instead of NULL. -->
+        <sql>
+            update tag_exceptions_by_mime tebm set tag_exceptions_b=coalesce((
+            select count(1) as cnt from tags_b tb
+            join profiles_a pa on pa.id=tb.id
+            join profiles_b pb on pa.id=pb.id
+            where pb.mime_id= tebm.mime_id_b
+            and pa.mime_id=tebm.mime_id_a
+            and tb.tags_parse_exception=true
+            group by mime_id_a, mime_id_b
+            ), 0);
+        </sql> <!-- Counts tag parse exceptions in B per mime pair. The grouped subquery returns no row when nothing matches, so coalesce keeps the count at 0 instead of NULL. -->
     </before>
 
     <!-- MIMES -->
@@ -1018,8 +1465,92 @@
             pb.num_metadata_values-pa.num_metadata_values
         </sql>
     </report>
+    <report reportName="Tag Count Diffs By Mime"
+            reportFilename="tags/tag_count_diffs_by_mime.xlsx"
+            format="xlsx"
+            includeSql="true">
+        <sql>
+            select ma.mime_string as mime_string_a,
+            mb.mime_string as mime_string_b,
+            tags_a_a,
+            tags_a_b,
+            tags_b_a,
+            tags_b_b,
+            tags_div_a,
+            tags_div_b,
+            tags_i_a, tags_i_b,
+            tags_img_a, tags_img_b,
+            tags_li_a,
+            tags_li_b,
+            tags_ol_a,
+            tags_ol_b,
+            tags_p_a,
+            tags_p_b,
+            tags_table_a,
+            tags_table_b,
+            tags_td_a,
+            tags_td_b,
+            tags_title_a,
+            tags_title_b,
+            tags_tr_a,
+            tags_tr_b,
+            tags_u_a,
+            tags_u_b,
+            tags_ul_a,
+            tags_ul_b
+            from
+            tags_by_mime tbm
+            join mimes ma on tbm.mime_id_a=ma.mime_id
+            join mimes mb on tbm.mime_id_b=mb.mime_id
+        </sql>
+
-    <after>
+    </report> <!-- The report above lists the A and B totals for every tag column of tags_by_mime (a, b, div, i, img, li, ol, p, table, td, title, tr, u, ul), one row per mime pair. -->
+    <report reportName="Tag Exceptions By Mime"
+            reportFilename="tags/tag_exceptions_by_mime.xlsx"
+            format="xlsx"
+            includeSql="true">
+        <sql>
+            select ma.mime_string as mime_string_a,
+            mb.mime_string as mime_string_b,
+            tag_exceptions_a,
+            tag_exceptions_b,
+            (tag_exceptions_b-tag_exceptions_a) as diff_tag_exceptions_in_b
+            from tag_exceptions_by_mime tebm
+            join mimes ma on tebm.mime_id_a=ma.mime_id
+            join mimes mb on tebm.mime_id_b=mb.mime_id
+            order by diff_tag_exceptions_in_b desc
+        </sql>
+    </report>
+    <report reportName="Tag Exceptions Details A"
+                         reportFilename="tags/tag_exceptions_details_a.xlsx"
+                         format="xlsx"
+                         includeSql="true">
+        <sql>
+            select c.file_path,pa.file_name,mime_string,is_embedded from
+            tags_a ta
+            join profiles_a pa on ta.id=pa.id
+            join containers c on pa.container_id=c.container_id
+            join mimes m on pa.mime_id=m.mime_id
+            where ta.tags_parse_exception=true
+            order by m.mime_string
+            limit 20000
+        </sql>
+    </report>
+    <report reportName="Tag Exceptions Details B"
+            reportFilename="tags/tag_exceptions_details_b.xlsx"
+            format="xlsx"
+            includeSql="true">
+        <sql>
+            select c.file_path,pb.file_name,mime_string,is_embedded from
+            tags_b tb
+            join profiles_b pb on tb.id=pb.id
+            join containers c on pb.container_id=c.container_id
+            join mimes m on pb.mime_id=m.mime_id
+            where tb.tags_parse_exception=true
+            order by m.mime_string
+            limit 20000
+        </sql>
+    </report>    <after>
         <sql>drop table if exists md5_multiples_tmp_a</sql>
         <sql>drop table if exists md5_multiples_tmp_b</sql>
     </after>
diff --git a/tika-eval/src/main/resources/profile-reports.xml b/tika-eval/src/main/resources/profile-reports.xml
index d31606f..028a7f4 100644
--- a/tika-eval/src/main/resources/profile-reports.xml
+++ b/tika-eval/src/main/resources/profile-reports.xml
@@ -260,6 +260,67 @@
             CONTAINER_LENGTH asc
         </sql>
     </report>
+    <report reportName="TagExceptionsByMime"
+        reportFilename="tags/tag_exceptions_by_mime.xlsx"
+        format="xlsx"
+        includeSql="true">
+
+        <sql>
+            select mime_string, count(1) as CNT
+            from tags t
+            join profiles p on p.id=t.id
+            join mimes m on p.mime_id=m.mime_id
+            where tags_parse_exception=TRUE
+            group by mime_string
+            order by CNT desc
+        </sql>
+    </report>
+    <report reportName="Tag Exceptions Details"
+            reportFilename="tags/tag_exceptions_details.xlsx"
+            format="xlsx"
+            includeSql="true">
+
+        <sql>
+            select c.file_path,p.file_name,mime_string,is_embedded from
+            tags t
+            join profiles p on t.id=p.id
+            join containers c on p.container_id=c.container_id
+            join mimes m on p.mime_id=m.mime_id
+            where t.tags_parse_exception=true
+            order by m.mime_string
+            limit 20000
+        </sql>
+    </report>
+    <report reportName="Tags by Mime"
+            reportFilename="tags/tags_by_mime.xlsx"
+            format="xlsx"
+            includeSql="true">
+
+        <sql>
+            select mime_string,
+            sum(tags_a) as tags_a,
+            sum(tags_b) as tags_b,
+            sum(tags_div) as tags_div,
+            sum(tags_i) as tags_i,
+            sum(tags_img) as tags_img,
+            sum(tags_li) as tags_li,
+            sum(tags_ol) as tags_ol,
+            sum(tags_p) as tags_p,
+            sum(tags_table) as tags_table,
+            sum(tags_td) as tags_td,
+            sum(tags_title) as tags_title,
+            sum(tags_tr) as tags_tr,
+            sum(tags_u) as tags_u,
+            sum(tags_ul) as tags_ul
+
+            from tags t
+            join profiles p on t.id=p.id
+            join mimes m on p.mime_id=m.mime_id
+            where tags_parse_exception=false
+            group by mime_string
+        </sql>
+
+    </report> <!-- The report above sums each tag count per mime type over rows without a tag parse exception. It groups by the selected mime_string column, as TagExceptionsByMime does; this assumes mime_string is unique per mime_id (TODO confirm). -->
     <after>
 
         <!--<sql>drop index on x</sql>