You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2019/01/07 19:55:17 UTC

[tika] branch master updated (f3fac43 -> 517adc9)

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git.


    from f3fac43  TIKA-2808 -- exclude h2 from ossindex-maven-plugin
     new b80c517  TIKA-2809 -- add reports for tags; and add "b" tag.
     new 517adc9  TIKA-2810 -- handle bad tags more robustly

The 2 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 CHANGES.txt                                        |   5 +
 .../org/apache/tika/eval/AbstractProfiler.java     |  12 +-
 .../java/org/apache/tika/eval/ExtractProfiler.java |   1 +
 .../main/java/org/apache/tika/eval/db/Cols.java    |   1 +
 .../org/apache/tika/eval/util/ContentTags.java     |   6 +-
 .../src/main/resources/comparison-reports.xml      | 533 ++++++++++++++++++++-
 tika-eval/src/main/resources/profile-reports.xml   |  61 +++
 .../org/apache/tika/eval/SimpleComparerTest.java   |  10 +
 8 files changed, 625 insertions(+), 4 deletions(-)


[tika] 01/02: TIKA-2809 -- add reports for tags; and add "b" tag.

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git

commit b80c5174a1be94e28fa4051c85d53f22343c776d
Author: TALLISON <ta...@apache.org>
AuthorDate: Mon Jan 7 14:35:32 2019 -0500

    TIKA-2809 -- add reports for tags; and add "b" tag.
---
 CHANGES.txt                                        |   2 +
 .../org/apache/tika/eval/AbstractProfiler.java     |   1 +
 .../java/org/apache/tika/eval/ExtractProfiler.java |   1 +
 .../main/java/org/apache/tika/eval/db/Cols.java    |   1 +
 .../src/main/resources/comparison-reports.xml      | 533 ++++++++++++++++++++-
 tika-eval/src/main/resources/profile-reports.xml   |  61 +++
 6 files changed, 598 insertions(+), 1 deletion(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index 48107a1..938e0ea 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -7,6 +7,8 @@ Release 2.0.0 - ???
 
 
 Release 1.21 - ????
+   * Add reports for tags in tika-eval (TIKA-2809).
+
    * Extract text from SDT element within textboxes in .docx files (TIKA-2807).
 
    * Try to handle truncated OOXML files more robustly (TIKA-2765).
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java b/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java
index 307c54e..e15dc13 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java
@@ -122,6 +122,7 @@ public abstract class AbstractProfiler extends FileResourceConsumer {
         //simplify this mess
         Map<String, Cols> tmp = new HashMap<>();
         tmp.put("A", Cols.TAGS_A);
+        tmp.put("B", Cols.TAGS_B);
         tmp.put("DIV", Cols.TAGS_DIV);
         tmp.put("I", Cols.TAGS_I);
         tmp.put("IMG", Cols.TAGS_IMG);
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/ExtractProfiler.java b/tika-eval/src/main/java/org/apache/tika/eval/ExtractProfiler.java
index d1b1ac6..1ab67ae 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/ExtractProfiler.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/ExtractProfiler.java
@@ -158,6 +158,7 @@ public class ExtractProfiler extends AbstractProfiler {
     public static TableInfo TAGS_TABLE = new TableInfo("tags",
             new ColInfo(Cols.ID, Types.INTEGER, "PRIMARY KEY"),
             new ColInfo(Cols.TAGS_A, Types.INTEGER),
+            new ColInfo(Cols.TAGS_B, Types.INTEGER),
             new ColInfo(Cols.TAGS_DIV, Types.INTEGER),
             new ColInfo(Cols.TAGS_I, Types.INTEGER),
             new ColInfo(Cols.TAGS_IMG, Types.INTEGER),
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/db/Cols.java b/tika-eval/src/main/java/org/apache/tika/eval/db/Cols.java
index db8c1d0..f3b212c 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/db/Cols.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/db/Cols.java
@@ -94,6 +94,7 @@ public enum Cols {
     
     //structure tags
     TAGS_A,
+    TAGS_B,
     TAGS_DIV,
     TAGS_I,
     TAGS_IMG,
diff --git a/tika-eval/src/main/resources/comparison-reports.xml b/tika-eval/src/main/resources/comparison-reports.xml
index eaf3bb6..7c20ffd 100644
--- a/tika-eval/src/main/resources/comparison-reports.xml
+++ b/tika-eval/src/main/resources/comparison-reports.xml
@@ -218,6 +218,453 @@
             );
         </sql>
 
+        <sql>drop table if exists tags_by_mime</sql>
+        <sql>create table tags_by_mime (
+                mime_id_a integer,
+                mime_id_b integer,
+                tags_a_a integer,
+                tags_b_a integer,
+                tags_div_a integer,
+                tags_i_a integer,
+                tags_img_a integer,
+                tags_li_a integer,
+                tags_ol_a integer,
+                tags_p_a integer,
+                tags_table_a integer,
+                tags_td_a integer,
+                tags_title_a integer,
+                tags_tr_a integer,
+                tags_u_a integer,
+                tags_ul_a integer,
+                tags_a_b integer,
+                tags_b_b integer,
+                tags_div_b integer,
+                tags_i_b integer,
+                tags_img_b integer,
+                tags_li_b integer,
+                tags_ol_b integer,
+                tags_p_b integer,
+                tags_table_b integer,
+                tags_td_b integer,
+                tags_title_b integer,
+                tags_tr_b integer,
+                tags_u_b integer,
+                tags_ul_b integer
+            );
+        </sql>
+        <sql>
+            insert into tags_by_mime (mime_id_a, mime_id_b)
+            select ma.mime_id, mb.mime_id
+            from profiles_a a
+            join profiles_b b on a.id=b.id
+            join mimes ma on ma.mime_id=a.mime_id
+            join mimes mb on mb.mime_id=b.mime_id
+            group by ma.mime_id, mb.mime_id
+        </sql>
+        <sql>
+            update tags_by_mime tbm set tags_a_a=(
+            select sum(ta.tags_a) as cnt from tags_a ta
+            join tags_b tb on ta.id=tb.id
+            join profiles_a pa on pa.id=ta.id
+            join profiles_b pb on pa.id=pb.id
+            where pb.mime_id= tbm.mime_id_b
+            and pa.mime_id=tbm.mime_id_a
+            and ta.tags_parse_exception=false
+            and tb.tags_parse_exception=false
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+        <sql>
+            update tags_by_mime tbm set tags_b_a=(
+            select sum(ta.tags_b) as cnt from tags_a ta
+            join tags_b tb on ta.id=tb.id
+            join profiles_a pa on pa.id=ta.id
+            join profiles_b pb on pa.id=pb.id
+            where pb.mime_id= tbm.mime_id_b
+            and pa.mime_id=tbm.mime_id_a
+            and ta.tags_parse_exception=false
+            and tb.tags_parse_exception=false
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+        <sql>
+            update tags_by_mime tbm set tags_div_a=(
+            select sum(ta.tags_div) as cnt from tags_a ta
+            join tags_b tb on ta.id=tb.id
+            join profiles_a pa on pa.id=ta.id
+            join profiles_b pb on pa.id=pb.id
+            where pb.mime_id= tbm.mime_id_b
+            and pa.mime_id=tbm.mime_id_a
+            and ta.tags_parse_exception=false
+            and tb.tags_parse_exception=false
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+        <sql>
+            update tags_by_mime tbm set tags_i_a=(
+            select sum(ta.tags_i) as cnt from tags_a ta
+            join tags_b tb on ta.id=tb.id
+            join profiles_a pa on pa.id=ta.id
+            join profiles_b pb on pa.id=pb.id
+            where pb.mime_id= tbm.mime_id_b
+            and pa.mime_id=tbm.mime_id_a
+            and ta.tags_parse_exception=false
+            and tb.tags_parse_exception=false
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+        <sql>
+            update tags_by_mime tbm set tags_img_a=(
+            select sum(ta.tags_img) as cnt from tags_a ta
+            join tags_b tb on ta.id=tb.id
+            join profiles_a pa on pa.id=ta.id
+            join profiles_b pb on pa.id=pb.id
+            where pb.mime_id= tbm.mime_id_b
+            and pa.mime_id=tbm.mime_id_a
+            and ta.tags_parse_exception=false
+            and tb.tags_parse_exception=false
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+        <sql>
+            update tags_by_mime tbm set tags_li_a=(
+            select sum(ta.tags_li) as cnt from tags_a ta
+            join tags_b tb on ta.id=tb.id
+            join profiles_a pa on pa.id=ta.id
+            join profiles_b pb on pa.id=pb.id
+            where pb.mime_id= tbm.mime_id_b
+            and pa.mime_id=tbm.mime_id_a
+            and ta.tags_parse_exception=false
+            and tb.tags_parse_exception=false
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+        <sql>
+            update tags_by_mime tbm set tags_ol_a=(
+            select sum(ta.tags_ol) as cnt from tags_a ta
+            join tags_b tb on ta.id=tb.id
+            join profiles_a pa on pa.id=ta.id
+            join profiles_b pb on pa.id=pb.id
+            where pb.mime_id= tbm.mime_id_b
+            and pa.mime_id=tbm.mime_id_a
+            and ta.tags_parse_exception=false
+            and tb.tags_parse_exception=false
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+        <sql>
+            update tags_by_mime tbm set tags_p_a=(
+            select sum(ta.tags_p) as cnt from tags_a ta
+            join tags_b tb on ta.id=tb.id
+            join profiles_a pa on pa.id=ta.id
+            join profiles_b pb on pa.id=pb.id
+            where pb.mime_id= tbm.mime_id_b
+            and pa.mime_id=tbm.mime_id_a
+            and ta.tags_parse_exception=false
+            and tb.tags_parse_exception=false
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+        <sql>
+            update tags_by_mime tbm set tags_table_a=(
+            select sum(ta.tags_table) as cnt from tags_a ta
+            join tags_b tb on ta.id=tb.id
+            join profiles_a pa on pa.id=ta.id
+            join profiles_b pb on pa.id=pb.id
+            where pb.mime_id= tbm.mime_id_b
+            and pa.mime_id=tbm.mime_id_a
+            and ta.tags_parse_exception=false
+            and tb.tags_parse_exception=false
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+        <sql>
+            update tags_by_mime tbm set tags_td_a=(
+            select sum(ta.tags_td) as cnt from tags_a ta
+            join tags_b tb on ta.id=tb.id
+            join profiles_a pa on pa.id=ta.id
+            join profiles_b pb on pa.id=pb.id
+            where pb.mime_id= tbm.mime_id_b
+            and pa.mime_id=tbm.mime_id_a
+            and ta.tags_parse_exception=false
+            and tb.tags_parse_exception=false
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+        <sql>
+            update tags_by_mime tbm set tags_title_a=(
+            select sum(ta.tags_title) as cnt from tags_a ta
+            join tags_b tb on ta.id=tb.id
+            join profiles_a pa on pa.id=ta.id
+            join profiles_b pb on pa.id=pb.id
+            where pb.mime_id= tbm.mime_id_b
+            and pa.mime_id=tbm.mime_id_a
+            and ta.tags_parse_exception=false
+            and tb.tags_parse_exception=false
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+        <sql>
+            update tags_by_mime tbm set tags_tr_a=(
+            select sum(ta.tags_tr) as cnt from tags_a ta
+            join tags_b tb on ta.id=tb.id
+            join profiles_a pa on pa.id=ta.id
+            join profiles_b pb on pa.id=pb.id
+            where pb.mime_id= tbm.mime_id_b
+            and pa.mime_id=tbm.mime_id_a
+            and ta.tags_parse_exception=false
+            and tb.tags_parse_exception=false
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+        <sql>
+            update tags_by_mime tbm set tags_u_a=(
+            select sum(ta.tags_u) as cnt from tags_a ta
+            join tags_b tb on ta.id=tb.id
+            join profiles_a pa on pa.id=ta.id
+            join profiles_b pb on pa.id=pb.id
+            where pb.mime_id= tbm.mime_id_b
+            and pa.mime_id=tbm.mime_id_a
+            and ta.tags_parse_exception=false
+            and tb.tags_parse_exception=false
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+        <sql>
+            update tags_by_mime tbm set tags_ul_a=(
+            select sum(ta.tags_ul) as cnt from tags_a ta
+            join tags_b tb on ta.id=tb.id
+            join profiles_a pa on pa.id=ta.id
+            join profiles_b pb on pa.id=pb.id
+            where pb.mime_id= tbm.mime_id_b
+            and pa.mime_id=tbm.mime_id_a
+            and ta.tags_parse_exception=false
+            and tb.tags_parse_exception=false
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+        <!-- now update tags_b counts -->
+        <sql>
+            update tags_by_mime tbm set tags_a_b=(
+            select sum(tb.tags_a) as cnt from tags_b tb
+            join tags_a ta on ta.id=tb.id
+            join profiles_a pa on pa.id=ta.id
+            join profiles_b pb on pa.id=pb.id
+            where pb.mime_id= tbm.mime_id_b
+            and pa.mime_id=tbm.mime_id_a
+            and ta.tags_parse_exception=false
+            and tb.tags_parse_exception=false
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+        <sql>
+            update tags_by_mime tbm set tags_b_b=(
+            select sum(tb.tags_b) as cnt from tags_b tb
+            join tags_a ta on ta.id=tb.id
+            join profiles_a pa on pa.id=ta.id
+            join profiles_b pb on pa.id=pb.id
+            where pb.mime_id= tbm.mime_id_b
+            and pa.mime_id=tbm.mime_id_a
+            and ta.tags_parse_exception=false
+            and tb.tags_parse_exception=false
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+        <sql>
+            update tags_by_mime tbm set tags_div_b=(
+            select sum(tb.tags_div) as cnt from tags_b tb
+            join tags_a ta on tb.id=ta.id
+            join profiles_a pa on pa.id=ta.id
+            join profiles_b pb on pa.id=pb.id
+            where pb.mime_id= tbm.mime_id_b
+            and pa.mime_id=tbm.mime_id_a
+            and ta.tags_parse_exception=false
+            and tb.tags_parse_exception=false
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+        <sql>
+            update tags_by_mime tbm set tags_i_b=(
+            select sum(tb.tags_i) as cnt from tags_b tb
+            join tags_a ta on tb.id=ta.id
+            join profiles_a pa on pa.id=ta.id
+            join profiles_b pb on pa.id=pb.id
+            where pb.mime_id= tbm.mime_id_b
+            and pa.mime_id=tbm.mime_id_a
+            and ta.tags_parse_exception=false
+            and tb.tags_parse_exception=false
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+        <sql>
+            update tags_by_mime tbm set tags_img_b=(
+            select sum(tb.tags_img) as cnt from tags_b tb
+            join tags_a ta on tb.id=ta.id
+            join profiles_a pa on pa.id=ta.id
+            join profiles_b pb on pa.id=pb.id
+            where pb.mime_id= tbm.mime_id_b
+            and pa.mime_id=tbm.mime_id_a
+            and ta.tags_parse_exception=false
+            and tb.tags_parse_exception=false
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+        <sql>
+            update tags_by_mime tbm set tags_li_b=(
+            select sum(tb.tags_li) as cnt from tags_b tb
+            join tags_a ta on ta.id=tb.id
+            join profiles_a pa on pa.id=ta.id
+            join profiles_b pb on pa.id=pb.id
+            where pb.mime_id= tbm.mime_id_b
+            and pa.mime_id=tbm.mime_id_a
+            and ta.tags_parse_exception=false
+            and tb.tags_parse_exception=false
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+        <sql>
+            update tags_by_mime tbm set tags_ol_b=(
+            select sum(tb.tags_ol) as cnt from tags_b tb
+            join tags_a ta on tb.id=ta.id
+            join profiles_a pa on pa.id=ta.id
+            join profiles_b pb on pa.id=pb.id
+            where pb.mime_id= tbm.mime_id_b
+            and pa.mime_id=tbm.mime_id_a
+            and ta.tags_parse_exception=false
+            and tb.tags_parse_exception=false
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+        <sql>
+            update tags_by_mime tbm set tags_p_b=(
+            select sum(tb.tags_p) as cnt from tags_b tb
+            join tags_a ta on tb.id=ta.id
+            join profiles_a pa on pa.id=ta.id
+            join profiles_b pb on pa.id=pb.id
+            where pb.mime_id= tbm.mime_id_b
+            and pa.mime_id=tbm.mime_id_a
+            and ta.tags_parse_exception=false
+            and tb.tags_parse_exception=false
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+        <sql>
+            update tags_by_mime tbm set tags_table_b=(
+            select sum(tb.tags_table) as cnt from tags_b tb
+            join tags_a ta on tb.id=ta.id
+            join profiles_a pa on pa.id=ta.id
+            join profiles_b pb on pa.id=pb.id
+            where pb.mime_id= tbm.mime_id_b
+            and pa.mime_id=tbm.mime_id_a
+            and ta.tags_parse_exception=false
+            and tb.tags_parse_exception=false
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+        <sql>
+            update tags_by_mime tbm set tags_td_b=(
+            select sum(tb.tags_td) as cnt from tags_b tb
+            join tags_a ta on tb.id=ta.id
+            join profiles_a pa on pa.id=ta.id
+            join profiles_b pb on pa.id=pb.id
+            where pb.mime_id= tbm.mime_id_b
+            and pa.mime_id=tbm.mime_id_a
+            and ta.tags_parse_exception=false
+            and tb.tags_parse_exception=false
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+        <sql>
+            update tags_by_mime tbm set tags_title_b=(
+            select sum(tb.tags_title) as cnt from tags_b tb
+            join tags_a ta on tb.id=ta.id
+            join profiles_a pa on pa.id=ta.id
+            join profiles_b pb on pa.id=pb.id
+            where pb.mime_id= tbm.mime_id_b
+            and pa.mime_id=tbm.mime_id_a
+            and ta.tags_parse_exception=false
+            and tb.tags_parse_exception=false
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+        <sql>
+            update tags_by_mime tbm set tags_tr_b=(
+            select sum(tb.tags_tr) as cnt from tags_b tb
+            join tags_a ta on tb.id=ta.id
+            join profiles_a pa on pa.id=ta.id
+            join profiles_b pb on pa.id=pb.id
+            where pb.mime_id= tbm.mime_id_b
+            and pa.mime_id=tbm.mime_id_a
+            and ta.tags_parse_exception=false
+            and tb.tags_parse_exception=false
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+        <sql>
+            update tags_by_mime tbm set tags_u_b=(
+            select sum(tb.tags_u) as cnt from tags_b tb
+            join tags_a ta on tb.id=ta.id
+            join profiles_a pa on pa.id=ta.id
+            join profiles_b pb on pa.id=pb.id
+            where pb.mime_id= tbm.mime_id_b
+            and pa.mime_id=tbm.mime_id_a
+            and ta.tags_parse_exception=false
+            and tb.tags_parse_exception=false
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+        <sql>
+            update tags_by_mime tbm set tags_ul_b=(
+            select sum(tb.tags_ul) as cnt from tags_b tb
+            join tags_a ta on ta.id=tb.id
+            join profiles_a pa on pa.id=ta.id
+            join profiles_b pb on pa.id=pb.id
+            where pb.mime_id= tbm.mime_id_b
+            and pa.mime_id=tbm.mime_id_a
+            and ta.tags_parse_exception=false
+            and tb.tags_parse_exception=false
+            group by mime_id_a, mime_id_b
+            );
+        </sql>
+        <sql>drop table if exists tag_exceptions_by_mime</sql>
+        <sql>create table tag_exceptions_by_mime (
+            mime_id_a integer,
+            mime_id_b integer,
+            tag_exceptions_a integer,
+            tag_exceptions_b integer)
+        </sql>
+        <sql>
+            insert into tag_exceptions_by_mime (mime_id_a, mime_id_b,
+                tag_exceptions_a, tag_exceptions_b)
+            select ma.mime_id, mb.mime_id,0,0
+            from profiles_a a
+            join profiles_b b on a.id=b.id
+            join mimes ma on ma.mime_id=a.mime_id
+            join mimes mb on mb.mime_id=b.mime_id
+            group by ma.mime_id, mb.mime_id
+        </sql>
+        <!-- coalesce keeps the seeded 0: the grouped subquery yields no row (NULL) when a mime pair has no tag exceptions -->
+        <sql>
+            update tag_exceptions_by_mime tebm set tag_exceptions_a=coalesce((
+            select count(1) as cnt from tags_a ta
+            join profiles_a pa on pa.id=ta.id
+            join profiles_b pb on pa.id=pb.id
+            where pb.mime_id= tebm.mime_id_b and pa.mime_id=tebm.mime_id_a
+            and ta.tags_parse_exception=true
+            group by mime_id_a, mime_id_b
+            ), 0);
+        </sql>
+        <sql>
+            update tag_exceptions_by_mime tebm set tag_exceptions_b=coalesce((
+            select count(1) as cnt from tags_b tb
+            join profiles_a pa on pa.id=tb.id
+            join profiles_b pb on pa.id=pb.id
+            where pb.mime_id= tebm.mime_id_b
+            and pa.mime_id=tebm.mime_id_a
+            and tb.tags_parse_exception=true
+            group by mime_id_a, mime_id_b
+            ), 0);
+        </sql>
     </before>
 
     <!-- MIMES -->
@@ -1018,8 +1465,92 @@
             pb.num_metadata_values-pa.num_metadata_values
         </sql>
     </report>
+    <report reportName="Tag Count Diffs By Mime"
+            reportFilename="tags/tag_count_diffs_by_mime.xlsx"
+            format="xlsx" includeSql="true">
+        <!-- one row per (mime A, mime B) pair; lists the A and B totals for every tag column of tags_by_mime, including img -->
+        <sql>
+            select ma.mime_string as mime_string_a, mb.mime_string as mime_string_b,
+            tags_a_a,
+            tags_a_b,
+            tags_b_a,
+            tags_b_b,
+            tags_div_a,
+            tags_div_b,
+            tags_i_a,
+            tags_i_b,
+            tags_img_a,
+            tags_img_b,
+            tags_li_a,
+            tags_li_b,
+            tags_ol_a,
+            tags_ol_b,
+            tags_p_a,
+            tags_p_b,
+            tags_table_a,
+            tags_table_b,
+            tags_td_a,
+            tags_td_b,
+            tags_title_a,
+            tags_title_b,
+            tags_tr_a,
+            tags_tr_b,
+            tags_u_a,
+            tags_u_b,
+            tags_ul_a,
+            tags_ul_b
+            from tags_by_mime tbm
+            join mimes ma on tbm.mime_id_a=ma.mime_id
+            join mimes mb on tbm.mime_id_b=mb.mime_id
+        </sql>
 
-    <after>
+    </report>
+    <report reportName="Tag Exceptions By Mime"
+            reportFilename="tags/tag_exceptions_by_mime.xlsx"
+            format="xlsx"
+            includeSql="true">
+        <sql>
+            select ma.mime_string as mime_string_a,
+            mb.mime_string as mime_string_b,
+            tag_exceptions_a,
+            tag_exceptions_b,
+            (tag_exceptions_b-tag_exceptions_a) as diff_tag_exceptions_in_b
+            from tag_exceptions_by_mime tebm
+            join mimes ma on tebm.mime_id_a=ma.mime_id
+            join mimes mb on tebm.mime_id_b=mb.mime_id
+            order by diff_tag_exceptions_in_b desc
+        </sql>
+    </report>
+    <report reportName="Tag Exceptions Details A"
+                         reportFilename="tags/tag_exceptions_details_a.xlsx"
+                         format="xlsx"
+                         includeSql="true">
+        <sql>
+            select c.file_path,pa.file_name,mime_string,is_embedded from
+            tags_a ta
+            join profiles_a pa on ta.id=pa.id
+            join containers c on pa.container_id=c.container_id
+            join mimes m on pa.mime_id=m.mime_id
+            where ta.tags_parse_exception=true
+            order by m.mime_string
+            limit 20000
+        </sql>
+    </report>
+    <report reportName="Tag Exceptions Details B"
+            reportFilename="tags/tag_exceptions_details_b.xlsx"
+            format="xlsx"
+            includeSql="true">
+        <sql>
+            select c.file_path,pb.file_name,mime_string,is_embedded from
+            tags_b tb
+            join profiles_b pb on tb.id=pb.id
+            join containers c on pb.container_id=c.container_id
+            join mimes m on pb.mime_id=m.mime_id
+            where tb.tags_parse_exception=true
+            order by m.mime_string
+            limit 20000
+        </sql>
+    </report>    <after>
         <sql>drop table if exists md5_multiples_tmp_a</sql>
         <sql>drop table if exists md5_multiples_tmp_b</sql>
     </after>
diff --git a/tika-eval/src/main/resources/profile-reports.xml b/tika-eval/src/main/resources/profile-reports.xml
index d31606f..028a7f4 100644
--- a/tika-eval/src/main/resources/profile-reports.xml
+++ b/tika-eval/src/main/resources/profile-reports.xml
@@ -260,6 +260,67 @@
             CONTAINER_LENGTH asc
         </sql>
     </report>
+    <report reportName="TagExceptionsByMime"
+        reportFilename="tags/tag_exceptions_by_mime.xlsx"
+        format="xlsx"
+        includeSql="true">
+
+        <sql>
+            select mime_string, count(1) as CNT
+            from tags t
+            join profiles p on p.id=t.id
+            join mimes m on p.mime_id=m.mime_id
+            where tags_parse_exception=TRUE
+            group by mime_string
+            order by CNT desc
+        </sql>
+    </report>
+    <report reportName="Tag Exceptions Details"
+            reportFilename="tags/tag_exceptions_details.xlsx"
+            format="xlsx"
+            includeSql="true">
+
+        <sql>
+            select c.file_path,p.file_name,mime_string,is_embedded from
+            tags t
+            join profiles p on t.id=p.id
+            join containers c on p.container_id=c.container_id
+            join mimes m on p.mime_id=m.mime_id
+            where t.tags_parse_exception=true
+            order by m.mime_string
+            limit 20000
+        </sql>
+    </report>
+    <report reportName="Tags by Mime"
+            reportFilename="tags/tags_by_mime.xlsx"
+            format="xlsx"
+            includeSql="true">
+        <!-- per-mime tag totals for rows without a tag parse exception; mime_string is grouped explicitly because it is selected un-aggregated -->
+        <sql>
+            select mime_string,
+            sum(tags_a) as tags_a,
+            sum(tags_b) as tags_b,
+            sum(tags_div) as tags_div,
+            sum(tags_i) as tags_i,
+            sum(tags_img) as tags_img,
+            sum(tags_li) as tags_li,
+            sum(tags_ol) as tags_ol,
+            sum(tags_p) as tags_p,
+            sum(tags_table) as tags_table,
+            sum(tags_td) as tags_td,
+            sum(tags_title) as tags_title,
+            sum(tags_tr) as tags_tr,
+            sum(tags_u) as tags_u,
+            sum(tags_ul) as tags_ul
+
+            from tags t
+            join profiles p on t.id=p.id
+            join mimes m on p.mime_id=m.mime_id
+            where tags_parse_exception=false
+            group by m.mime_id, m.mime_string
+        </sql>
+
+    </report>
     <after>
 
         <!--<sql>drop index on x</sql>


[tika] 02/02: TIKA-2810 -- handle bad tags more robustly

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 517adc9d7056e90f48d132e27bfe3f44b8453338
Author: TALLISON <ta...@apache.org>
AuthorDate: Mon Jan 7 14:55:05 2019 -0500

    TIKA-2810 -- handle bad tags more robustly
---
 CHANGES.txt                                                   |  3 +++
 .../src/main/java/org/apache/tika/eval/AbstractProfiler.java  | 11 +++++++++--
 .../src/main/java/org/apache/tika/eval/util/ContentTags.java  |  6 +++++-
 .../test/java/org/apache/tika/eval/SimpleComparerTest.java    | 10 ++++++++++
 4 files changed, 27 insertions(+), 3 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index 938e0ea..df2559b 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -7,6 +7,9 @@ Release 2.0.0 - ???
 
 
 Release 1.21 - ????
+
+   * Handle bad tags in tika-eval more robustly (TIKA-2810).
+
    * Add reports for tags in tika-eval (TIKA-2809).
 
    * Extract text from SDT element within textboxes in .docx files (TIKA-2807).
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java b/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java
index e15dc13..3a633f7 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java
@@ -847,9 +847,16 @@ public abstract class AbstractProfiler extends FileResourceConsumer {
             try {
                 return ContentTagParser.parseXML(s, UC_TAGS_OF_INTEREST.keySet());
             } catch (TikaException|IOException|SAXException e) {
-                LOG.warn("Problem parsing xhtml in {}; backing off to treat string as text",
+                LOG.warn("Problem parsing xhtml in {}; backing off to html parser",
                         evalFilePaths.getExtractFile().toAbsolutePath().toString(), e);
-
+                try {
+                    ContentTags contentTags = ContentTagParser.parseHTML(s, UC_TAGS_OF_INTEREST.keySet());
+                    contentTags.setParseException(true);
+                    return contentTags;
+                } catch (IOException|SAXException e2) {
+                    LOG.warn("Problem parsing html in {}; backing off to treat string as text",
+                            evalFilePaths.getExtractFile().toAbsolutePath().toString(), e2);
+                }
                 return new ContentTags(s, true);
             }
         }
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/util/ContentTags.java b/tika-eval/src/main/java/org/apache/tika/eval/util/ContentTags.java
index 115976f..3f8c9a5 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/util/ContentTags.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/util/ContentTags.java
@@ -24,7 +24,7 @@ public class ContentTags {
     public static final ContentTags EMPTY_CONTENT_TAGS = new ContentTags();
     final Map<String, Integer> tags;
     final String content;
-    final boolean parseException;
+    boolean parseException;
 
     private ContentTags() {
         this("", Collections.EMPTY_MAP, false);
@@ -60,4 +60,8 @@ public class ContentTags {
     public boolean getParseException() {
         return parseException;
     }
+
+    public void setParseException(boolean parseException) {
+        this.parseException = parseException;
+    }
 }
diff --git a/tika-eval/src/test/java/org/apache/tika/eval/SimpleComparerTest.java b/tika-eval/src/test/java/org/apache/tika/eval/SimpleComparerTest.java
index d54d41c..96286be 100644
--- a/tika-eval/src/test/java/org/apache/tika/eval/SimpleComparerTest.java
+++ b/tika-eval/src/test/java/org/apache/tika/eval/SimpleComparerTest.java
@@ -346,6 +346,16 @@ public class SimpleComparerTest extends TikaTest {
         assertEquals(1, tableInfosA.size());
         Map<Cols, String> tableInfoA = tableInfosA.get(0);
         assertEquals("true", tableInfoA.get(Cols.TAGS_PARSE_EXCEPTION));
+
+        //confirm that backoff to html parser worked
+        List<Map<Cols, String>> contentsA = WRITER.getTable(ExtractComparer.CONTENTS_TABLE_A);
+        assertEquals(1, contentsA.size());
+        Map<Cols, String> contentsARow1 = contentsA.get(0);
+        String topN = contentsARow1.get(Cols.TOP_N_TOKENS);
+        assertNotContained("content:", topN);
+        assertNotContained(" p: ", topN);
+        assertContains("apache: 12", topN);
+
     }
 
     @Test