You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2019/01/07 19:55:18 UTC
[tika] 01/02: TIKA-2809 -- add reports for tags; and add "b" tag.
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git
commit b80c5174a1be94e28fa4051c85d53f22343c776d
Author: TALLISON <ta...@apache.org>
AuthorDate: Mon Jan 7 14:35:32 2019 -0500
TIKA-2809 -- add reports for tags; and add "b" tag.
---
CHANGES.txt | 2 +
.../org/apache/tika/eval/AbstractProfiler.java | 1 +
.../java/org/apache/tika/eval/ExtractProfiler.java | 1 +
.../main/java/org/apache/tika/eval/db/Cols.java | 1 +
.../src/main/resources/comparison-reports.xml | 533 ++++++++++++++++++++-
tika-eval/src/main/resources/profile-reports.xml | 61 +++
6 files changed, 598 insertions(+), 1 deletion(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index 48107a1..938e0ea 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -7,6 +7,8 @@ Release 2.0.0 - ???
Release 1.21 - ????
+ * Add reports for tags in tika-eval (TIKA-2809).
+
* Extract text from SDT element within textboxes in .docx files (TIKA-2807).
* Try to handle truncated OOXML files more robustly (TIKA-2765).
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java b/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java
index 307c54e..e15dc13 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java
@@ -122,6 +122,7 @@ public abstract class AbstractProfiler extends FileResourceConsumer {
//simplify this mess
Map<String, Cols> tmp = new HashMap<>();
tmp.put("A", Cols.TAGS_A);
+ tmp.put("B", Cols.TAGS_B);
tmp.put("DIV", Cols.TAGS_DIV);
tmp.put("I", Cols.TAGS_I);
tmp.put("IMG", Cols.TAGS_IMG);
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/ExtractProfiler.java b/tika-eval/src/main/java/org/apache/tika/eval/ExtractProfiler.java
index d1b1ac6..1ab67ae 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/ExtractProfiler.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/ExtractProfiler.java
@@ -158,6 +158,7 @@ public class ExtractProfiler extends AbstractProfiler {
public static TableInfo TAGS_TABLE = new TableInfo("tags",
new ColInfo(Cols.ID, Types.INTEGER, "PRIMARY KEY"),
new ColInfo(Cols.TAGS_A, Types.INTEGER),
+ new ColInfo(Cols.TAGS_B, Types.INTEGER),
new ColInfo(Cols.TAGS_DIV, Types.INTEGER),
new ColInfo(Cols.TAGS_I, Types.INTEGER),
new ColInfo(Cols.TAGS_IMG, Types.INTEGER),
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/db/Cols.java b/tika-eval/src/main/java/org/apache/tika/eval/db/Cols.java
index db8c1d0..f3b212c 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/db/Cols.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/db/Cols.java
@@ -94,6 +94,7 @@ public enum Cols {
//structure tags
TAGS_A,
+ TAGS_B,
TAGS_DIV,
TAGS_I,
TAGS_IMG,
diff --git a/tika-eval/src/main/resources/comparison-reports.xml b/tika-eval/src/main/resources/comparison-reports.xml
index eaf3bb6..7c20ffd 100644
--- a/tika-eval/src/main/resources/comparison-reports.xml
+++ b/tika-eval/src/main/resources/comparison-reports.xml
@@ -218,6 +218,453 @@
);
</sql>
+ <sql>drop table if exists tags_by_mime</sql>
+ <sql>create table tags_by_mime (
+ mime_id_a integer,
+ mime_id_b integer,
+ tags_a_a integer,
+ tags_b_a integer,
+ tags_div_a integer,
+ tags_i_a integer,
+ tags_img_a integer,
+ tags_li_a integer,
+ tags_ol_a integer,
+ tags_p_a integer,
+ tags_table_a integer,
+ tags_td_a integer,
+ tags_title_a integer,
+ tags_tr_a integer,
+ tags_u_a integer,
+ tags_ul_a integer,
+ tags_a_b integer,
+ tags_b_b integer,
+ tags_div_b integer,
+ tags_i_b integer,
+ tags_img_b integer,
+ tags_li_b integer,
+ tags_ol_b integer,
+ tags_p_b integer,
+ tags_table_b integer,
+ tags_td_b integer,
+ tags_title_b integer,
+ tags_tr_b integer,
+ tags_u_b integer,
+ tags_ul_b integer
+ );
+ </sql>
+ <!-- seed one row per distinct (mime_id in A, mime_id in B) pair among ids
+ present in both profiles_a and profiles_b; the tag count columns are not set here -->
+ <sql>
+ insert into tags_by_mime (mime_id_a, mime_id_b)
+ select ma.mime_id, mb.mime_id
+ from profiles_a a
+ join profiles_b b on a.id=b.id
+ join mimes ma on ma.mime_id=a.mime_id
+ join mimes mb on mb.mime_id=b.mime_id
+ group by ma.mime_id, mb.mime_id
+ </sql>
+ <!-- tags_a_a: sum of tags_a.tags_a over the ids belonging to this row's
+ (mime_id_a, mime_id_b) pair, restricted to ids whose tags_parse_exception
+ is false in both tags_a and tags_b -->
+ <sql>
+ update tags_by_mime tbm set tags_a_a=(
+ select sum(ta.tags_a) as cnt from tags_a ta
+ join tags_b tb on ta.id=tb.id
+ join profiles_a pa on pa.id=ta.id
+ join profiles_b pb on pa.id=pb.id
+ where pb.mime_id= tbm.mime_id_b
+ and pa.mime_id=tbm.mime_id_a
+ and ta.tags_parse_exception=false
+ and tb.tags_parse_exception=false
+ group by mime_id_a, mime_id_b
+ );
+ </sql>
+ <sql>
+ update tags_by_mime tbm set tags_b_a=(
+ select sum(ta.tags_b) as cnt from tags_a ta
+ join tags_b tb on ta.id=tb.id
+ join profiles_a pa on pa.id=ta.id
+ join profiles_b pb on pa.id=pb.id
+ where pb.mime_id= tbm.mime_id_b
+ and pa.mime_id=tbm.mime_id_a
+ and ta.tags_parse_exception=false
+ and tb.tags_parse_exception=false
+ group by mime_id_a, mime_id_b
+ );
+ </sql>
+ <sql>
+ update tags_by_mime tbm set tags_div_a=(
+ select sum(ta.tags_div) as cnt from tags_a ta
+ join tags_b tb on ta.id=tb.id
+ join profiles_a pa on pa.id=ta.id
+ join profiles_b pb on pa.id=pb.id
+ where pb.mime_id= tbm.mime_id_b
+ and pa.mime_id=tbm.mime_id_a
+ and ta.tags_parse_exception=false
+ and tb.tags_parse_exception=false
+ group by mime_id_a, mime_id_b
+ );
+ </sql>
+ <sql>
+ update tags_by_mime tbm set tags_i_a=(
+ select sum(ta.tags_i) as cnt from tags_a ta
+ join tags_b tb on ta.id=tb.id
+ join profiles_a pa on pa.id=ta.id
+ join profiles_b pb on pa.id=pb.id
+ where pb.mime_id= tbm.mime_id_b
+ and pa.mime_id=tbm.mime_id_a
+ and ta.tags_parse_exception=false
+ and tb.tags_parse_exception=false
+ group by mime_id_a, mime_id_b
+ );
+ </sql>
+ <sql>
+ update tags_by_mime tbm set tags_img_a=(
+ select sum(ta.tags_img) as cnt from tags_a ta
+ join tags_b tb on ta.id=tb.id
+ join profiles_a pa on pa.id=ta.id
+ join profiles_b pb on pa.id=pb.id
+ where pb.mime_id= tbm.mime_id_b
+ and pa.mime_id=tbm.mime_id_a
+ and ta.tags_parse_exception=false
+ and tb.tags_parse_exception=false
+ group by mime_id_a, mime_id_b
+ );
+ </sql>
+ <sql>
+ update tags_by_mime tbm set tags_li_a=(
+ select sum(ta.tags_li) as cnt from tags_a ta
+ join tags_b tb on ta.id=tb.id
+ join profiles_a pa on pa.id=ta.id
+ join profiles_b pb on pa.id=pb.id
+ where pb.mime_id= tbm.mime_id_b
+ and pa.mime_id=tbm.mime_id_a
+ and ta.tags_parse_exception=false
+ and tb.tags_parse_exception=false
+ group by mime_id_a, mime_id_b
+ );
+ </sql>
+ <sql>
+ update tags_by_mime tbm set tags_ol_a=(
+ select sum(ta.tags_ol) as cnt from tags_a ta
+ join tags_b tb on ta.id=tb.id
+ join profiles_a pa on pa.id=ta.id
+ join profiles_b pb on pa.id=pb.id
+ where pb.mime_id= tbm.mime_id_b
+ and pa.mime_id=tbm.mime_id_a
+ and ta.tags_parse_exception=false
+ and tb.tags_parse_exception=false
+ group by mime_id_a, mime_id_b
+ );
+ </sql>
+ <sql>
+ update tags_by_mime tbm set tags_p_a=(
+ select sum(ta.tags_p) as cnt from tags_a ta
+ join tags_b tb on ta.id=tb.id
+ join profiles_a pa on pa.id=ta.id
+ join profiles_b pb on pa.id=pb.id
+ where pb.mime_id= tbm.mime_id_b
+ and pa.mime_id=tbm.mime_id_a
+ and ta.tags_parse_exception=false
+ and tb.tags_parse_exception=false
+ group by mime_id_a, mime_id_b
+ );
+ </sql>
+ <sql>
+ update tags_by_mime tbm set tags_table_a=(
+ select sum(ta.tags_table) as cnt from tags_a ta
+ join tags_b tb on ta.id=tb.id
+ join profiles_a pa on pa.id=ta.id
+ join profiles_b pb on pa.id=pb.id
+ where pb.mime_id= tbm.mime_id_b
+ and pa.mime_id=tbm.mime_id_a
+ and ta.tags_parse_exception=false
+ and tb.tags_parse_exception=false
+ group by mime_id_a, mime_id_b
+ );
+ </sql>
+ <sql>
+ update tags_by_mime tbm set tags_td_a=(
+ select sum(ta.tags_td) as cnt from tags_a ta
+ join tags_b tb on ta.id=tb.id
+ join profiles_a pa on pa.id=ta.id
+ join profiles_b pb on pa.id=pb.id
+ where pb.mime_id= tbm.mime_id_b
+ and pa.mime_id=tbm.mime_id_a
+ and ta.tags_parse_exception=false
+ and tb.tags_parse_exception=false
+ group by mime_id_a, mime_id_b
+ );
+ </sql>
+ <sql>
+ update tags_by_mime tbm set tags_title_a=(
+ select sum(ta.tags_title) as cnt from tags_a ta
+ join tags_b tb on ta.id=tb.id
+ join profiles_a pa on pa.id=ta.id
+ join profiles_b pb on pa.id=pb.id
+ where pb.mime_id= tbm.mime_id_b
+ and pa.mime_id=tbm.mime_id_a
+ and ta.tags_parse_exception=false
+ and tb.tags_parse_exception=false
+ group by mime_id_a, mime_id_b
+ );
+ </sql>
+ <sql>
+ update tags_by_mime tbm set tags_tr_a=(
+ select sum(ta.tags_tr) as cnt from tags_a ta
+ join tags_b tb on ta.id=tb.id
+ join profiles_a pa on pa.id=ta.id
+ join profiles_b pb on pa.id=pb.id
+ where pb.mime_id= tbm.mime_id_b
+ and pa.mime_id=tbm.mime_id_a
+ and ta.tags_parse_exception=false
+ and tb.tags_parse_exception=false
+ group by mime_id_a, mime_id_b
+ );
+ </sql>
+ <sql>
+ update tags_by_mime tbm set tags_u_a=(
+ select sum(ta.tags_u) as cnt from tags_a ta
+ join tags_b tb on ta.id=tb.id
+ join profiles_a pa on pa.id=ta.id
+ join profiles_b pb on pa.id=pb.id
+ where pb.mime_id= tbm.mime_id_b
+ and pa.mime_id=tbm.mime_id_a
+ and ta.tags_parse_exception=false
+ and tb.tags_parse_exception=false
+ group by mime_id_a, mime_id_b
+ );
+ </sql>
+ <sql>
+ update tags_by_mime tbm set tags_ul_a=(
+ select sum(ta.tags_ul) as cnt from tags_a ta
+ join tags_b tb on ta.id=tb.id
+ join profiles_a pa on pa.id=ta.id
+ join profiles_b pb on pa.id=pb.id
+ where pb.mime_id= tbm.mime_id_b
+ and pa.mime_id=tbm.mime_id_a
+ and ta.tags_parse_exception=false
+ and tb.tags_parse_exception=false
+ group by mime_id_a, mime_id_b
+ );
+ </sql>
+ <!-- now update tags_b counts -->
+ <sql>
+ update tags_by_mime tbm set tags_a_b=(
+ select sum(tb.tags_a) as cnt from tags_b tb
+ join tags_a ta on ta.id=tb.id
+ join profiles_a pa on pa.id=ta.id
+ join profiles_b pb on pa.id=pb.id
+ where pb.mime_id= tbm.mime_id_b
+ and pa.mime_id=tbm.mime_id_a
+ and ta.tags_parse_exception=false
+ and tb.tags_parse_exception=false
+ group by mime_id_a, mime_id_b
+ );
+ </sql>
+ <sql>
+ update tags_by_mime tbm set tags_b_b=(
+ select sum(tb.tags_b) as cnt from tags_b tb
+ join tags_a ta on ta.id=tb.id
+ join profiles_a pa on pa.id=ta.id
+ join profiles_b pb on pa.id=pb.id
+ where pb.mime_id= tbm.mime_id_b
+ and pa.mime_id=tbm.mime_id_a
+ and ta.tags_parse_exception=false
+ and tb.tags_parse_exception=false
+ group by mime_id_a, mime_id_b
+ );
+ </sql>
+ <sql>
+ update tags_by_mime tbm set tags_div_b=(
+ select sum(tb.tags_div) as cnt from tags_b tb
+ join tags_a ta on tb.id=ta.id
+ join profiles_a pa on pa.id=ta.id
+ join profiles_b pb on pa.id=pb.id
+ where pb.mime_id= tbm.mime_id_b
+ and pa.mime_id=tbm.mime_id_a
+ and ta.tags_parse_exception=false
+ and tb.tags_parse_exception=false
+ group by mime_id_a, mime_id_b
+ );
+ </sql>
+ <sql>
+ update tags_by_mime tbm set tags_i_b=(
+ select sum(tb.tags_i) as cnt from tags_b tb
+ join tags_a ta on tb.id=ta.id
+ join profiles_a pa on pa.id=ta.id
+ join profiles_b pb on pa.id=pb.id
+ where pb.mime_id= tbm.mime_id_b
+ and pa.mime_id=tbm.mime_id_a
+ and ta.tags_parse_exception=false
+ and tb.tags_parse_exception=false
+ group by mime_id_a, mime_id_b
+ );
+ </sql>
+ <sql>
+ update tags_by_mime tbm set tags_img_b=(
+ select sum(tb.tags_img) as cnt from tags_b tb
+ join tags_a ta on tb.id=ta.id
+ join profiles_a pa on pa.id=ta.id
+ join profiles_b pb on pa.id=pb.id
+ where pb.mime_id= tbm.mime_id_b
+ and pa.mime_id=tbm.mime_id_a
+ and ta.tags_parse_exception=false
+ and tb.tags_parse_exception=false
+ group by mime_id_a, mime_id_b
+ );
+ </sql>
+ <sql>
+ update tags_by_mime tbm set tags_li_b=(
+ select sum(tb.tags_li) as cnt from tags_b tb
+ join tags_a ta on ta.id=tb.id
+ join profiles_a pa on pa.id=ta.id
+ join profiles_b pb on pa.id=pb.id
+ where pb.mime_id= tbm.mime_id_b
+ and pa.mime_id=tbm.mime_id_a
+ and ta.tags_parse_exception=false
+ and tb.tags_parse_exception=false
+ group by mime_id_a, mime_id_b
+ );
+ </sql>
+ <sql>
+ update tags_by_mime tbm set tags_ol_b=(
+ select sum(tb.tags_ol) as cnt from tags_b tb
+ join tags_a ta on tb.id=ta.id
+ join profiles_a pa on pa.id=ta.id
+ join profiles_b pb on pa.id=pb.id
+ where pb.mime_id= tbm.mime_id_b
+ and pa.mime_id=tbm.mime_id_a
+ and ta.tags_parse_exception=false
+ and tb.tags_parse_exception=false
+ group by mime_id_a, mime_id_b
+ );
+ </sql>
+ <sql>
+ update tags_by_mime tbm set tags_p_b=(
+ select sum(tb.tags_p) as cnt from tags_b tb
+ join tags_a ta on tb.id=ta.id
+ join profiles_a pa on pa.id=ta.id
+ join profiles_b pb on pa.id=pb.id
+ where pb.mime_id= tbm.mime_id_b
+ and pa.mime_id=tbm.mime_id_a
+ and ta.tags_parse_exception=false
+ and tb.tags_parse_exception=false
+ group by mime_id_a, mime_id_b
+ );
+ </sql>
+ <sql>
+ update tags_by_mime tbm set tags_table_b=(
+ select sum(tb.tags_table) as cnt from tags_b tb
+ join tags_a ta on tb.id=ta.id
+ join profiles_a pa on pa.id=ta.id
+ join profiles_b pb on pa.id=pb.id
+ where pb.mime_id= tbm.mime_id_b
+ and pa.mime_id=tbm.mime_id_a
+ and ta.tags_parse_exception=false
+ and tb.tags_parse_exception=false
+ group by mime_id_a, mime_id_b
+ );
+ </sql>
+ <sql>
+ update tags_by_mime tbm set tags_td_b=(
+ select sum(tb.tags_td) as cnt from tags_b tb
+ join tags_a ta on tb.id=ta.id
+ join profiles_a pa on pa.id=ta.id
+ join profiles_b pb on pa.id=pb.id
+ where pb.mime_id= tbm.mime_id_b
+ and pa.mime_id=tbm.mime_id_a
+ and ta.tags_parse_exception=false
+ and tb.tags_parse_exception=false
+ group by mime_id_a, mime_id_b
+ );
+ </sql>
+ <sql>
+ update tags_by_mime tbm set tags_title_b=(
+ select sum(tb.tags_title) as cnt from tags_b tb
+ join tags_a ta on tb.id=ta.id
+ join profiles_a pa on pa.id=ta.id
+ join profiles_b pb on pa.id=pb.id
+ where pb.mime_id= tbm.mime_id_b
+ and pa.mime_id=tbm.mime_id_a
+ and ta.tags_parse_exception=false
+ and tb.tags_parse_exception=false
+ group by mime_id_a, mime_id_b
+ );
+ </sql>
+ <sql>
+ update tags_by_mime tbm set tags_tr_b=(
+ select sum(tb.tags_tr) as cnt from tags_b tb
+ join tags_a ta on tb.id=ta.id
+ join profiles_a pa on pa.id=ta.id
+ join profiles_b pb on pa.id=pb.id
+ where pb.mime_id= tbm.mime_id_b
+ and pa.mime_id=tbm.mime_id_a
+ and ta.tags_parse_exception=false
+ and tb.tags_parse_exception=false
+ group by mime_id_a, mime_id_b
+ );
+ </sql>
+ <sql>
+ update tags_by_mime tbm set tags_u_b=(
+ select sum(tb.tags_u) as cnt from tags_b tb
+ join tags_a ta on tb.id=ta.id
+ join profiles_a pa on pa.id=ta.id
+ join profiles_b pb on pa.id=pb.id
+ where pb.mime_id= tbm.mime_id_b
+ and pa.mime_id=tbm.mime_id_a
+ and ta.tags_parse_exception=false
+ and tb.tags_parse_exception=false
+ group by mime_id_a, mime_id_b
+ );
+ </sql>
+ <sql>
+ update tags_by_mime tbm set tags_ul_b=(
+ select sum(tb.tags_ul) as cnt from tags_b tb
+ join tags_a ta on ta.id=tb.id
+ join profiles_a pa on pa.id=ta.id
+ join profiles_b pb on pa.id=pb.id
+ where pb.mime_id= tbm.mime_id_b
+ and pa.mime_id=tbm.mime_id_a
+ and ta.tags_parse_exception=false
+ and tb.tags_parse_exception=false
+ group by mime_id_a, mime_id_b
+ );
+ </sql>
+ <sql>drop table if exists tag_exceptions_by_mime</sql>
+ <sql>create table tag_exceptions_by_mime (
+ mime_id_a integer,
+ mime_id_b integer,
+ tag_exceptions_a integer,
+ tag_exceptions_b integer)
+ </sql>
+ <sql>
+ insert into tag_exceptions_by_mime (mime_id_a, mime_id_b,
+ tag_exceptions_a, tag_exceptions_b)
+ select ma.mime_id, mb.mime_id,0,0
+ from profiles_a a
+ join profiles_b b on a.id=b.id
+ join mimes ma on ma.mime_id=a.mime_id
+ join mimes mb on mb.mime_id=b.mime_id
+ group by ma.mime_id, mb.mime_id
+ </sql>
+ <!-- count of ids in this mime pair whose tags_a row has tags_parse_exception=true.
+ With the group by, a pair with no matching rows makes the subquery return no row
+ (NULL); coalesce keeps the column at 0 so the report's b-minus-a diff stays numeric -->
+ <sql>
+ update tag_exceptions_by_mime tebm set tag_exceptions_a=coalesce((
+ select count(1) as cnt from tags_a ta
+ join profiles_a pa on pa.id=ta.id
+ join profiles_b pb on pa.id=pb.id
+ where pb.mime_id= tebm.mime_id_b
+ and pa.mime_id=tebm.mime_id_a
+ and ta.tags_parse_exception=true
+ group by mime_id_a, mime_id_b
+ ), 0);
+ </sql>
+ <!-- count of ids in this mime pair whose tags_b row has tags_parse_exception=true.
+ With the group by, a pair with no matching rows makes the subquery return no row
+ (NULL); coalesce keeps the column at 0 so the report's b-minus-a diff stays numeric -->
+ <sql>
+ update tag_exceptions_by_mime tebm set tag_exceptions_b=coalesce((
+ select count(1) as cnt from tags_b tb
+ join profiles_a pa on pa.id=tb.id
+ join profiles_b pb on pa.id=pb.id
+ where pb.mime_id= tebm.mime_id_b
+ and pa.mime_id=tebm.mime_id_a
+ and tb.tags_parse_exception=true
+ group by mime_id_a, mime_id_b
+ ), 0);
+ </sql>
</before>
<!-- MIMES -->
@@ -1018,8 +1465,92 @@
pb.num_metadata_values-pa.num_metadata_values
</sql>
</report>
+ <!-- one row per (mime A, mime B) pair from tags_by_mime, listing the A and B
+ value of every tag count column side by side (img columns included) -->
+ <report reportName="Tag Count Diffs By Mime"
+ reportFilename="tags/tag_count_diffs_by_mime.xlsx"
+ format="xlsx"
+ includeSql="true">
+ <sql>
+ select ma.mime_string as mime_string_a,
+ mb.mime_string as mime_string_b,
+ tags_a_a,
+ tags_a_b,
+ tags_b_a,
+ tags_b_b,
+ tags_div_a,
+ tags_div_b,
+ tags_i_a,
+ tags_i_b,
+ tags_img_a,
+ tags_img_b,
+ tags_li_a,
+ tags_li_b,
+ tags_ol_a,
+ tags_ol_b,
+ tags_p_a,
+ tags_p_b,
+ tags_table_a,
+ tags_table_b,
+ tags_td_a,
+ tags_td_b,
+ tags_title_a,
+ tags_title_b,
+ tags_tr_a,
+ tags_tr_b,
+ tags_u_a,
+ tags_u_b,
+ tags_ul_a,
+ tags_ul_b
+ from
+ tags_by_mime tbm
+ join mimes ma on tbm.mime_id_a=ma.mime_id
+ join mimes mb on tbm.mime_id_b=mb.mime_id
+ </sql>
- <after>
+ </report>
+ <report reportName="Tag Exceptions By Mime"
+ reportFilename="tags/tag_exceptions_by_mime.xlsx"
+ format="xlsx"
+ includeSql="true">
+ <sql>
+ select ma.mime_string as mime_string_a,
+ mb.mime_string as mime_string_b,
+ tag_exceptions_a,
+ tag_exceptions_b,
+ (tag_exceptions_b-tag_exceptions_a) as diff_tag_exceptions_in_b
+ from tag_exceptions_by_mime tebm
+ join mimes ma on tebm.mime_id_a=ma.mime_id
+ join mimes mb on tebm.mime_id_b=mb.mime_id
+ order by diff_tag_exceptions_in_b desc
+ </sql>
+ </report>
+ <report reportName="Tag Exceptions Details A"
+ reportFilename="tags/tag_exceptions_details_a.xlsx"
+ format="xlsx"
+ includeSql="true">
+ <sql>
+ select c.file_path,pa.file_name,mime_string,is_embedded from
+ tags_a ta
+ join profiles_a pa on ta.id=pa.id
+ join containers c on pa.container_id=c.container_id
+ join mimes m on pa.mime_id=m.mime_id
+ where ta.tags_parse_exception=true
+ order by m.mime_string
+ limit 20000
+ </sql>
+ </report>
+ <report reportName="Tag Exceptions Details B"
+ reportFilename="tags/tag_exceptions_details_b.xlsx"
+ format="xlsx"
+ includeSql="true">
+ <sql>
+ select c.file_path,pb.file_name,mime_string,is_embedded from
+ tags_b tb
+ join profiles_b pb on tb.id=pb.id
+ join containers c on pb.container_id=c.container_id
+ join mimes m on pb.mime_id=m.mime_id
+ where tb.tags_parse_exception=true
+ order by m.mime_string
+ limit 20000
+ </sql>
+ </report> <after>
<sql>drop table if exists md5_multiples_tmp_a</sql>
<sql>drop table if exists md5_multiples_tmp_b</sql>
</after>
diff --git a/tika-eval/src/main/resources/profile-reports.xml b/tika-eval/src/main/resources/profile-reports.xml
index d31606f..028a7f4 100644
--- a/tika-eval/src/main/resources/profile-reports.xml
+++ b/tika-eval/src/main/resources/profile-reports.xml
@@ -260,6 +260,67 @@
CONTAINER_LENGTH asc
</sql>
</report>
+ <report reportName="TagExceptionsByMime"
+ reportFilename="tags/tag_exceptions_by_mime.xlsx"
+ format="xlsx"
+ includeSql="true">
+
+ <sql>
+ select mime_string, count(1) as CNT
+ from tags t
+ join profiles p on p.id=t.id
+ join mimes m on p.mime_id=m.mime_id
+ where tags_parse_exception=TRUE
+ group by mime_string
+ order by CNT desc
+ </sql>
+ </report>
+ <report reportName="Tag Exceptions Details"
+ reportFilename="tags/tag_exceptions_details.xlsx"
+ format="xlsx"
+ includeSql="true">
+
+ <sql>
+ select c.file_path,p.file_name,mime_string,is_embedded from
+ tags t
+ join profiles p on t.id=p.id
+ join containers c on p.container_id=c.container_id
+ join mimes m on p.mime_id=m.mime_id
+ where t.tags_parse_exception=true
+ order by m.mime_string
+ limit 20000
+ </sql>
+ </report>
+ <!-- per-mime totals of each tag count, over rows whose tags_parse_exception is false.
+ mime_string is selected un-aggregated, so it is listed in the group by as well;
+ mime_id stays in the group by so rows are still grouped per mime_id -->
+ <report reportName="Tags by Mime"
+ reportFilename="tags/tags_by_mime.xlsx"
+ format="xlsx"
+ includeSql="true">
+
+ <sql>
+ select mime_string,
+ sum(tags_a) as tags_a,
+ sum(tags_b) as tags_b,
+ sum(tags_div) as tags_div,
+ sum(tags_i) as tags_i,
+ sum(tags_img) as tags_img,
+ sum(tags_li) as tags_li,
+ sum(tags_ol) as tags_ol,
+ sum(tags_p) as tags_p,
+ sum(tags_table) as tags_table,
+ sum(tags_td) as tags_td,
+ sum(tags_title) as tags_title,
+ sum(tags_tr) as tags_tr,
+ sum(tags_u) as tags_u,
+ sum(tags_ul) as tags_ul
+
+ from tags t
+ join profiles p on t.id=p.id
+ join mimes m on p.mime_id=m.mime_id
+ where tags_parse_exception=false
+ group by m.mime_id, m.mime_string
+ </sql>
+
+ </report>
<after>
<!--<sql>drop index on x</sql>