You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2017/09/25 20:32:53 UTC
[tika] 02/02: TIKA-2268 -- add more reports and fix div by 0 bug
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git
commit 2748538fbb66fa0bd9c0f5f18c87826260dcd227
Author: tballison <ta...@mitre.org>
AuthorDate: Mon Sep 25 16:32:20 2017 -0400
TIKA-2268 -- add more reports and fix div by 0 bug
---
tika-eval/src/main/resources/profile-reports.xml | 110 +++++++++++++++++++++--
1 file changed, 105 insertions(+), 5 deletions(-)
diff --git a/tika-eval/src/main/resources/profile-reports.xml b/tika-eval/src/main/resources/profile-reports.xml
index 75cdaf8..d31606f 100644
--- a/tika-eval/src/main/resources/profile-reports.xml
+++ b/tika-eval/src/main/resources/profile-reports.xml
@@ -112,20 +112,63 @@
</report>
<report reportName="Common Tokens Divided by Alphabetic Tokens"
- reportFilename="content/common_tokens_div_alphabetic.xlsx"
+ reportFilename="content/common_tokens_div_alphabetic_exclude_media_and_zips.xlsx"
format="xlsx"
includeSql="true">
<!-- 0.50 is a complete heuristic -->
+ <!-- Image, video, audio and application/zip files are excluded. Files with zero alphabetic tokens are kept and reported with a ratio of 0; nullif() turns a zero divisor into NULL so the where clause cannot divide by zero whatever order the database evaluates the OR in. -->
<sql>
- select file_path, lang_id_1, common_tokens_lang,
+ select file_path, file_name, is_embedded,
+ mime_string, lang_id_1, common_tokens_lang,
num_tokens, num_alphabetic_tokens, num_common_tokens,
- cast(num_common_tokens as decimal)/cast(num_alphabetic_tokens as decimal) common_div_alphabetic
+ case
+ when num_alphabetic_tokens > 0
+ then cast(num_common_tokens as decimal)/cast(num_alphabetic_tokens as decimal)
+ else 0
+ end as common_div_alphabetic
from contents c
join profiles p on p.id=c.id
join containers ct on ct.container_id=p.container_id
- where cast(num_alphabetic_tokens as decimal) > 0.0
- and cast(num_common_tokens as decimal)/cast(num_alphabetic_tokens as decimal) < 0.50
+ join mimes m on p.mime_id=m.mime_id
+ where
+ (num_alphabetic_tokens = 0
+ or cast(num_common_tokens as decimal)/nullif(cast(num_alphabetic_tokens as decimal), 0) < 0.50
+ )
+ and mime_string not like 'image%'
+ and mime_string not like 'video%'
+ and mime_string not like 'audio%'
+ and mime_string not like 'application/zip'
order by common_div_alphabetic asc
+ limit 10000
+ </sql>
+ </report>
+
+
+ <!-- Tokens per page for non-embedded files that report at least one page,
+ sorted ascending and capped at 1000 rows. MSWord files usually store 1 or 0 rather than
+ the actual # of pages (the application calculates it dynamically when the file is loaded),
+ so their tokens/page counts can be crazily high; the focus of this query is on the low end.
+ -->
+ <report reportName="Tokens Per Page"
+ reportFilename="content/tokens_per_page_in_container_files.xlsx"
+ format="xlsx"
+ includeSql="true">
+ <sql>
+ select file_path, mime_string, num_tokens,
+ num_pages,
+ case
+ when num_tokens = 0
+ then 0
+ else
+ cast(num_tokens as decimal)/cast(num_pages as decimal)
+ end as num_tokens_div_num_pages
+ from profiles p
+ left join contents c on p.id=c.id
+ join mimes m on p.mime_id = m.mime_id
+ join containers ct on p.container_id=ct.container_id
+ where num_pages is not null and num_pages > 0
+ and is_embedded=false
+ order by num_tokens_div_num_pages asc
+ limit 1000
</sql>
</report>
@@ -160,6 +203,63 @@
order by cnt desc;
</sql>
</report>
+
+ <report reportName="AllExceptionsByMimeByType"
+ reportFilename="exceptions/exceptions_by_mime_by_type.xlsx"
+ format="xlsx"
+ includeSql="true">
+ <!-- Counts parse exceptions per mime type and exception description. mime_string is grouped alongside mime_id so the select list stays valid on databases that reject non-grouped columns. -->
+ <sql>
+ select mime_string as MIME_TYPE,
+ parse_exception_description as EXCEPTION_TYPE, count(1) as COUNT
+ from parse_exceptions e
+ join profiles p on p.id=e.id
+ join containers c on p.container_id=c.container_id
+ join mimes m on m.mime_id=p.mime_id
+ join ref_parse_exception_types r on
+ r.parse_exception_id=e.parse_exception_id
+ group by p.mime_id, mime_string, parse_exception_description
+ order by MIME_TYPE, EXCEPTION_TYPE
+ </sql>
+ </report>
+
+ <report reportName="StackTracesByMime"
+ reportFilename="exceptions/stack_traces_by_mime.xlsx"
+ format="xlsx"
+ includeSql="true">
+ <!-- Counts each sort_stack_trace per mime type, most frequent first within a mime type. Only rows with parse_exception_id=0 are joined. NOTE(review): presumably 0 is the id for exceptions that carry a stack trace; confirm against ref_parse_exception_types. -->
+ <sql>
+ select MIME_STRING as MIME_TYPE, e.sort_stack_trace, count(1) as
+ COUNT
+ from parse_exceptions e
+ join profiles p on p.id=e.id
+ join mimes m on m.mime_id=p.mime_id
+ and e.parse_exception_id=0
+ group by MIME_TYPE, e.sort_stack_trace
+ order by MIME_TYPE asc, COUNT desc
+ </sql>
+ </report>
+
+ <report reportName="AllStackTraces"
+ reportFilename="exceptions/stack_traces_details.xlsx"
+ format="xlsx"
+ includeSql="true">
+ <!-- Lists file path, file name, embedded flag, container length, mime type and both stack trace columns for parse exceptions with parse_exception_id=0, ordered by mime type, then stack traces, then container length. NOTE(review): presumably 0 is the id for exceptions that carry a stack trace; confirm against ref_parse_exception_types. -->
+ <sql>
+ select file_path,
+ file_name, is_embedded,
+ c.length as CONTAINER_LENGTH,
+ mime_string as MIME_TYPE,
+ orig_stack_trace, sort_stack_trace
+ from parse_exceptions e
+ join profiles p on p.id=e.id
+ join containers c on p.container_id=c.container_id
+ join mimes m on m.mime_id=p.mime_id
+ and e.parse_exception_id=0
+ order by MIME_TYPE asc, sort_stack_trace, orig_stack_trace,
+ CONTAINER_LENGTH asc
+ </sql>
+ </report>
<after>
<!--<sql>drop index on x</sql>
--
To stop receiving notification emails like this one, please contact
"commits@tika.apache.org" <co...@tika.apache.org>.