You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2017/09/25 20:32:53 UTC
[tika] 02/02: TIKA-2268 -- add more reports and fix div by 0 bug

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 2748538fbb66fa0bd9c0f5f18c87826260dcd227
Author: tballison <ta...@mitre.org>
AuthorDate: Mon Sep 25 16:32:20 2017 -0400

    TIKA-2268 -- add more reports and fix div by 0 bug
---
 tika-eval/src/main/resources/profile-reports.xml | 110 +++++++++++++++++++++--
 1 file changed, 105 insertions(+), 5 deletions(-)

diff --git a/tika-eval/src/main/resources/profile-reports.xml b/tika-eval/src/main/resources/profile-reports.xml
index 75cdaf8..d31606f 100644
--- a/tika-eval/src/main/resources/profile-reports.xml
+++ b/tika-eval/src/main/resources/profile-reports.xml
@@ -112,20 +112,63 @@
     </report>
 
     <report reportName="Common Tokens Divided by Alphabetic Tokens"
-            reportFilename="content/common_tokens_div_alphabetic.xlsx"
+            reportFilename="content/common_tokens_div_alphabetic_exclude_media_and_zips.xlsx"
             format="xlsx"
             includeSql="true">
         <!-- 0.50 is a complete heuristic -->
         <sql>
-            select file_path, lang_id_1, common_tokens_lang,
+            select file_path, file_name, is_embedded,
+            mime_string, lang_id_1, common_tokens_lang,
             num_tokens, num_alphabetic_tokens, num_common_tokens,
-            cast(num_common_tokens as decimal)/cast(num_alphabetic_tokens as decimal) common_div_alphabetic
+            case
+                when num_alphabetic_tokens &gt; 0
+                then cast(num_common_tokens as decimal)/cast(num_alphabetic_tokens as decimal)
+                else 0
+            end as common_div_alphabetic
             from contents c
             join profiles p on p.id=c.id
             join containers ct on ct.container_id=p.container_id
-            where cast(num_alphabetic_tokens as decimal) > 0.0
-                and cast(num_common_tokens as decimal)/cast(num_alphabetic_tokens as decimal) &lt; 0.50
+            join mimes m on p.mime_id=m.mime_id
+            where
+                /* nullif: the ratio cannot divide by 0 even if both sides of the OR are evaluated */
+                (num_alphabetic_tokens = 0 or cast(num_common_tokens as decimal)
+                    /nullif(cast(num_alphabetic_tokens as decimal), 0) &lt; 0.50)
+            and mime_string not like 'image%'
+            and mime_string not like 'video%'
+            and mime_string not like 'audio%'
+            and mime_string not like 'application/zip'
             order by common_div_alphabetic asc
+            limit 10000
+        </sql>
+    </report>
+
+
+    <!-- Tokens per page for non-embedded files that report at least one page.
+         Caveat: MSWord files usually store 1 or 0 rather than the real page count
+         (the application calculates the actual number dynamically at load time),
+         so their tokens/page values can be crazily high.  That is acceptable here
+         because the focus of this report is on the low end. -->
+    <report reportName="Tokens Per Page"
+            reportFilename="content/tokens_per_page_in_container_files.xlsx"
+            format="xlsx"
+            includeSql="true">
+        <sql>
+            select file_path, mime_string,
+            num_tokens, num_pages,
+            case
+                when num_tokens = 0 then 0
+                else cast(num_tokens as decimal)/cast(num_pages as decimal)
+            end
+            as num_tokens_div_num_pages
+            from profiles p
+            left join contents c on c.id=p.id
+            join mimes m on m.mime_id=p.mime_id
+            join containers ct on ct.container_id=p.container_id
+            where num_pages is not null
+            and num_pages &gt; 0
+            and is_embedded=false
+            order by num_tokens_div_num_pages asc
+            limit 1000
+        </sql>
+    </report>
 
@@ -160,6 +203,63 @@
             order by cnt desc;
         </sql>
     </report>
+
+    <report reportName="AllExceptionsByMimeByType"
+            reportFilename="exceptions/exceptions_by_mime_by_type.xlsx"
+            format="xlsx"
+            includeSql="true">
+        <!-- Exception counts per mime type and exception description; every selected non-aggregate column is in the group by. -->
+        <sql>
+            select mime_string as MIME_TYPE,
+            parse_exception_description as EXCEPTION_TYPE, count(1) as COUNT
+            from parse_exceptions e
+            join profiles p on p.id=e.id
+            join containers c on p.container_id=c.container_id
+            join mimes m on m.mime_id=p.mime_id
+            join ref_parse_exception_types r on
+            r.parse_exception_id=e.parse_exception_id
+            group by mime_string, parse_exception_description
+            order by MIME_TYPE, EXCEPTION_TYPE
+        </sql>
+    </report>
+
+    <report reportName="StackTracesByMime"
+            reportFilename="exceptions/stack_traces_by_mime.xlsx"
+            format="xlsx"
+            includeSql="true">
+        <!-- Counts each sort_stack_trace per mime type, restricted to parse_exceptions rows whose parse_exception_id is 0. -->
+        <sql>
+            select mime_string as MIME_TYPE, e.sort_stack_trace,
+                count(1) as COUNT
+            from parse_exceptions e
+            join profiles p on e.id=p.id
+            join mimes m on p.mime_id=m.mime_id
+            where e.parse_exception_id=0
+            group by MIME_TYPE, e.sort_stack_trace
+            order by MIME_TYPE asc, COUNT desc
+        </sql>
+    </report>
+
+    <report reportName="AllStackTraces"
+            reportFilename="exceptions/stack_traces_details.xlsx"
+            format="xlsx"
+            includeSql="true">
+        <!-- Lists file, container length, mime type and stack traces for parse_exceptions rows whose parse_exception_id is 0. -->
+        <sql>
+            select file_path, file_name, is_embedded,
+                c.length as CONTAINER_LENGTH,
+                mime_string as MIME_TYPE,
+                orig_stack_trace,
+                sort_stack_trace
+            from parse_exceptions e
+            join profiles p on e.id=p.id
+            join containers c on c.container_id=p.container_id
+            join mimes m on p.mime_id=m.mime_id
+            where e.parse_exception_id=0
+            order by MIME_TYPE asc, sort_stack_trace, orig_stack_trace,
+                CONTAINER_LENGTH asc
+        </sql>
+    </report>
     <after>
 
         <!--<sql>drop index on x</sql>

-- 
To stop receiving notification emails like this one, please contact
"commits@tika.apache.org" <co...@tika.apache.org>.