You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2017/09/25 20:32:51 UTC

[tika] branch master updated (c54efd8 -> 2748538)

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git.


    from c54efd8  TIKA-2470 -- fix...add back namespace aware
     new 5d41096  prevent div by 0 exception in profile-reports.xml
     new 2748538  TIKA-2268 -- add more reports and fix div by 0 bug

The 2 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 tika-eval/src/main/resources/profile-reports.xml | 109 ++++++++++++++++++++++-
 1 file changed, 105 insertions(+), 4 deletions(-)

-- 
To stop receiving notification emails like this one, please contact
"commits@tika.apache.org" <co...@tika.apache.org>.

[tika] 01/02: prevent div by 0 exception in profile-reports.xml

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 5d410966d3de971adfc7f702e5878d7960433a75
Author: tballison <ta...@mitre.org>
AuthorDate: Mon Sep 25 09:03:41 2017 -0400

    prevent div by 0 exception in profile-reports.xml
---
 tika-eval/src/main/resources/profile-reports.xml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tika-eval/src/main/resources/profile-reports.xml b/tika-eval/src/main/resources/profile-reports.xml
index 0a7bb4d..75cdaf8 100644
--- a/tika-eval/src/main/resources/profile-reports.xml
+++ b/tika-eval/src/main/resources/profile-reports.xml
@@ -123,7 +123,8 @@
             from contents c
             join profiles p on p.id=c.id
             join containers ct on ct.container_id=p.container_id
-            where cast(num_common_tokens as decimal)/cast(num_alphabetic_tokens as decimal) &lt; 0.50
+            where cast(num_alphabetic_tokens as decimal) > 0.0
+                and cast(num_common_tokens as decimal)/cast(num_alphabetic_tokens as decimal) &lt; 0.50
             order by common_div_alphabetic asc
         </sql>
     </report>

-- 
To stop receiving notification emails like this one, please contact
"commits@tika.apache.org" <co...@tika.apache.org>.

[tika] 02/02: TIKA-2268 -- add more reports and fix div by 0 bug

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 2748538fbb66fa0bd9c0f5f18c87826260dcd227
Author: tballison <ta...@mitre.org>
AuthorDate: Mon Sep 25 16:32:20 2017 -0400

    TIKA-2268 -- add more reports and fix div by 0 bug
---
 tika-eval/src/main/resources/profile-reports.xml | 110 +++++++++++++++++++++--
 1 file changed, 105 insertions(+), 5 deletions(-)

diff --git a/tika-eval/src/main/resources/profile-reports.xml b/tika-eval/src/main/resources/profile-reports.xml
index 75cdaf8..d31606f 100644
--- a/tika-eval/src/main/resources/profile-reports.xml
+++ b/tika-eval/src/main/resources/profile-reports.xml
@@ -112,20 +112,63 @@
     </report>
 
     <report reportName="Common Tokens Divided by Alphabetic Tokens"
-            reportFilename="content/common_tokens_div_alphabetic.xlsx"
+            reportFilename="content/common_tokens_div_alphabetic_exclude_media_and_zips.xlsx"
             format="xlsx"
             includeSql="true">
         <!-- 0.50 is a complete heuristic -->
         <sql>
-            select file_path, lang_id_1, common_tokens_lang,
+            select file_path, file_name, is_embedded,
+            mime_string, lang_id_1, common_tokens_lang,
             num_tokens, num_alphabetic_tokens, num_common_tokens,
-            cast(num_common_tokens as decimal)/cast(num_alphabetic_tokens as decimal) common_div_alphabetic
+            case
+                when num_alphabetic_tokens &gt; 0
+                then cast(num_common_tokens as decimal)/cast(num_alphabetic_tokens as decimal)
+                else 0
+            end as common_div_alphabetic
             from contents c
             join profiles p on p.id=c.id
             join containers ct on ct.container_id=p.container_id
-            where cast(num_alphabetic_tokens as decimal) > 0.0
-                and cast(num_common_tokens as decimal)/cast(num_alphabetic_tokens as decimal) &lt; 0.50
+            join mimes m on p.mime_id=m.mime_id
+            where (num_alphabetic_tokens = 0
+                    or cast(num_common_tokens as decimal)/nullif(cast(num_alphabetic_tokens as decimal), 0) &lt; 0.50
+                )
+            and mime_string not like 'image%'
+            and mime_string not like 'video%'
+            and mime_string not like 'audio%'
+            and mime_string not like 'application/zip'
             order by common_div_alphabetic asc
+            limit 10000
+        </sql>
+        <!-- nullif(): OR is not guaranteed to short-circuit, so the divisor itself must never be 0; zero-alphabetic rows still match via the first test -->
+    </report>
+
+
+    <!-- MSWord files do not usually store actual # of pages; rather, they store 1 or 0,
+         and the actual number is calculated dynamically by the application when the file
+         is loaded.  This will lead to some crazily high tokens/page counts for MSWord files,
+         but the focus of this query is on the low end.  The division below cannot divide by 0
+         because the where clause keeps only rows with num_pages > 0. -->
+    <report reportName="Tokens Per Page"
+            reportFilename="content/tokens_per_page_in_container_files.xlsx"
+            format="xlsx"
+            includeSql="true">
+        <sql>
+            select file_path, mime_string, num_tokens,
+            num_pages,
+            case
+                when num_tokens = 0
+                    then 0
+                else
+                    cast(num_tokens as decimal)/cast(num_pages as decimal)
+            end as num_tokens_div_num_pages
+            from profiles p
+            left join contents c on p.id=c.id
+            join mimes m on p.mime_id = m.mime_id
+            join containers ct on p.container_id=ct.container_id
+            where num_pages is not null and num_pages &gt; 0
+            and is_embedded=false
+            order by num_tokens_div_num_pages asc
+            limit 1000
         </sql>
     </report>
 
@@ -160,6 +203,63 @@
             order by cnt desc;
         </sql>
     </report>
+
+    <report reportName="AllExceptionsByMimeByType"
+            reportFilename="exceptions/exceptions_by_mime_by_type.xlsx"
+            format="xlsx"
+            includeSql="true">
+        <!-- Counts parse exceptions per (mime type, exception description) pair; grouped by the selected mime_string so every non-aggregated column is in the group by. -->
+        <sql>
+            select mime_string as MIME_TYPE,
+            parse_exception_description as EXCEPTION_TYPE, count(1) as COUNT
+            from parse_exceptions e
+            join profiles p on p.id=e.id
+            join containers c on p.container_id=c.container_id
+            join mimes m on m.mime_id=p.mime_id
+            join ref_parse_exception_types r on
+            r.parse_exception_id=e.parse_exception_id
+            group by mime_string, parse_exception_description
+            order by MIME_TYPE, EXCEPTION_TYPE
+        </sql>
+    </report>
+
+    <report reportName="StackTracesByMime"
+            reportFilename="exceptions/stack_traces_by_mime.xlsx"
+            format="xlsx"
+            includeSql="true">
+        <!-- Counts each sort_stack_trace per mime type for parse_exception_id 0 only. NOTE(review): which exception type id 0 denotes is not visible here; confirm against ref_parse_exception_types. -->
+        <sql>
+            select MIME_STRING as MIME_TYPE, e.sort_stack_trace, count(1) as
+            COUNT
+            from parse_exceptions e
+            join profiles p on p.id=e.id
+            join mimes m on m.mime_id=p.mime_id
+            where e.parse_exception_id=0
+            group by MIME_STRING, e.sort_stack_trace
+            order by MIME_TYPE asc, COUNT desc
+        </sql>
+    </report>
+
+    <report reportName="AllStackTraces"
+            reportFilename="exceptions/stack_traces_details.xlsx"
+            format="xlsx"
+            includeSql="true">
+        <!-- Lists file, container length, mime type and both stack trace columns for parse_exception_id 0 only. NOTE(review): which exception type id 0 denotes is not visible here; confirm against ref_parse_exception_types. -->
+        <sql>
+            select file_path,
+            file_name, is_embedded,
+            c.length as CONTAINER_LENGTH,
+            mime_string as MIME_TYPE,
+            orig_stack_trace, sort_stack_trace
+            from parse_exceptions e
+            join profiles p on p.id=e.id
+            join containers c on p.container_id=c.container_id
+            join mimes m on m.mime_id=p.mime_id
+            where e.parse_exception_id=0
+            order by MIME_TYPE asc, sort_stack_trace, orig_stack_trace,
+            CONTAINER_LENGTH asc
+        </sql>
+    </report>
     <after>
 
         <!--<sql>drop index on x</sql>

-- 
To stop receiving notification emails like this one, please contact
"commits@tika.apache.org" <co...@tika.apache.org>.