You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2024/02/28 14:26:35 UTC

(tika) branch TIKA-4205 created (now 3d4c94308)

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a change to branch TIKA-4205
in repository https://gitbox.apache.org/repos/asf/tika.git


      at 3d4c94308 TIKA-4205 -- fix dependencies in tika-eval-app and add a few more columns to the ExtractProfiler

This branch includes the following new commits:

     new 3d4c94308 TIKA-4205 -- fix dependencies in tika-eval-app and add a few more columns to the ExtractProfiler

The 1 revision listed above as "new" is entirely new to this
repository and will be described in a separate email.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.



(tika) 01/01: TIKA-4205 -- fix dependencies in tika-eval-app and add a few more columns to the ExtractProfiler

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-4205
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 3d4c94308d5b64ecc7306850b0ec935e615e7c6f
Author: tallison <ta...@apache.org>
AuthorDate: Wed Feb 28 09:26:21 2024 -0500

    TIKA-4205 -- fix dependencies in tika-eval-app and add a few more columns to the ExtractProfiler
---
 tika-eval/tika-eval-app/pom.xml                         |  2 --
 .../java/org/apache/tika/eval/app/AbstractProfiler.java | 17 ++++++++++++++++-
 .../java/org/apache/tika/eval/app/ExtractProfiler.java  |  4 ++++
 .../src/main/java/org/apache/tika/eval/app/db/Cols.java |  3 +++
 4 files changed, 23 insertions(+), 3 deletions(-)

diff --git a/tika-eval/tika-eval-app/pom.xml b/tika-eval/tika-eval-app/pom.xml
index 18671052c..b93783f75 100644
--- a/tika-eval/tika-eval-app/pom.xml
+++ b/tika-eval/tika-eval-app/pom.xml
@@ -93,10 +93,8 @@
               </createDependencyReducedPom>
               <artifactSet>
                 <excludes>
-                  <exclude>org.apache.tika:tika-core:jar:</exclude>
                   <exclude>org.apache.tika:tika-serialization:jar:</exclude>
                   <exclude>org.apache.tika:tika-langdetect-opennlp:jar:</exclude>
-                  <exclude>commons-io:commons-io:jar:</exclude>
                   <exclude>commons-codec:commons-codec:jar:</exclude>
                   <exclude>org.apache.commons:commons-lang3:jar:</exclude>
                   <exclude>org.apache.commons:commons-math3:jar:</exclude>
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/AbstractProfiler.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/AbstractProfiler.java
index 2397bbcab..0cd609d3b 100644
--- a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/AbstractProfiler.java
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/AbstractProfiler.java
@@ -71,6 +71,7 @@ import org.apache.tika.eval.core.util.EvalExceptionUtils;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.language.detect.LanguageResult;
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.PDF;
 import org.apache.tika.metadata.PagedText;
 import org.apache.tika.metadata.Property;
 import org.apache.tika.metadata.TikaCoreProperties;
@@ -387,6 +388,10 @@ public abstract class AbstractProfiler extends FileResourceConsumer {
         if (nPages != null) {
             data.put(Cols.NUM_PAGES, Integer.toString(nPages));
         }
+        Integer nOCRPages = m.getInt(PDF.OCR_PAGE_COUNT);
+        if (nOCRPages != null) {
+            data.put(Cols.NUM_OCR_PAGES, Integer.toString(nOCRPages));
+        }
 
         //if the outer wrapper document
         if (i == 0) {
@@ -395,10 +400,17 @@ public abstract class AbstractProfiler extends FileResourceConsumer {
             data.put(Cols.EMBEDDED_DEPTH, "0");
         } else {
             data.put(Cols.IS_EMBEDDED, TRUE);
-            data.put(Cols.FILE_NAME, getFileName(m.get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH)));
+            String embeddedFilePath = m.get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH);
+            if (!StringUtils.isBlank(embeddedFilePath)) { //use the path value, not a key lookup
+                data.put(Cols.FILE_NAME, getFileName(embeddedFilePath));
+                data.put(Cols.EMBEDDED_FILE_PATH, embeddedFilePath);
+            }
             if (!StringUtils.isBlank(m.get(TikaCoreProperties.EMBEDDED_DEPTH))) {
                 data.put(Cols.EMBEDDED_DEPTH, m.get(TikaCoreProperties.EMBEDDED_DEPTH));
             }
+            if (!StringUtils.isBlank(m.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE))) {
+                data.put(Cols.ATTACHMENT_TYPE, m.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE));
+            }
         }
         String ext = FilenameUtils.getExtension(data.get(Cols.FILE_NAME));
         ext = (ext == null) ? "" : ext.toLowerCase(Locale.US);
@@ -486,6 +498,8 @@ public abstract class AbstractProfiler extends FileResourceConsumer {
                     Integer.toString(commonTokenResult.getUniqueAlphabeticTokens()));
             data.put(Cols.NUM_ALPHABETIC_TOKENS,
                     Integer.toString(commonTokenResult.getAlphabeticTokens()));
+            double oov = commonTokenResult.getAlphabeticTokens() > 0 ? commonTokenResult.getOOV() : -1.0;
+            data.put(Cols.OOV, Double.toString(oov));
         }
         TokenCounts tokenCounts = (TokenCounts) textStats.get(BasicTokenCountStatsCalculator.class);
         if (tokenCounts != null) {
@@ -498,6 +512,7 @@ public abstract class AbstractProfiler extends FileResourceConsumer {
                     Double.toString((Double) textStats.get(TokenEntropy.class)));
         }
 
+
         SummaryStatistics summStats = (SummaryStatistics) textStats.get(TokenLengths.class);
         if (summStats != null) {
             data.put(Cols.TOKEN_LENGTH_SUM, Integer.toString((int) summStats.getSum()));
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractProfiler.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractProfiler.java
index ad0ce0bac..4e7d45088 100644
--- a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractProfiler.java
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractProfiler.java
@@ -64,12 +64,15 @@ public class ExtractProfiler extends AbstractProfiler {
                     new ColInfo(Cols.MD5, Types.CHAR, 32), new ColInfo(Cols.LENGTH, Types.BIGINT),
                     new ColInfo(Cols.IS_EMBEDDED, Types.BOOLEAN),
                     new ColInfo(Cols.EMBEDDED_DEPTH, Types.INTEGER),
+                    new ColInfo(Cols.EMBEDDED_FILE_PATH, Types.VARCHAR, 1024),
+                    new ColInfo(Cols.ATTACHMENT_TYPE, Types.VARCHAR, 32),
                     new ColInfo(Cols.FILE_EXTENSION, Types.VARCHAR, 12),
                     new ColInfo(Cols.MIME_ID, Types.INTEGER),
                     new ColInfo(Cols.ELAPSED_TIME_MILLIS, Types.INTEGER),
                     new ColInfo(Cols.NUM_ATTACHMENTS, Types.INTEGER),
                     new ColInfo(Cols.NUM_METADATA_VALUES, Types.INTEGER),
                     new ColInfo(Cols.NUM_PAGES, Types.INTEGER),
+                    new ColInfo(Cols.NUM_OCR_PAGES, Types.INTEGER),
                     new ColInfo(Cols.HAS_CONTENT, Types.BOOLEAN));
     public static TableInfo EMBEDDED_FILE_PATH_TABLE =
             new TableInfo("emb_file_names", new ColInfo(Cols.ID, Types.INTEGER, "PRIMARY KEY"),
@@ -84,6 +87,7 @@ public class ExtractProfiler extends AbstractProfiler {
                     new ColInfo(Cols.NUM_COMMON_TOKENS, Types.INTEGER),
                     new ColInfo(Cols.NUM_UNIQUE_ALPHABETIC_TOKENS, Types.INTEGER),
                     new ColInfo(Cols.NUM_ALPHABETIC_TOKENS, Types.INTEGER),
+                    new ColInfo(Cols.OOV, Types.DOUBLE),
                     new ColInfo(Cols.TOP_N_TOKENS, Types.VARCHAR, 1024),
                     new ColInfo(Cols.LANG_ID_1, Types.VARCHAR, 12),
                     new ColInfo(Cols.LANG_ID_PROB_1, Types.FLOAT),
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/db/Cols.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/db/Cols.java
index b6f617ce6..35d70b430 100644
--- a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/db/Cols.java
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/db/Cols.java
@@ -26,6 +26,7 @@ public enum Cols {
     //profile table
     ID, LENGTH, FILE_NAME, FILE_EXTENSION, ELAPSED_TIME_MILLIS, NUM_METADATA_VALUES, IS_EMBEDDED,
     EMBEDDED_FILE_PATH, MIME_ID, TIKA_MIME_ID, FILE_MIME_ID, SHA256, MD5, NUM_ATTACHMENTS,
+    ATTACHMENT_TYPE,
     EMBEDDED_DEPTH,
     HAS_CONTENT,
 
@@ -34,8 +35,10 @@ public enum Cols {
     NUM_ALPHABETIC_TOKENS, //alphabetic or ideographic tokens
     COMMON_TOKENS_LANG, //which language was used for the common tokens metric?
     NUM_UNIQUE_COMMON_TOKENS, NUM_COMMON_TOKENS, TOP_N_TOKENS, LANG_ID_1, LANG_ID_PROB_1, LANG_ID_2,
+    OOV,
     LANG_ID_PROB_2, TOKEN_ENTROPY_RATE, TOKEN_LENGTH_SUM, TOKEN_LENGTH_MEAN, TOKEN_LENGTH_STD_DEV,
     UNICODE_CHAR_BLOCKS, NUM_PAGES, //number of pages a document alleges it has
+    NUM_OCR_PAGES,
     CONTENT_TRUNCATED_AT_MAX_LEN, // was the string truncated at AbstractProfiler.MAX_STRING_LENGTH
 
     //content comparisons