You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2022/05/27 15:02:49 UTC

[tika] branch main updated: TIKA-3775 -- add embedded depth to profiles tables in tika-eval

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new 25b18116c TIKA-3775 -- add embedded depth to profiles tables in tika-eval
25b18116c is described below

commit 25b18116cbffb5eedb9180d25d12473f5e57bea7
Author: tallison <ta...@apache.org>
AuthorDate: Fri May 27 11:02:36 2022 -0400

    TIKA-3775 -- add embedded depth to profiles tables in tika-eval
---
 CHANGES.txt                                                        | 2 ++
 .../src/main/java/org/apache/tika/eval/app/AbstractProfiler.java   | 5 +++++
 .../src/main/java/org/apache/tika/eval/app/ExtractComparer.java    | 1 +
 .../src/main/java/org/apache/tika/eval/app/ExtractProfiler.java    | 1 +
 .../src/main/java/org/apache/tika/eval/app/db/Cols.java            | 1 +
 .../test/resources/test-dirs/extractsA/file14_diffAttachOrder.json | 7 +++++--
 .../test/resources/test-dirs/extractsA/file2_attachANotB.doc.json  | 3 ++-
 .../test/resources/test-dirs/extractsB/file14_diffAttachOrder.json | 7 +++++--
 .../test/resources/test-dirs/extractsB/file3_attachBNotA.doc.json  | 3 ++-
 9 files changed, 24 insertions(+), 6 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index 3118f1d38..8dc5fdf65 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,7 @@
 Release 2.4.1 - ???
 
+   * Add embedded depth to profiles tables in tika-eval (TIKA-3775).
+
    * Add stop() method to TikaServerCli so that it can be run
      with Apache Commons Daemon (TIKA-1570).
 
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/AbstractProfiler.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/AbstractProfiler.java
index 6d4ac1f2f..2397bbcab 100644
--- a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/AbstractProfiler.java
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/AbstractProfiler.java
@@ -75,6 +75,7 @@ import org.apache.tika.metadata.PagedText;
 import org.apache.tika.metadata.Property;
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.sax.ToXMLContentHandler;
+import org.apache.tika.utils.StringUtils;
 
 public abstract class AbstractProfiler extends FileResourceConsumer {
 
@@ -391,9 +392,13 @@ public abstract class AbstractProfiler extends FileResourceConsumer {
         if (i == 0) {
             data.put(Cols.IS_EMBEDDED, FALSE);
             data.put(Cols.FILE_NAME, fps.getRelativeSourceFilePath().getFileName().toString());
+            data.put(Cols.EMBEDDED_DEPTH, "0");
         } else {
             data.put(Cols.IS_EMBEDDED, TRUE);
             data.put(Cols.FILE_NAME, getFileName(m.get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH)));
+            if (!StringUtils.isBlank(m.get(TikaCoreProperties.EMBEDDED_DEPTH))) {
+                data.put(Cols.EMBEDDED_DEPTH, m.get(TikaCoreProperties.EMBEDDED_DEPTH));
+            }
         }
         String ext = FilenameUtils.getExtension(data.get(Cols.FILE_NAME));
         ext = (ext == null) ? "" : ext.toLowerCase(Locale.US);
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractComparer.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractComparer.java
index ee12e4781..dbab7d476 100644
--- a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractComparer.java
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractComparer.java
@@ -197,6 +197,7 @@ public class ExtractComparer extends AbstractProfiler {
             try {
                 metadataListA = extractReader.loadExtract(fpsA.getExtractFile());
             } catch (ExtractReaderException e) {
+                e.printStackTrace();
                 extractExceptionA = e.getType();
             }
         }
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractProfiler.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractProfiler.java
index 5c041c6ff..ad0ce0bac 100644
--- a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractProfiler.java
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractProfiler.java
@@ -63,6 +63,7 @@ public class ExtractProfiler extends AbstractProfiler {
                     new ColInfo(Cols.FILE_NAME, Types.VARCHAR, 256),
                     new ColInfo(Cols.MD5, Types.CHAR, 32), new ColInfo(Cols.LENGTH, Types.BIGINT),
                     new ColInfo(Cols.IS_EMBEDDED, Types.BOOLEAN),
+                    new ColInfo(Cols.EMBEDDED_DEPTH, Types.INTEGER),
                     new ColInfo(Cols.FILE_EXTENSION, Types.VARCHAR, 12),
                     new ColInfo(Cols.MIME_ID, Types.INTEGER),
                     new ColInfo(Cols.ELAPSED_TIME_MILLIS, Types.INTEGER),
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/db/Cols.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/db/Cols.java
index bb704b974..b6f617ce6 100644
--- a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/db/Cols.java
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/db/Cols.java
@@ -26,6 +26,7 @@ public enum Cols {
     //profile table
     ID, LENGTH, FILE_NAME, FILE_EXTENSION, ELAPSED_TIME_MILLIS, NUM_METADATA_VALUES, IS_EMBEDDED,
     EMBEDDED_FILE_PATH, MIME_ID, TIKA_MIME_ID, FILE_MIME_ID, SHA256, MD5, NUM_ATTACHMENTS,
+    EMBEDDED_DEPTH,
     HAS_CONTENT,
 
     //content
diff --git a/tika-eval/tika-eval-app/src/test/resources/test-dirs/extractsA/file14_diffAttachOrder.json b/tika-eval/tika-eval-app/src/test/resources/test-dirs/extractsA/file14_diffAttachOrder.json
index 7801ab84b..2888c2576 100644
--- a/tika-eval/tika-eval-app/src/test/resources/test-dirs/extractsA/file14_diffAttachOrder.json
+++ b/tika-eval/tika-eval-app/src/test/resources/test-dirs/extractsA/file14_diffAttachOrder.json
@@ -8,12 +8,15 @@
     "Content-Type": "text/plain",
     "X-TIKA:embedded_resource_path": "/0",
     "X-TIKA:content": "a b c d e f g h i j k l m n",
-    "X-TIKA:digest:MD5": "471d98383e9f40444e5ecf821f2c8354"
+    "X-TIKA:digest:MD5": "471d98383e9f40444e5ecf821f2c8354",
+    "X-TIKA:embedded_depth": "1"
+
   },
   {
     "Content-Type": "text/plain",
     "X-TIKA:embedded_resource_path": "/1",
     "X-TIKA:content": "o p q r s t u v w x y z",
-    "X-TIKA:digest:MD5": "471d98383e9f40444e5ecf821f2c8353"
+    "X-TIKA:digest:MD5": "471d98383e9f40444e5ecf821f2c8353",
+    "X-TIKA:embedded_depth": "1"
   }
 ]
\ No newline at end of file
diff --git a/tika-eval/tika-eval-app/src/test/resources/test-dirs/extractsA/file2_attachANotB.doc.json b/tika-eval/tika-eval-app/src/test/resources/test-dirs/extractsA/file2_attachANotB.doc.json
index 5371c87ed..9120bb393 100644
--- a/tika-eval/tika-eval-app/src/test/resources/test-dirs/extractsA/file2_attachANotB.doc.json
+++ b/tika-eval/tika-eval-app/src/test/resources/test-dirs/extractsA/file2_attachANotB.doc.json
@@ -5,6 +5,7 @@
   {
     "Content-Type":"text/plain",
     "X-TIKA:embedded_resource_path":"inner.txt",
-    "X-TIKA:content":"attachment contents"
+    "X-TIKA:content":"attachment contents",
+    "X-TIKA:embedded_depth": "1"
   }
 ]
\ No newline at end of file
diff --git a/tika-eval/tika-eval-app/src/test/resources/test-dirs/extractsB/file14_diffAttachOrder.json b/tika-eval/tika-eval-app/src/test/resources/test-dirs/extractsB/file14_diffAttachOrder.json
index e28bace31..9223769f7 100644
--- a/tika-eval/tika-eval-app/src/test/resources/test-dirs/extractsB/file14_diffAttachOrder.json
+++ b/tika-eval/tika-eval-app/src/test/resources/test-dirs/extractsB/file14_diffAttachOrder.json
@@ -8,12 +8,15 @@
     "Content-Type": "text/plain",
     "X-TIKA:embedded_resource_path": "inner2.txt",
     "X-TIKA:content": "o p q r s t u v w x y z",
-    "X-TIKA:digest:MD5": "471d98383e9f40444e5ecf821f2c8353"
+    "X-TIKA:digest:MD5": "471d98383e9f40444e5ecf821f2c8353",
+    "X-TIKA:embedded_depth": "1"
+
   },
   {
     "Content-Type": "text/plain",
     "X-TIKA:embedded_resource_path": "inner1.txt",
     "X-TIKA:content": "a b c d e f g h i j k l m n",
-    "X-TIKA:digest:MD5": "471d98383e9f40444e5ecf821f2c8354"
+    "X-TIKA:digest:MD5": "471d98383e9f40444e5ecf821f2c8354",
+    "X-TIKA:embedded_depth": "1"
   }
 ]
\ No newline at end of file
diff --git a/tika-eval/tika-eval-app/src/test/resources/test-dirs/extractsB/file3_attachBNotA.doc.json b/tika-eval/tika-eval-app/src/test/resources/test-dirs/extractsB/file3_attachBNotA.doc.json
index 5371c87ed..9120bb393 100644
--- a/tika-eval/tika-eval-app/src/test/resources/test-dirs/extractsB/file3_attachBNotA.doc.json
+++ b/tika-eval/tika-eval-app/src/test/resources/test-dirs/extractsB/file3_attachBNotA.doc.json
@@ -5,6 +5,7 @@
   {
     "Content-Type":"text/plain",
     "X-TIKA:embedded_resource_path":"inner.txt",
-    "X-TIKA:content":"attachment contents"
+    "X-TIKA:content":"attachment contents",
+    "X-TIKA:embedded_depth": "1"
   }
 ]
\ No newline at end of file