You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2022/05/27 15:02:49 UTC
[tika] branch main updated: TIKA-3775 -- add embedded depth to profiles tables in tika-eval
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 25b18116c TIKA-3775 -- add embedded depth to profiles tables in tika-eval
25b18116c is described below
commit 25b18116cbffb5eedb9180d25d12473f5e57bea7
Author: tallison <ta...@apache.org>
AuthorDate: Fri May 27 11:02:36 2022 -0400
TIKA-3775 -- add embedded depth to profiles tables in tika-eval
---
CHANGES.txt | 2 ++
.../src/main/java/org/apache/tika/eval/app/AbstractProfiler.java | 5 +++++
.../src/main/java/org/apache/tika/eval/app/ExtractComparer.java | 1 +
.../src/main/java/org/apache/tika/eval/app/ExtractProfiler.java | 1 +
.../src/main/java/org/apache/tika/eval/app/db/Cols.java | 1 +
.../test/resources/test-dirs/extractsA/file14_diffAttachOrder.json | 7 +++++--
.../test/resources/test-dirs/extractsA/file2_attachANotB.doc.json | 3 ++-
.../test/resources/test-dirs/extractsB/file14_diffAttachOrder.json | 7 +++++--
.../test/resources/test-dirs/extractsB/file3_attachBNotA.doc.json | 3 ++-
9 files changed, 24 insertions(+), 6 deletions(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index 3118f1d38..8dc5fdf65 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,7 @@
Release 2.4.1 - ???
+ * Add embedded depth to profiles tables in tika-eval (TIKA-3775).
+
* Add stop() method to TikaServerCli so that it can be run
with Apache Commons Daemon (TIKA-1570).
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/AbstractProfiler.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/AbstractProfiler.java
index 6d4ac1f2f..2397bbcab 100644
--- a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/AbstractProfiler.java
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/AbstractProfiler.java
@@ -75,6 +75,7 @@ import org.apache.tika.metadata.PagedText;
import org.apache.tika.metadata.Property;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.sax.ToXMLContentHandler;
+import org.apache.tika.utils.StringUtils;
public abstract class AbstractProfiler extends FileResourceConsumer {
@@ -391,9 +392,13 @@ public abstract class AbstractProfiler extends FileResourceConsumer {
if (i == 0) {
data.put(Cols.IS_EMBEDDED, FALSE);
data.put(Cols.FILE_NAME, fps.getRelativeSourceFilePath().getFileName().toString());
+ data.put(Cols.EMBEDDED_DEPTH, "0");
} else {
data.put(Cols.IS_EMBEDDED, TRUE);
data.put(Cols.FILE_NAME, getFileName(m.get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH)));
+ if (!StringUtils.isBlank(m.get(TikaCoreProperties.EMBEDDED_DEPTH))) {
+ data.put(Cols.EMBEDDED_DEPTH, m.get(TikaCoreProperties.EMBEDDED_DEPTH));
+ }
}
String ext = FilenameUtils.getExtension(data.get(Cols.FILE_NAME));
ext = (ext == null) ? "" : ext.toLowerCase(Locale.US);
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractComparer.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractComparer.java
index ee12e4781..dbab7d476 100644
--- a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractComparer.java
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractComparer.java
@@ -197,6 +197,7 @@ public class ExtractComparer extends AbstractProfiler {
try {
metadataListA = extractReader.loadExtract(fpsA.getExtractFile());
} catch (ExtractReaderException e) {
+ e.printStackTrace();
extractExceptionA = e.getType();
}
}
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractProfiler.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractProfiler.java
index 5c041c6ff..ad0ce0bac 100644
--- a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractProfiler.java
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractProfiler.java
@@ -63,6 +63,7 @@ public class ExtractProfiler extends AbstractProfiler {
new ColInfo(Cols.FILE_NAME, Types.VARCHAR, 256),
new ColInfo(Cols.MD5, Types.CHAR, 32), new ColInfo(Cols.LENGTH, Types.BIGINT),
new ColInfo(Cols.IS_EMBEDDED, Types.BOOLEAN),
+ new ColInfo(Cols.EMBEDDED_DEPTH, Types.INTEGER),
new ColInfo(Cols.FILE_EXTENSION, Types.VARCHAR, 12),
new ColInfo(Cols.MIME_ID, Types.INTEGER),
new ColInfo(Cols.ELAPSED_TIME_MILLIS, Types.INTEGER),
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/db/Cols.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/db/Cols.java
index bb704b974..b6f617ce6 100644
--- a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/db/Cols.java
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/db/Cols.java
@@ -26,6 +26,7 @@ public enum Cols {
//profile table
ID, LENGTH, FILE_NAME, FILE_EXTENSION, ELAPSED_TIME_MILLIS, NUM_METADATA_VALUES, IS_EMBEDDED,
EMBEDDED_FILE_PATH, MIME_ID, TIKA_MIME_ID, FILE_MIME_ID, SHA256, MD5, NUM_ATTACHMENTS,
+ EMBEDDED_DEPTH,
HAS_CONTENT,
//content
diff --git a/tika-eval/tika-eval-app/src/test/resources/test-dirs/extractsA/file14_diffAttachOrder.json b/tika-eval/tika-eval-app/src/test/resources/test-dirs/extractsA/file14_diffAttachOrder.json
index 7801ab84b..2888c2576 100644
--- a/tika-eval/tika-eval-app/src/test/resources/test-dirs/extractsA/file14_diffAttachOrder.json
+++ b/tika-eval/tika-eval-app/src/test/resources/test-dirs/extractsA/file14_diffAttachOrder.json
@@ -8,12 +8,15 @@
"Content-Type": "text/plain",
"X-TIKA:embedded_resource_path": "/0",
"X-TIKA:content": "a b c d e f g h i j k l m n",
- "X-TIKA:digest:MD5": "471d98383e9f40444e5ecf821f2c8354"
+ "X-TIKA:digest:MD5": "471d98383e9f40444e5ecf821f2c8354",
+ "X-TIKA:embedded_depth": "1"
+
},
{
"Content-Type": "text/plain",
"X-TIKA:embedded_resource_path": "/1",
"X-TIKA:content": "o p q r s t u v w x y z",
- "X-TIKA:digest:MD5": "471d98383e9f40444e5ecf821f2c8353"
+ "X-TIKA:digest:MD5": "471d98383e9f40444e5ecf821f2c8353",
+ "X-TIKA:embedded_depth": "1"
}
]
\ No newline at end of file
diff --git a/tika-eval/tika-eval-app/src/test/resources/test-dirs/extractsA/file2_attachANotB.doc.json b/tika-eval/tika-eval-app/src/test/resources/test-dirs/extractsA/file2_attachANotB.doc.json
index 5371c87ed..9120bb393 100644
--- a/tika-eval/tika-eval-app/src/test/resources/test-dirs/extractsA/file2_attachANotB.doc.json
+++ b/tika-eval/tika-eval-app/src/test/resources/test-dirs/extractsA/file2_attachANotB.doc.json
@@ -5,6 +5,7 @@
{
"Content-Type":"text/plain",
"X-TIKA:embedded_resource_path":"inner.txt",
- "X-TIKA:content":"attachment contents"
+ "X-TIKA:content":"attachment contents",
+ "X-TIKA:embedded_depth": "1"
}
]
\ No newline at end of file
diff --git a/tika-eval/tika-eval-app/src/test/resources/test-dirs/extractsB/file14_diffAttachOrder.json b/tika-eval/tika-eval-app/src/test/resources/test-dirs/extractsB/file14_diffAttachOrder.json
index e28bace31..9223769f7 100644
--- a/tika-eval/tika-eval-app/src/test/resources/test-dirs/extractsB/file14_diffAttachOrder.json
+++ b/tika-eval/tika-eval-app/src/test/resources/test-dirs/extractsB/file14_diffAttachOrder.json
@@ -8,12 +8,15 @@
"Content-Type": "text/plain",
"X-TIKA:embedded_resource_path": "inner2.txt",
"X-TIKA:content": "o p q r s t u v w x y z",
- "X-TIKA:digest:MD5": "471d98383e9f40444e5ecf821f2c8353"
+ "X-TIKA:digest:MD5": "471d98383e9f40444e5ecf821f2c8353",
+ "X-TIKA:embedded_depth": "1"
+
},
{
"Content-Type": "text/plain",
"X-TIKA:embedded_resource_path": "inner1.txt",
"X-TIKA:content": "a b c d e f g h i j k l m n",
- "X-TIKA:digest:MD5": "471d98383e9f40444e5ecf821f2c8354"
+ "X-TIKA:digest:MD5": "471d98383e9f40444e5ecf821f2c8354",
+ "X-TIKA:embedded_depth": "1"
}
]
\ No newline at end of file
diff --git a/tika-eval/tika-eval-app/src/test/resources/test-dirs/extractsB/file3_attachBNotA.doc.json b/tika-eval/tika-eval-app/src/test/resources/test-dirs/extractsB/file3_attachBNotA.doc.json
index 5371c87ed..9120bb393 100644
--- a/tika-eval/tika-eval-app/src/test/resources/test-dirs/extractsB/file3_attachBNotA.doc.json
+++ b/tika-eval/tika-eval-app/src/test/resources/test-dirs/extractsB/file3_attachBNotA.doc.json
@@ -5,6 +5,7 @@
{
"Content-Type":"text/plain",
"X-TIKA:embedded_resource_path":"inner.txt",
- "X-TIKA:content":"attachment contents"
+ "X-TIKA:content":"attachment contents",
+ "X-TIKA:embedded_depth": "1"
}
]
\ No newline at end of file