You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2022/12/02 20:58:25 UTC
[tika] branch main updated: Merge pull request #834
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new fb7f29915 Merge pull request #834
fb7f29915 is described below
commit fb7f29915b367081c053eb5920550ee90a0afa37
Author: Tim Allison <ta...@apache.org>
AuthorDate: Fri Dec 2 15:58:19 2022 -0500
Merge pull request #834
* TIKA-3942 -- ensure unique embedded paths
---
CHANGES.txt | 2 ++
.../apache/tika/metadata/TikaCoreProperties.java | 22 ++++++++++++++++++++++
.../apache/tika/parser/RecursiveParserWrapper.java | 20 +++++++++++++++-----
.../tika/parser/RecursiveParserWrapperTest.java | 12 ++++++++----
.../tika/parser/microsoft/rtf/RTFParserTest.java | 5 +++--
5 files changed, 50 insertions(+), 11 deletions(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index 630b3c3f0..46ecddedc 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,7 @@
Release 2.6.1 - ???
+ * Add X-TIKA:embedded_id_path to ensure unique embedded file paths (TIKA-3942).
+
* Fix bug that prevented digests when the fallback/EmptyParser
was called (TIKA-3939).
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
index 8ba1834f4..e49144354 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
@@ -50,8 +50,30 @@ public interface TikaCoreProperties {
*/
String TIKA_META_PREFIX = "X-TIKA" + NAMESPACE_PREFIX_DELIMITER;
Property EMBEDDED_DEPTH = Property.internalInteger(TIKA_META_PREFIX + "embedded_depth");
+
+ /**
+ * This tracks the embedded file paths based on the name of embedded files
+ * where available. There is a small risk that there may be path collisions
+ * and that these paths may not be unique within a file.
+ *
+ * For a more robust path, see {@link TikaCoreProperties#EMBEDDED_ID_PATH}.
+ */
Property EMBEDDED_RESOURCE_PATH =
Property.internalText(TIKA_META_PREFIX + "embedded_resource_path");
+
+ /**
+ * This tracks the embedded file paths based on the embedded file's
+ * {@link TikaCoreProperties#EMBEDDED_ID}.
+ */
+ Property EMBEDDED_ID_PATH =
+ Property.internalText(TIKA_META_PREFIX + "embedded_id_path");
+
+ /**
+ * This is a 1-indexed counter for embedded files, used by the RecursiveParserWrapper
+ */
+ Property EMBEDDED_ID =
+ Property.internalInteger(TIKA_META_PREFIX + "embedded_id");
+
Property PARSE_TIME_MILLIS = Property.internalText(TIKA_META_PREFIX + "parse_time_millis");
/**
* Simple class name of the content handler
diff --git a/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java b/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
index 79ff4c379..101aa3395 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
@@ -137,7 +137,7 @@ public class RecursiveParserWrapper extends ParserDecorator {
"ContentHandler must implement RecursiveParserWrapperHandler");
}
EmbeddedParserDecorator decorator =
- new EmbeddedParserDecorator(getWrappedParser(), "/", parserState);
+ new EmbeddedParserDecorator(getWrappedParser(), "/", "/", parserState);
context.set(Parser.class, decorator);
ContentHandler localHandler =
parserState.recursiveParserWrapperHandler.getNewContentHandler();
@@ -204,13 +204,17 @@ public class RecursiveParserWrapper extends ParserDecorator {
private final ParserState parserState;
private String location = null;
+ private String embeddedIdPath = null;
- private EmbeddedParserDecorator(Parser parser, String location, ParserState parseState) {
+
+ private EmbeddedParserDecorator(Parser parser, String location,
+ String embeddedIdPath, ParserState parseState) {
super(parser);
this.location = location;
if (!this.location.endsWith("/")) {
this.location += "/";
}
+ this.embeddedIdPath = embeddedIdPath;
this.parserState = parseState;
}
@@ -227,7 +231,12 @@ public class RecursiveParserWrapper extends ParserDecorator {
metadata.add(TikaCoreProperties.EMBEDDED_RESOURCE_PATH, objectLocation);
-
+ String idPath =
+ this.embeddedIdPath.equals("/") ?
+ this.embeddedIdPath + ++parserState.embeddedCount :
+ this.embeddedIdPath + "/" + ++parserState.embeddedCount;
+ metadata.add(TikaCoreProperties.EMBEDDED_ID_PATH, idPath);
+ metadata.set(TikaCoreProperties.EMBEDDED_ID, parserState.embeddedCount);
//get a fresh handler
ContentHandler localHandler =
parserState.recursiveParserWrapperHandler.getNewContentHandler();
@@ -235,7 +244,8 @@ public class RecursiveParserWrapper extends ParserDecorator {
Parser preContextParser = context.get(Parser.class);
context.set(Parser.class,
- new EmbeddedParserDecorator(getWrappedParser(), objectLocation, parserState));
+ new EmbeddedParserDecorator(getWrappedParser(), objectLocation,
+ idPath, parserState));
long started = System.currentTimeMillis();
RecursivelySecureContentHandler secureContentHandler =
context.get(RecursivelySecureContentHandler.class);
@@ -288,7 +298,7 @@ public class RecursiveParserWrapper extends ParserDecorator {
private static class ParserState {
private final AbstractRecursiveParserWrapperHandler recursiveParserWrapperHandler;
private int unknownCount = 0;
-
+ private int embeddedCount = 0;//this is effectively 1-indexed
private ParserState(AbstractRecursiveParserWrapperHandler handler) {
this.recursiveParserWrapperHandler = handler;
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
index 03461d5f3..28b22a29a 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
@@ -86,7 +86,6 @@ public class RecursiveParserWrapperTest extends TikaTest {
assertNull(content);
}
-
@Test
public void testCharLimit() throws Exception {
ParseContext context = new ParseContext();
@@ -304,13 +303,18 @@ public class RecursiveParserWrapperTest extends TikaTest {
List<Metadata> list = getMetadata(metadata,
new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1),
true, new CommonsDigester(100000, "md5"));
- int i = 0;
- Metadata m0 = list.get(0);
- Metadata m6 = list.get(6);
+
String md5Key = "X-TIKA:digest:MD5";
assertEquals("59f626e09a8c16ab6dbc2800c685f772", list.get(0).get(md5Key));
assertEquals("ccdf3882e7e4c2454e28884db9b0a54d", list.get(6).get(md5Key));
assertEquals("a869bf6432ebd14e19fc79416274e0c9", list.get(7).get(md5Key));
+
+ //while we're at it, also test the embedded id path
+ assertEquals("/2/5/8/9", list.get(6).get(TikaCoreProperties.EMBEDDED_ID_PATH));
+ assertEquals("/embed1.zip/embed2.zip/embed3.zip/embed3.txt",
+ list.get(6).get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH));
+ assertEquals(9, list.get(6).getInt(TikaCoreProperties.EMBEDDED_ID));
+ assertEquals(4, list.get(6).getInt(TikaCoreProperties.EMBEDDED_DEPTH));
}
@Test
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/rtf/RTFParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/rtf/RTFParserTest.java
index 4cfb09bc6..d8ed9f31a 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/rtf/RTFParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/rtf/RTFParserTest.java
@@ -134,8 +134,9 @@ public class RTFParserTest extends TikaTest {
assertEquals("false", meta_jpg_exif.get(RTFMetadata.THUMBNAIL));
//need flexibility for if tesseract is installed or not
- assertTrue(meta_jpg.names().length >= 50 && meta_jpg.names().length <= 51);
- assertTrue(meta_jpg_exif.names().length >= 109 && meta_jpg_exif.names().length <= 110);
+ //TODO -- fix this test. It is too fragile.
+ assertTrue(meta_jpg.names().length >= 52 && meta_jpg.names().length <= 53);
+ assertTrue(meta_jpg_exif.names().length >= 111 && meta_jpg_exif.names().length <= 112);
}
private static class Pair {