You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2022/12/02 20:58:25 UTC

[tika] branch main updated: Merge pull request #834

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new fb7f29915 Merge pull request #834
fb7f29915 is described below

commit fb7f29915b367081c053eb5920550ee90a0afa37
Author: Tim Allison <ta...@apache.org>
AuthorDate: Fri Dec 2 15:58:19 2022 -0500

    Merge pull request #834
    
    * TIKA-3942 -- ensure unique embedded paths
---
 CHANGES.txt                                        |  2 ++
 .../apache/tika/metadata/TikaCoreProperties.java   | 22 ++++++++++++++++++++++
 .../apache/tika/parser/RecursiveParserWrapper.java | 20 +++++++++++++++-----
 .../tika/parser/RecursiveParserWrapperTest.java    | 12 ++++++++----
 .../tika/parser/microsoft/rtf/RTFParserTest.java   |  5 +++--
 5 files changed, 50 insertions(+), 11 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index 630b3c3f0..46ecddedc 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,7 @@
 Release 2.6.1 - ???
 
+   * Add X-TIKA:embedded_id_path to ensure unique embedded file paths (TIKA-3942).
+
    * Fix bug that prevented digests when the fallback/EmptyParser
      was called (TIKA-3939).
 
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
index 8ba1834f4..e49144354 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
@@ -50,8 +50,30 @@ public interface TikaCoreProperties {
      */
     String TIKA_META_PREFIX = "X-TIKA" + NAMESPACE_PREFIX_DELIMITER;
     Property EMBEDDED_DEPTH = Property.internalInteger(TIKA_META_PREFIX + "embedded_depth");
+
+    /**
+     * This tracks the embedded file paths based on the name of embedded files
+     * where available.  There is a small risk that there may be path collisions
+     * and that these paths may not be unique within a file.
+     *
+     * For a more robust path, see {@link TikaCoreProperties#EMBEDDED_ID_PATH}.
+     */
     Property EMBEDDED_RESOURCE_PATH =
             Property.internalText(TIKA_META_PREFIX + "embedded_resource_path");
+
+    /**
+     * This tracks the embedded file paths based on the embedded file's
+     * {@link TikaCoreProperties#EMBEDDED_ID}.
+     */
+    Property EMBEDDED_ID_PATH =
+            Property.internalText(TIKA_META_PREFIX + "embedded_id_path");
+
+    /**
+     * This is a 1-indexed counter for embedded files, used by the RecursiveParserWrapper.
+     */
+    Property EMBEDDED_ID =
+            Property.internalInteger(TIKA_META_PREFIX + "embedded_id");
+
     Property PARSE_TIME_MILLIS = Property.internalText(TIKA_META_PREFIX + "parse_time_millis");
     /**
      * Simple class name of the content handler
diff --git a/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java b/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
index 79ff4c379..101aa3395 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
@@ -137,7 +137,7 @@ public class RecursiveParserWrapper extends ParserDecorator {
                     "ContentHandler must implement RecursiveParserWrapperHandler");
         }
         EmbeddedParserDecorator decorator =
-                new EmbeddedParserDecorator(getWrappedParser(), "/", parserState);
+                new EmbeddedParserDecorator(getWrappedParser(), "/", "/", parserState);
         context.set(Parser.class, decorator);
         ContentHandler localHandler =
                 parserState.recursiveParserWrapperHandler.getNewContentHandler();
@@ -204,13 +204,17 @@ public class RecursiveParserWrapper extends ParserDecorator {
         private final ParserState parserState;
         private String location = null;
 
+        private String embeddedIdPath = null;
 
-        private EmbeddedParserDecorator(Parser parser, String location, ParserState parseState) {
+
+        private EmbeddedParserDecorator(Parser parser, String location,
+                                        String embeddedIdPath, ParserState parseState) {
             super(parser);
             this.location = location;
             if (!this.location.endsWith("/")) {
                 this.location += "/";
             }
+            this.embeddedIdPath = embeddedIdPath;
             this.parserState = parseState;
         }
 
@@ -227,7 +231,12 @@ public class RecursiveParserWrapper extends ParserDecorator {
 
             metadata.add(TikaCoreProperties.EMBEDDED_RESOURCE_PATH, objectLocation);
 
-
+            String idPath =
+                    this.embeddedIdPath.equals("/") ?
+                            this.embeddedIdPath + ++parserState.embeddedCount :
+                            this.embeddedIdPath + "/" + ++parserState.embeddedCount;
+            metadata.add(TikaCoreProperties.EMBEDDED_ID_PATH, idPath);
+            metadata.set(TikaCoreProperties.EMBEDDED_ID, parserState.embeddedCount);
             //get a fresh handler
             ContentHandler localHandler =
                     parserState.recursiveParserWrapperHandler.getNewContentHandler();
@@ -235,7 +244,8 @@ public class RecursiveParserWrapper extends ParserDecorator {
 
             Parser preContextParser = context.get(Parser.class);
             context.set(Parser.class,
-                    new EmbeddedParserDecorator(getWrappedParser(), objectLocation, parserState));
+                    new EmbeddedParserDecorator(getWrappedParser(), objectLocation,
+                            idPath, parserState));
             long started = System.currentTimeMillis();
             RecursivelySecureContentHandler secureContentHandler =
                     context.get(RecursivelySecureContentHandler.class);
@@ -288,7 +298,7 @@ public class RecursiveParserWrapper extends ParserDecorator {
     private static class ParserState {
         private final AbstractRecursiveParserWrapperHandler recursiveParserWrapperHandler;
         private int unknownCount = 0;
-
+        private int embeddedCount = 0;//this is effectively 1-indexed
         private ParserState(AbstractRecursiveParserWrapperHandler handler) {
             this.recursiveParserWrapperHandler = handler;
         }
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
index 03461d5f3..28b22a29a 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
@@ -86,7 +86,6 @@ public class RecursiveParserWrapperTest extends TikaTest {
         assertNull(content);
     }
 
-
     @Test
     public void testCharLimit() throws Exception {
         ParseContext context = new ParseContext();
@@ -304,13 +303,18 @@ public class RecursiveParserWrapperTest extends TikaTest {
         List<Metadata> list = getMetadata(metadata,
                 new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1),
                 true, new CommonsDigester(100000, "md5"));
-        int i = 0;
-        Metadata m0 = list.get(0);
-        Metadata m6 = list.get(6);
+
         String md5Key = "X-TIKA:digest:MD5";
         assertEquals("59f626e09a8c16ab6dbc2800c685f772", list.get(0).get(md5Key));
         assertEquals("ccdf3882e7e4c2454e28884db9b0a54d", list.get(6).get(md5Key));
         assertEquals("a869bf6432ebd14e19fc79416274e0c9", list.get(7).get(md5Key));
+
+        //while we're at it, also test the embedded id path
+        assertEquals("/2/5/8/9", list.get(6).get(TikaCoreProperties.EMBEDDED_ID_PATH));
+        assertEquals("/embed1.zip/embed2.zip/embed3.zip/embed3.txt",
+                list.get(6).get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH));
+        assertEquals(9, list.get(6).getInt(TikaCoreProperties.EMBEDDED_ID));
+        assertEquals(4, list.get(6).getInt(TikaCoreProperties.EMBEDDED_DEPTH));
     }
 
     @Test
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/rtf/RTFParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/rtf/RTFParserTest.java
index 4cfb09bc6..d8ed9f31a 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/rtf/RTFParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/rtf/RTFParserTest.java
@@ -134,8 +134,9 @@ public class RTFParserTest extends TikaTest {
         assertEquals("false", meta_jpg_exif.get(RTFMetadata.THUMBNAIL));
 
         //need flexibility for if tesseract is installed or not
-        assertTrue(meta_jpg.names().length >= 50 && meta_jpg.names().length <= 51);
-        assertTrue(meta_jpg_exif.names().length >= 109 && meta_jpg_exif.names().length <= 110);
+        //TODO -- fix this test.  It is too fragile.
+        assertTrue(meta_jpg.names().length >= 52 && meta_jpg.names().length <= 53);
+        assertTrue(meta_jpg_exif.names().length >= 111 && meta_jpg_exif.names().length <= 112);
     }
 
     private static class Pair {