You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2022/12/02 19:57:22 UTC

[tika] branch TIKA-3942 created (now d2c435311)

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a change to branch TIKA-3942
in repository https://gitbox.apache.org/repos/asf/tika.git


      at d2c435311 TIKA-3942 -- ensure unique embedded paths

This branch includes the following new commits:

     new d2c435311 TIKA-3942 -- ensure unique embedded paths

The 1 revision listed above as "new" is entirely new to this
repository and will be described in a separate email.  Any revisions
listed as "add" were already present in the repository and have only
been added to this reference.



[tika] 01/01: TIKA-3942 -- ensure unique embedded paths

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-3942
in repository https://gitbox.apache.org/repos/asf/tika.git

commit d2c435311a57a9ac406a26d14cafcf685aba874b
Author: tballison <ta...@apache.org>
AuthorDate: Fri Dec 2 14:57:12 2022 -0500

    TIKA-3942 -- ensure unique embedded paths
---
 CHANGES.txt                                        |  2 ++
 .../apache/tika/metadata/TikaCoreProperties.java   | 22 ++++++++++++++++++++++
 .../apache/tika/parser/RecursiveParserWrapper.java | 20 +++++++++++++++-----
 .../tika/parser/RecursiveParserWrapperTest.java    | 12 ++++++++----
 .../tika/parser/microsoft/rtf/RTFParserTest.java   |  5 +++--
 5 files changed, 50 insertions(+), 11 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index 630b3c3f0..46ecddedc 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,7 @@
 Release 2.6.1 - ???
 
+   * Add X-TIKA:embedded_id_path to ensure unique embedded file paths (TIKA-3942).
+
    * Fix bug that prevented digests when the fallback/EmptyParser
      was called (TIKA-3939).
 
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
index 8ba1834f4..e49144354 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
@@ -50,8 +50,30 @@ public interface TikaCoreProperties {
      */
     String TIKA_META_PREFIX = "X-TIKA" + NAMESPACE_PREFIX_DELIMITER;
     Property EMBEDDED_DEPTH = Property.internalInteger(TIKA_META_PREFIX + "embedded_depth");
+
+    /**
+     * This tracks the embedded file paths based on the name of embedded files
+     * where available.  There is a small risk that there may be path collisions
+     * and that these paths may not be unique within a file.
+     *
+     * For a more robust path, see {@link TikaCoreProperties#EMBEDDED_ID_PATH}.
+     */
     Property EMBEDDED_RESOURCE_PATH =
             Property.internalText(TIKA_META_PREFIX + "embedded_resource_path");
+
+    /**
+     * This tracks the embedded file paths based on the embedded file's
+     * {@link TikaCoreProperties#EMBEDDED_ID}.
+     */
+    Property EMBEDDED_ID_PATH =
+            Property.internalText(TIKA_META_PREFIX + "embedded_id_path");
+
+    /**
+     * This is a 1-indexed counter for embedded files, used by the RecursiveParserWrapper
+     */
+    Property EMBEDDED_ID =
+            Property.internalInteger(TIKA_META_PREFIX + "embedded_id");
+
     Property PARSE_TIME_MILLIS = Property.internalText(TIKA_META_PREFIX + "parse_time_millis");
     /**
      * Simple class name of the content handler
diff --git a/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java b/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
index 79ff4c379..101aa3395 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
@@ -137,7 +137,7 @@ public class RecursiveParserWrapper extends ParserDecorator {
                     "ContentHandler must implement RecursiveParserWrapperHandler");
         }
         EmbeddedParserDecorator decorator =
-                new EmbeddedParserDecorator(getWrappedParser(), "/", parserState);
+                new EmbeddedParserDecorator(getWrappedParser(), "/", "/", parserState);
         context.set(Parser.class, decorator);
         ContentHandler localHandler =
                 parserState.recursiveParserWrapperHandler.getNewContentHandler();
@@ -204,13 +204,17 @@ public class RecursiveParserWrapper extends ParserDecorator {
         private final ParserState parserState;
         private String location = null;
 
+        private String embeddedIdPath = null;
 
-        private EmbeddedParserDecorator(Parser parser, String location, ParserState parseState) {
+
+        private EmbeddedParserDecorator(Parser parser, String location,
+                                        String embeddedIdPath, ParserState parseState) {
             super(parser);
             this.location = location;
             if (!this.location.endsWith("/")) {
                 this.location += "/";
             }
+            this.embeddedIdPath = embeddedIdPath;
             this.parserState = parseState;
         }
 
@@ -227,7 +231,12 @@ public class RecursiveParserWrapper extends ParserDecorator {
 
             metadata.add(TikaCoreProperties.EMBEDDED_RESOURCE_PATH, objectLocation);
 
-
+            String idPath =
+                    this.embeddedIdPath.equals("/") ?
+                            this.embeddedIdPath + ++parserState.embeddedCount :
+                            this.embeddedIdPath + "/" + ++parserState.embeddedCount;
+            metadata.add(TikaCoreProperties.EMBEDDED_ID_PATH, idPath);
+            metadata.set(TikaCoreProperties.EMBEDDED_ID, parserState.embeddedCount);
             //get a fresh handler
             ContentHandler localHandler =
                     parserState.recursiveParserWrapperHandler.getNewContentHandler();
@@ -235,7 +244,8 @@ public class RecursiveParserWrapper extends ParserDecorator {
 
             Parser preContextParser = context.get(Parser.class);
             context.set(Parser.class,
-                    new EmbeddedParserDecorator(getWrappedParser(), objectLocation, parserState));
+                    new EmbeddedParserDecorator(getWrappedParser(), objectLocation,
+                            idPath, parserState));
             long started = System.currentTimeMillis();
             RecursivelySecureContentHandler secureContentHandler =
                     context.get(RecursivelySecureContentHandler.class);
@@ -288,7 +298,7 @@ public class RecursiveParserWrapper extends ParserDecorator {
     private static class ParserState {
         private final AbstractRecursiveParserWrapperHandler recursiveParserWrapperHandler;
         private int unknownCount = 0;
-
+        private int embeddedCount = 0;//this is effectively 1-indexed
         private ParserState(AbstractRecursiveParserWrapperHandler handler) {
             this.recursiveParserWrapperHandler = handler;
         }
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
index 03461d5f3..28b22a29a 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
@@ -86,7 +86,6 @@ public class RecursiveParserWrapperTest extends TikaTest {
         assertNull(content);
     }
 
-
     @Test
     public void testCharLimit() throws Exception {
         ParseContext context = new ParseContext();
@@ -304,13 +303,18 @@ public class RecursiveParserWrapperTest extends TikaTest {
         List<Metadata> list = getMetadata(metadata,
                 new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1),
                 true, new CommonsDigester(100000, "md5"));
-        int i = 0;
-        Metadata m0 = list.get(0);
-        Metadata m6 = list.get(6);
+
         String md5Key = "X-TIKA:digest:MD5";
         assertEquals("59f626e09a8c16ab6dbc2800c685f772", list.get(0).get(md5Key));
         assertEquals("ccdf3882e7e4c2454e28884db9b0a54d", list.get(6).get(md5Key));
         assertEquals("a869bf6432ebd14e19fc79416274e0c9", list.get(7).get(md5Key));
+
+        //while we're at it, also test the embedded path id
+        assertEquals("/2/5/8/9", list.get(6).get(TikaCoreProperties.EMBEDDED_ID_PATH));
+        assertEquals("/embed1.zip/embed2.zip/embed3.zip/embed3.txt",
+                list.get(6).get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH));
+        assertEquals(9, list.get(6).getInt(TikaCoreProperties.EMBEDDED_ID));
+        assertEquals(4, list.get(6).getInt(TikaCoreProperties.EMBEDDED_DEPTH));
     }
 
     @Test
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/rtf/RTFParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/rtf/RTFParserTest.java
index 4cfb09bc6..d8ed9f31a 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/rtf/RTFParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/rtf/RTFParserTest.java
@@ -134,8 +134,9 @@ public class RTFParserTest extends TikaTest {
         assertEquals("false", meta_jpg_exif.get(RTFMetadata.THUMBNAIL));
 
         //need flexibility for if tesseract is installed or not
-        assertTrue(meta_jpg.names().length >= 50 && meta_jpg.names().length <= 51);
-        assertTrue(meta_jpg_exif.names().length >= 109 && meta_jpg_exif.names().length <= 110);
+        //TODO -- fix this test.  It is too fragile.
+        assertTrue(meta_jpg.names().length >= 52 && meta_jpg.names().length <= 53);
+        assertTrue(meta_jpg_exif.names().length >= 111 && meta_jpg_exif.names().length <= 112);
     }
 
     private static class Pair {