You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2023/05/09 13:02:38 UTC

[tika] branch TIKA-4033 created (now 532b47b93)

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a change to branch TIKA-4033
in repository https://gitbox.apache.org/repos/asf/tika.git


      at 532b47b93 TIKA-4033 -- improve metadata handling for incremental updates.  Generalize incremental updates to "version" and avoid use of synthetic name for resourceName.

This branch includes the following new commits:

     new 532b47b93 TIKA-4033 -- improve metadata handling for incremental updates.  Generalize incremental updates to "version" and avoid use of synthetic name for resourceName.

The 1 revision listed above as "new" is entirely new to this
repository and will be described in a separate email.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.



[tika] 01/01: TIKA-4033 -- improve metadata handling for incremental updates. Generalize incremental updates to "version" and avoid use of synthetic name for resourceName.

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-4033
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 532b47b93cf8816e4d884059d812a28c0866f257
Author: tallison <ta...@apache.org>
AuthorDate: Tue May 9 09:02:20 2023 -0400

    TIKA-4033 -- improve metadata handling for incremental updates.  Generalize incremental updates to "version" and avoid use of synthetic name for resourceName.
---
 .../src/main/java/org/apache/tika/metadata/PDF.java  | 20 ++++++++++++--------
 .../org/apache/tika/metadata/TikaCoreProperties.java | 14 ++++++++++++++
 .../apache/tika/parser/RecursiveParserWrapper.java   |  2 ++
 .../apache/tika/parser/pdf/AbstractPDF2XHTML.java    |  1 -
 .../java/org/apache/tika/parser/pdf/PDFParser.java   |  2 +-
 .../tika/parser/pdf/PDFIncrementalUpdatesTest.java   | 16 +++++++++++-----
 .../org/apache/tika/parser/pdf/PDFParserTest.java    | 12 ++++++++----
 7 files changed, 48 insertions(+), 19 deletions(-)

diff --git a/tika-core/src/main/java/org/apache/tika/metadata/PDF.java b/tika-core/src/main/java/org/apache/tika/metadata/PDF.java
index 0739855fc..c2baca0e8 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/PDF.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/PDF.java
@@ -28,12 +28,6 @@ public interface PDF {
     String PDFAID_PREFIX = "pdfaid" + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER;
 
 
-    /**
-     * Incremental updates as extracted by the StartXRefScanner.  See
-     * that class for limitations.
-     */
-    Property PDF_INCREMENTAL_UPDATES = Property.externalInteger(PDF_PREFIX + "incrementalUpdates");
-
     /**
      * Number of %%EOF as extracted by the StartXRefScanner. See
      * that class for limitations.
@@ -203,6 +197,16 @@ public interface PDF {
      * This value is populated with the parse incremental updates feature is selected
      * in the PDFParser.
      */
-    Property INCREMENTAL_UPDATE_NUMBER = Property.internalInteger(PDF_PREFIX +
-            "incrementalUpdateNumber");
+    Property INCREMENTAL_UPDATE_NUMBER =
+            Property.composite(Property.internalInteger(PDF_PREFIX + "incrementalUpdateNumber"),
+                new Property[]{ TikaCoreProperties.VERSION_NUMBER });
+
+    /**
+     * Number of incremental updates as extracted by the StartXRefScanner.  See
+     * that class for limitations.
+     */
+    Property PDF_INCREMENTAL_UPDATE_COUNT =
+            Property.composite( Property.externalInteger(PDF_PREFIX + "incrementalUpdateCount"),
+                    new Property[]{ TikaCoreProperties.VERSION_COUNT });
+
 }
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
index 95cf0a035..b49068c39 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
@@ -322,6 +322,20 @@ public interface TikaCoreProperties {
     //is the file encrypted
     Property IS_ENCRYPTED = Property.internalBoolean(TIKA_META_PREFIX + "encrypted");
 
+    /**
+     * General metadata key for the count of non-final versions available within a file.  This
+     * was added initially to support generalizing incremental updates in PDF.
+     */
+    Property VERSION_COUNT = Property.externalInteger(TIKA_META_PREFIX + "versionCount");
+
+    /**
+     * General metadata key for the version number of a given file that contains
+     * earlier versions within it.  This number is 0-indexed for the earliest version.
+     * The latest version does not have this metadata value.  This was added initially
+     * to support generalizing incremental updates in PDF.
+     */
+    Property VERSION_NUMBER = Property.externalInteger(TIKA_META_PREFIX + "versionNumber");
+
     /**
      * A file might contain different types of embedded documents.
      * The most common is the ATTACHMENT.
diff --git a/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java b/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
index 101aa3395..483181b0a 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
@@ -188,6 +188,8 @@ public class RecursiveParserWrapper extends ParserDecorator {
             objectName = metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY);
         } else if (metadata.get(TikaCoreProperties.EMBEDDED_RELATIONSHIP_ID) != null) {
             objectName = metadata.get(TikaCoreProperties.EMBEDDED_RELATIONSHIP_ID);
+        } else if (metadata.get(TikaCoreProperties.VERSION_NUMBER) != null) {
+            objectName = "version-number-" + metadata.get(TikaCoreProperties.VERSION_NUMBER);
         } else {
             objectName = "embedded-" + (++state.unknownCount);
         }
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
index 5b2d4a659..3bd1a90a8 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
@@ -1100,7 +1100,6 @@ class AbstractPDF2XHTML extends PDFTextStripper {
             updateMetadata.set(PDF.INCREMENTAL_UPDATE_NUMBER, count);
             updateMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
                     TikaCoreProperties.EmbeddedResourceType.VERSION.toString());
-            updateMetadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, "incremental-update-" + count);
             if (embeddedDocumentExtractor.shouldParseEmbedded(updateMetadata)) {
                 try (InputStream tis = TikaInputStream.get(update)) {
                     context.set(IsIncrementalUpdate.class, IsIncrementalUpdate.IS_INCREMENTAL_UPDATE);
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
index 5cd074c21..07c4e2d9e 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
@@ -279,7 +279,7 @@ public class PDFParser extends AbstractParser implements RenderingParser, Initia
             //don't count the last xref as an incremental update
             startXrefs--;
         }
-        metadata.set(PDF.PDF_INCREMENTAL_UPDATES, startXrefs);
+        metadata.set(PDF.PDF_INCREMENTAL_UPDATE_COUNT, startXrefs);
         if (localConfig.isParseIncrementalUpdates()) {
             try {
                 parseContext.set(IncrementalUpdateRecord.class,
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFIncrementalUpdatesTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFIncrementalUpdatesTest.java
index 86b4b6081..7b6c88626 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFIncrementalUpdatesTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFIncrementalUpdatesTest.java
@@ -67,7 +67,8 @@ public class PDFIncrementalUpdatesTest extends TikaTest {
         List<Metadata> metadataList = getRecursiveMetadata(
                 "testPDF_incrementalUpdates.pdf",
                 parseContext);
-        assertEquals(2, metadataList.get(0).getInt(PDF.PDF_INCREMENTAL_UPDATES));
+        assertEquals(2, metadataList.get(0).getInt(TikaCoreProperties.VERSION_COUNT));
+        assertEquals(2, metadataList.get(0).getInt(PDF.PDF_INCREMENTAL_UPDATE_COUNT));
         long[] expected = new long[]{16242, 41226, 64872};
         long[] eofs = metadataList.get(0).getLongValues(PDF.EOF_OFFSETS);
         assertEquals(3, eofs.length);
@@ -155,7 +156,8 @@ public class PDFIncrementalUpdatesTest extends TikaTest {
                 "testPDF_incrementalUpdates.pdf",
                 parseContext);
         assertEquals(3, metadataList.size());
-        assertEquals(2, metadataList.get(0).getInt(PDF.PDF_INCREMENTAL_UPDATES));
+        assertEquals(2, metadataList.get(0).getInt(PDF.PDF_INCREMENTAL_UPDATE_COUNT));
+        assertEquals(2, metadataList.get(0).getInt(TikaCoreProperties.VERSION_COUNT));
         long[] expected = new long[]{16242, 41226, 64872};
         long[] eofs = metadataList.get(0).getLongValues(PDF.EOF_OFFSETS);
         assertEquals(3, eofs.length);
@@ -170,9 +172,13 @@ public class PDFIncrementalUpdatesTest extends TikaTest {
         assertNull(metadataList.get(0).get(PDF.INCREMENTAL_UPDATE_NUMBER));
         assertEquals(0, metadataList.get(1).getInt(PDF.INCREMENTAL_UPDATE_NUMBER));
         assertEquals(1, metadataList.get(2).getInt(PDF.INCREMENTAL_UPDATE_NUMBER));
-        assertEquals("incremental-update-0", metadataList.get(1).get(TikaCoreProperties.RESOURCE_NAME_KEY));
-        assertEquals("incremental-update-1",
-                metadataList.get(2).get(TikaCoreProperties.RESOURCE_NAME_KEY));
+        assertEquals("/version-number-0",
+                metadataList.get(1).get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH));
+        assertEquals("/version-number-1",
+                metadataList.get(2).get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH));
+
+        assertNull(metadataList.get(1).get(TikaCoreProperties.RESOURCE_NAME_KEY));
+        assertNull(metadataList.get(2).get(TikaCoreProperties.RESOURCE_NAME_KEY));
 
         assertEquals(TikaCoreProperties.EmbeddedResourceType.VERSION.toString(),
                 metadataList.get(1).get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE));
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index ab0340afe..78f54c4f1 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -477,7 +477,9 @@ public class PDFParserTest extends TikaTest {
         parseContext.set(PDFParserConfig.class, pdfParserConfig);
         List<Metadata> metadataList = getRecursiveMetadata("test-incremental-updates.eml", parseContext);
         assertEquals(4, metadataList.size());
-        assertEquals(2, metadataList.get(3).getInt(PDF.PDF_INCREMENTAL_UPDATES));
+        assertEquals(2, metadataList.get(3).getInt(PDF.PDF_INCREMENTAL_UPDATE_COUNT));
+        assertEquals(2,
+                metadataList.get(3).getInt(TikaCoreProperties.VERSION_COUNT));
         long[] expected = new long[]{16242, 41226, 64872};
         long[] eofs = metadataList.get(3).getLongValues(PDF.EOF_OFFSETS);
         assertEquals(3, eofs.length);
@@ -494,9 +496,11 @@ public class PDFParserTest extends TikaTest {
         assertNull(metadataList.get(3).get(PDF.INCREMENTAL_UPDATE_NUMBER));
         assertEquals(0, metadataList.get(1).getInt(PDF.INCREMENTAL_UPDATE_NUMBER));
         assertEquals(1, metadataList.get(2).getInt(PDF.INCREMENTAL_UPDATE_NUMBER));
-        assertEquals("incremental-update-0", metadataList.get(1).get(TikaCoreProperties.RESOURCE_NAME_KEY));
-        assertEquals("incremental-update-1",
-                metadataList.get(2).get(TikaCoreProperties.RESOURCE_NAME_KEY));
+
+        assertEquals("/testPDF_incrementalUpdates.pdf/version-number-0",
+                metadataList.get(1).get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH));
+        assertEquals("/testPDF_incrementalUpdates.pdf/version-number-1",
+                metadataList.get(2).get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH));
 
         assertEquals(TikaCoreProperties.EmbeddedResourceType.VERSION.toString(),
                 metadataList.get(1).get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE));