You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2023/05/09 13:02:39 UTC

[tika] 01/01: TIKA-4033 -- improve metadata handling for incremental updates. Generalize incremental updates to "version" and avoid use of synthetic name for resourceName.

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-4033
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 532b47b93cf8816e4d884059d812a28c0866f257
Author: tallison <ta...@apache.org>
AuthorDate: Tue May 9 09:02:20 2023 -0400

    TIKA-4033 -- improve metadata handling for incremental updates.  Generalize incremental updates to "version" and avoid use of synthetic name for resourceName.
---
 .../src/main/java/org/apache/tika/metadata/PDF.java  | 20 ++++++++++++--------
 .../org/apache/tika/metadata/TikaCoreProperties.java | 14 ++++++++++++++
 .../apache/tika/parser/RecursiveParserWrapper.java   |  2 ++
 .../apache/tika/parser/pdf/AbstractPDF2XHTML.java    |  1 -
 .../java/org/apache/tika/parser/pdf/PDFParser.java   |  2 +-
 .../tika/parser/pdf/PDFIncrementalUpdatesTest.java   | 16 +++++++++++-----
 .../org/apache/tika/parser/pdf/PDFParserTest.java    | 12 ++++++++----
 7 files changed, 48 insertions(+), 19 deletions(-)

diff --git a/tika-core/src/main/java/org/apache/tika/metadata/PDF.java b/tika-core/src/main/java/org/apache/tika/metadata/PDF.java
index 0739855fc..c2baca0e8 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/PDF.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/PDF.java
@@ -28,12 +28,6 @@ public interface PDF {
     String PDFAID_PREFIX = "pdfaid" + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER;
 
 
-    /**
-     * Incremental updates as extracted by the StartXRefScanner.  See
-     * that class for limitations.
-     */
-    Property PDF_INCREMENTAL_UPDATES = Property.externalInteger(PDF_PREFIX + "incrementalUpdates");
-
     /**
      * Number of %%EOF as extracted by the StartXRefScanner. See
      * that class for limitations.
@@ -203,6 +197,16 @@ public interface PDF {
      * This value is populated when the parse incremental updates feature is selected
      * in the PDFParser.
      */
-    Property INCREMENTAL_UPDATE_NUMBER = Property.internalInteger(PDF_PREFIX +
-            "incrementalUpdateNumber");
+    Property INCREMENTAL_UPDATE_NUMBER =
+            Property.composite(Property.internalInteger(PDF_PREFIX + "incrementalUpdateNumber"),
+                new Property[]{ TikaCoreProperties.VERSION_NUMBER });
+
+    /**
+     * Number of incremental updates as extracted by the StartXRefScanner.  See
+     * that class for limitations.
+     */
+    Property PDF_INCREMENTAL_UPDATE_COUNT =
+            Property.composite( Property.externalInteger(PDF_PREFIX + "incrementalUpdateCount"),
+                    new Property[]{ TikaCoreProperties.VERSION_COUNT });
+
 }
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
index 95cf0a035..b49068c39 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
@@ -322,6 +322,20 @@ public interface TikaCoreProperties {
     //is the file encrypted
     Property IS_ENCRYPTED = Property.internalBoolean(TIKA_META_PREFIX + "encrypted");
 
+    /**
+     * General metadata key for the count of non-final versions available within a file.  This
+     * was added initially to support generalizing incremental updates in PDF.
+     */
+    Property VERSION_COUNT = Property.externalInteger(TIKA_META_PREFIX + "versionCount");
+
+    /**
+     * General metadata key for the version number of an earlier version that is
+     * contained within a file.  Numbering is 0-indexed, with 0 as the earliest version.
+     * The latest version does not have this metadata value.  This was added initially
+     * to support generalizing incremental updates in PDF.
+     */
+    Property VERSION_NUMBER = Property.externalInteger(TIKA_META_PREFIX + "versionNumber");
+
     /**
      * A file might contain different types of embedded documents.
      * The most common is the ATTACHMENT.
diff --git a/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java b/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
index 101aa3395..483181b0a 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
@@ -188,6 +188,8 @@ public class RecursiveParserWrapper extends ParserDecorator {
             objectName = metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY);
         } else if (metadata.get(TikaCoreProperties.EMBEDDED_RELATIONSHIP_ID) != null) {
             objectName = metadata.get(TikaCoreProperties.EMBEDDED_RELATIONSHIP_ID);
+        } else if (metadata.get(TikaCoreProperties.VERSION_NUMBER) != null) {
+            objectName = "version-number-" + metadata.get(TikaCoreProperties.VERSION_NUMBER);
         } else {
             objectName = "embedded-" + (++state.unknownCount);
         }
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
index 5b2d4a659..3bd1a90a8 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
@@ -1100,7 +1100,6 @@ class AbstractPDF2XHTML extends PDFTextStripper {
             updateMetadata.set(PDF.INCREMENTAL_UPDATE_NUMBER, count);
             updateMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
                     TikaCoreProperties.EmbeddedResourceType.VERSION.toString());
-            updateMetadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, "incremental-update-" + count);
             if (embeddedDocumentExtractor.shouldParseEmbedded(updateMetadata)) {
                 try (InputStream tis = TikaInputStream.get(update)) {
                     context.set(IsIncrementalUpdate.class, IsIncrementalUpdate.IS_INCREMENTAL_UPDATE);
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
index 5cd074c21..07c4e2d9e 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
@@ -279,7 +279,7 @@ public class PDFParser extends AbstractParser implements RenderingParser, Initia
             //don't count the last xref as an incremental update
             startXrefs--;
         }
-        metadata.set(PDF.PDF_INCREMENTAL_UPDATES, startXrefs);
+        metadata.set(PDF.PDF_INCREMENTAL_UPDATE_COUNT, startXrefs);
         if (localConfig.isParseIncrementalUpdates()) {
             try {
                 parseContext.set(IncrementalUpdateRecord.class,
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFIncrementalUpdatesTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFIncrementalUpdatesTest.java
index 86b4b6081..7b6c88626 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFIncrementalUpdatesTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFIncrementalUpdatesTest.java
@@ -67,7 +67,8 @@ public class PDFIncrementalUpdatesTest extends TikaTest {
         List<Metadata> metadataList = getRecursiveMetadata(
                 "testPDF_incrementalUpdates.pdf",
                 parseContext);
-        assertEquals(2, metadataList.get(0).getInt(PDF.PDF_INCREMENTAL_UPDATES));
+        assertEquals(2, metadataList.get(0).getInt(TikaCoreProperties.VERSION_COUNT));
+        assertEquals(2, metadataList.get(0).getInt(PDF.PDF_INCREMENTAL_UPDATE_COUNT));
         long[] expected = new long[]{16242, 41226, 64872};
         long[] eofs = metadataList.get(0).getLongValues(PDF.EOF_OFFSETS);
         assertEquals(3, eofs.length);
@@ -155,7 +156,8 @@ public class PDFIncrementalUpdatesTest extends TikaTest {
                 "testPDF_incrementalUpdates.pdf",
                 parseContext);
         assertEquals(3, metadataList.size());
-        assertEquals(2, metadataList.get(0).getInt(PDF.PDF_INCREMENTAL_UPDATES));
+        assertEquals(2, metadataList.get(0).getInt(PDF.PDF_INCREMENTAL_UPDATE_COUNT));
+        assertEquals(2, metadataList.get(0).getInt(TikaCoreProperties.VERSION_COUNT));
         long[] expected = new long[]{16242, 41226, 64872};
         long[] eofs = metadataList.get(0).getLongValues(PDF.EOF_OFFSETS);
         assertEquals(3, eofs.length);
@@ -170,9 +172,13 @@ public class PDFIncrementalUpdatesTest extends TikaTest {
         assertNull(metadataList.get(0).get(PDF.INCREMENTAL_UPDATE_NUMBER));
         assertEquals(0, metadataList.get(1).getInt(PDF.INCREMENTAL_UPDATE_NUMBER));
         assertEquals(1, metadataList.get(2).getInt(PDF.INCREMENTAL_UPDATE_NUMBER));
-        assertEquals("incremental-update-0", metadataList.get(1).get(TikaCoreProperties.RESOURCE_NAME_KEY));
-        assertEquals("incremental-update-1",
-                metadataList.get(2).get(TikaCoreProperties.RESOURCE_NAME_KEY));
+        assertEquals("/version-number-0",
+                metadataList.get(1).get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH));
+        assertEquals("/version-number-1",
+                metadataList.get(2).get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH));
+
+        assertNull(metadataList.get(1).get(TikaCoreProperties.RESOURCE_NAME_KEY));
+        assertNull(metadataList.get(2).get(TikaCoreProperties.RESOURCE_NAME_KEY));
 
         assertEquals(TikaCoreProperties.EmbeddedResourceType.VERSION.toString(),
                 metadataList.get(1).get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE));
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index ab0340afe..78f54c4f1 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -477,7 +477,9 @@ public class PDFParserTest extends TikaTest {
         parseContext.set(PDFParserConfig.class, pdfParserConfig);
         List<Metadata> metadataList = getRecursiveMetadata("test-incremental-updates.eml", parseContext);
         assertEquals(4, metadataList.size());
-        assertEquals(2, metadataList.get(3).getInt(PDF.PDF_INCREMENTAL_UPDATES));
+        assertEquals(2, metadataList.get(3).getInt(PDF.PDF_INCREMENTAL_UPDATE_COUNT));
+        assertEquals(2,
+                metadataList.get(3).getInt(TikaCoreProperties.VERSION_COUNT));
         long[] expected = new long[]{16242, 41226, 64872};
         long[] eofs = metadataList.get(3).getLongValues(PDF.EOF_OFFSETS);
         assertEquals(3, eofs.length);
@@ -494,9 +496,11 @@ public class PDFParserTest extends TikaTest {
         assertNull(metadataList.get(3).get(PDF.INCREMENTAL_UPDATE_NUMBER));
         assertEquals(0, metadataList.get(1).getInt(PDF.INCREMENTAL_UPDATE_NUMBER));
         assertEquals(1, metadataList.get(2).getInt(PDF.INCREMENTAL_UPDATE_NUMBER));
-        assertEquals("incremental-update-0", metadataList.get(1).get(TikaCoreProperties.RESOURCE_NAME_KEY));
-        assertEquals("incremental-update-1",
-                metadataList.get(2).get(TikaCoreProperties.RESOURCE_NAME_KEY));
+
+        assertEquals("/testPDF_incrementalUpdates.pdf/version-number-0",
+                metadataList.get(1).get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH));
+        assertEquals("/testPDF_incrementalUpdates.pdf/version-number-1",
+                metadataList.get(2).get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH));
 
         assertEquals(TikaCoreProperties.EmbeddedResourceType.VERSION.toString(),
                 metadataList.get(1).get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE));