You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2023/05/09 13:02:39 UTC
[tika] 01/01: TIKA-4033 -- improve metadata handling for incremental updates. Generalize incremental updates to "version" and avoid use of synthetic name for resourceName.
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch TIKA-4033
in repository https://gitbox.apache.org/repos/asf/tika.git
commit 532b47b93cf8816e4d884059d812a28c0866f257
Author: tallison <ta...@apache.org>
AuthorDate: Tue May 9 09:02:20 2023 -0400
TIKA-4033 -- improve metadata handling for incremental updates. Generalize incremental updates to "version" and avoid use of synthetic name for resourceName.
---
.../src/main/java/org/apache/tika/metadata/PDF.java | 20 ++++++++++++--------
.../org/apache/tika/metadata/TikaCoreProperties.java | 14 ++++++++++++++
.../apache/tika/parser/RecursiveParserWrapper.java | 2 ++
.../apache/tika/parser/pdf/AbstractPDF2XHTML.java | 1 -
.../java/org/apache/tika/parser/pdf/PDFParser.java | 2 +-
.../tika/parser/pdf/PDFIncrementalUpdatesTest.java | 16 +++++++++++-----
.../org/apache/tika/parser/pdf/PDFParserTest.java | 12 ++++++++----
7 files changed, 48 insertions(+), 19 deletions(-)
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/PDF.java b/tika-core/src/main/java/org/apache/tika/metadata/PDF.java
index 0739855fc..c2baca0e8 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/PDF.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/PDF.java
@@ -28,12 +28,6 @@ public interface PDF {
String PDFAID_PREFIX = "pdfaid" + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER;
- /**
- * Incremental updates as extracted by the StartXRefScanner. See
- * that class for limitations.
- */
- Property PDF_INCREMENTAL_UPDATES = Property.externalInteger(PDF_PREFIX + "incrementalUpdates");
-
/**
* Number of %%EOF as extracted by the StartXRefScanner. See
* that class for limitations.
@@ -203,6 +197,16 @@ public interface PDF {
* This value is populated when the parse incremental updates feature is selected
* in the PDFParser.
*/
- Property INCREMENTAL_UPDATE_NUMBER = Property.internalInteger(PDF_PREFIX +
- "incrementalUpdateNumber");
+ Property INCREMENTAL_UPDATE_NUMBER =
+ Property.composite(Property.internalInteger(PDF_PREFIX + "incrementalUpdateNumber"),
+ new Property[]{ TikaCoreProperties.VERSION_NUMBER });
+
+ /**
+ * Incremental updates as extracted by the StartXRefScanner. See
+ * that class for limitations.
+ */
+ Property PDF_INCREMENTAL_UPDATE_COUNT =
+ Property.composite( Property.externalInteger(PDF_PREFIX + "incrementalUpdateCount"),
+ new Property[]{ TikaCoreProperties.VERSION_COUNT });
+
}
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
index 95cf0a035..b49068c39 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
@@ -322,6 +322,20 @@ public interface TikaCoreProperties {
//is the file encrypted
Property IS_ENCRYPTED = Property.internalBoolean(TIKA_META_PREFIX + "encrypted");
+ /**
+ * General metadata key for the count of non-final versions available within a file. This
+ * was added initially to support generalizing incremental updates in PDF.
+ */
+ Property VERSION_COUNT = Property.externalInteger(TIKA_META_PREFIX + "versionCount");
+
+ /**
+ * General metadata key for the version number of a given file that contains
+ * earlier versions within it. This number is 0-indexed for the earliest version.
+ * The latest version does not have this metadata value. This was added initially
+ * to support generalizing incremental updates in PDF.
+ */
+ Property VERSION_NUMBER = Property.externalInteger(TIKA_META_PREFIX + "versionNumber");
+
/**
* A file might contain different types of embedded documents.
* The most common is the ATTACHMENT.
diff --git a/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java b/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
index 101aa3395..483181b0a 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
@@ -188,6 +188,8 @@ public class RecursiveParserWrapper extends ParserDecorator {
objectName = metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY);
} else if (metadata.get(TikaCoreProperties.EMBEDDED_RELATIONSHIP_ID) != null) {
objectName = metadata.get(TikaCoreProperties.EMBEDDED_RELATIONSHIP_ID);
+ } else if (metadata.get(TikaCoreProperties.VERSION_NUMBER) != null) {
+ objectName = "version-number-" + metadata.get(TikaCoreProperties.VERSION_NUMBER);
} else {
objectName = "embedded-" + (++state.unknownCount);
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
index 5b2d4a659..3bd1a90a8 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
@@ -1100,7 +1100,6 @@ class AbstractPDF2XHTML extends PDFTextStripper {
updateMetadata.set(PDF.INCREMENTAL_UPDATE_NUMBER, count);
updateMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
TikaCoreProperties.EmbeddedResourceType.VERSION.toString());
- updateMetadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, "incremental-update-" + count);
if (embeddedDocumentExtractor.shouldParseEmbedded(updateMetadata)) {
try (InputStream tis = TikaInputStream.get(update)) {
context.set(IsIncrementalUpdate.class, IsIncrementalUpdate.IS_INCREMENTAL_UPDATE);
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
index 5cd074c21..07c4e2d9e 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
@@ -279,7 +279,7 @@ public class PDFParser extends AbstractParser implements RenderingParser, Initia
//don't count the last xref as an incremental update
startXrefs--;
}
- metadata.set(PDF.PDF_INCREMENTAL_UPDATES, startXrefs);
+ metadata.set(PDF.PDF_INCREMENTAL_UPDATE_COUNT, startXrefs);
if (localConfig.isParseIncrementalUpdates()) {
try {
parseContext.set(IncrementalUpdateRecord.class,
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFIncrementalUpdatesTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFIncrementalUpdatesTest.java
index 86b4b6081..7b6c88626 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFIncrementalUpdatesTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFIncrementalUpdatesTest.java
@@ -67,7 +67,8 @@ public class PDFIncrementalUpdatesTest extends TikaTest {
List<Metadata> metadataList = getRecursiveMetadata(
"testPDF_incrementalUpdates.pdf",
parseContext);
- assertEquals(2, metadataList.get(0).getInt(PDF.PDF_INCREMENTAL_UPDATES));
+ assertEquals(2, metadataList.get(0).getInt(TikaCoreProperties.VERSION_COUNT));
+ assertEquals(2, metadataList.get(0).getInt(PDF.PDF_INCREMENTAL_UPDATE_COUNT));
long[] expected = new long[]{16242, 41226, 64872};
long[] eofs = metadataList.get(0).getLongValues(PDF.EOF_OFFSETS);
assertEquals(3, eofs.length);
@@ -155,7 +156,8 @@ public class PDFIncrementalUpdatesTest extends TikaTest {
"testPDF_incrementalUpdates.pdf",
parseContext);
assertEquals(3, metadataList.size());
- assertEquals(2, metadataList.get(0).getInt(PDF.PDF_INCREMENTAL_UPDATES));
+ assertEquals(2, metadataList.get(0).getInt(PDF.PDF_INCREMENTAL_UPDATE_COUNT));
+ assertEquals(2, metadataList.get(0).getInt(TikaCoreProperties.VERSION_COUNT));
long[] expected = new long[]{16242, 41226, 64872};
long[] eofs = metadataList.get(0).getLongValues(PDF.EOF_OFFSETS);
assertEquals(3, eofs.length);
@@ -170,9 +172,13 @@ public class PDFIncrementalUpdatesTest extends TikaTest {
assertNull(metadataList.get(0).get(PDF.INCREMENTAL_UPDATE_NUMBER));
assertEquals(0, metadataList.get(1).getInt(PDF.INCREMENTAL_UPDATE_NUMBER));
assertEquals(1, metadataList.get(2).getInt(PDF.INCREMENTAL_UPDATE_NUMBER));
- assertEquals("incremental-update-0", metadataList.get(1).get(TikaCoreProperties.RESOURCE_NAME_KEY));
- assertEquals("incremental-update-1",
- metadataList.get(2).get(TikaCoreProperties.RESOURCE_NAME_KEY));
+ assertEquals("/version-number-0",
+ metadataList.get(1).get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH));
+ assertEquals("/version-number-1",
+ metadataList.get(2).get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH));
+
+ assertNull(metadataList.get(1).get(TikaCoreProperties.RESOURCE_NAME_KEY));
+ assertNull(metadataList.get(2).get(TikaCoreProperties.RESOURCE_NAME_KEY));
assertEquals(TikaCoreProperties.EmbeddedResourceType.VERSION.toString(),
metadataList.get(1).get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE));
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index ab0340afe..78f54c4f1 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -477,7 +477,9 @@ public class PDFParserTest extends TikaTest {
parseContext.set(PDFParserConfig.class, pdfParserConfig);
List<Metadata> metadataList = getRecursiveMetadata("test-incremental-updates.eml", parseContext);
assertEquals(4, metadataList.size());
- assertEquals(2, metadataList.get(3).getInt(PDF.PDF_INCREMENTAL_UPDATES));
+ assertEquals(2, metadataList.get(3).getInt(PDF.PDF_INCREMENTAL_UPDATE_COUNT));
+ assertEquals(2,
+ metadataList.get(3).getInt(TikaCoreProperties.VERSION_COUNT));
long[] expected = new long[]{16242, 41226, 64872};
long[] eofs = metadataList.get(3).getLongValues(PDF.EOF_OFFSETS);
assertEquals(3, eofs.length);
@@ -494,9 +496,11 @@ public class PDFParserTest extends TikaTest {
assertNull(metadataList.get(3).get(PDF.INCREMENTAL_UPDATE_NUMBER));
assertEquals(0, metadataList.get(1).getInt(PDF.INCREMENTAL_UPDATE_NUMBER));
assertEquals(1, metadataList.get(2).getInt(PDF.INCREMENTAL_UPDATE_NUMBER));
- assertEquals("incremental-update-0", metadataList.get(1).get(TikaCoreProperties.RESOURCE_NAME_KEY));
- assertEquals("incremental-update-1",
- metadataList.get(2).get(TikaCoreProperties.RESOURCE_NAME_KEY));
+
+ assertEquals("/testPDF_incrementalUpdates.pdf/version-number-0",
+ metadataList.get(1).get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH));
+ assertEquals("/testPDF_incrementalUpdates.pdf/version-number-1",
+ metadataList.get(2).get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH));
assertEquals(TikaCoreProperties.EmbeddedResourceType.VERSION.toString(),
metadataList.get(1).get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE));