You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2020/06/02 13:52:50 UTC
[tika] 08/12: TIKA-3101 -- extract metadata from XMP basic schema
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git
commit 3e32418dfa15a3fe19e219d4aaafdc82bb7e25f3
Author: tallison <ta...@apache.org>
AuthorDate: Mon Jun 1 10:03:00 2020 -0400
TIKA-3101 -- extract metadata from XMP basic schema
# Conflicts:
# tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
---
.../main/java/org/apache/tika/metadata/XMP.java | 15 ++++-
.../tika/parser/pdf/PDMetadataExtractor.java | 69 ++++++++++++++++++++-
.../org/apache/tika/parser/pdf/PDFParserTest.java | 14 +++++
.../test-documents/testPDF_XMPBasicSchema.pdf | Bin 0 -> 1577 bytes
4 files changed, 94 insertions(+), 4 deletions(-)
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/XMP.java b/tika-core/src/main/java/org/apache/tika/metadata/XMP.java
index 0f8c7fc..9a26920 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/XMP.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/XMP.java
@@ -26,6 +26,14 @@ public interface XMP {
String PREFIX_ = PREFIX + Metadata.NAMESPACE_PREFIX_DELIMITER;
/**
+ * An unordered array specifying properties that were edited outside the
+ * authoring application. Each item should contain a single namespace and
+ * XPath separated by one ASCII space (U+0020). Not to be confused with
+ * xmp:Identifier, which lists strings that identify the resource.
+ */
+ Property ADVISORY = Property.externalTextBag(PREFIX_ + "Advisory");
+
+ /**
* The date and time the resource was created. For a digital file, this need not
* match a file-system creation time. For a freshly created resource, it should
* be close to that time, modulo the time taken to write the file. Later file
@@ -49,7 +57,7 @@ public interface XMP {
/**
* A word or short phrase that identifies a resource as a member of a userdefined collection.
*/
- Property LABEL = Property.externalDate(PREFIX_ + "Label");
+ Property LABEL = Property.externalText(PREFIX_ + "Label");
/**
* The date and time that any metadata for this resource was last changed. It
@@ -63,6 +71,11 @@ public interface XMP {
Property MODIFY_DATE = Property.externalDate(PREFIX_ + "ModifyDate");
/**
+ * A short informal name for the resource.
+ */
+ Property NICKNAME = Property.externalText(PREFIX_ + "NickName");
+
+ /**
* A user-assigned rating for this file. The value shall be -1 or in the range
* [0..5], where -1 indicates “rejected” and 0 indicates “unrated”. If xmp:Rating
* is not present, a value of 0 should be assumed.
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDMetadataExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDMetadataExtractor.java
index 374471b..16605cb 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDMetadataExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDMetadataExtractor.java
@@ -22,8 +22,10 @@ import java.util.Calendar;
import java.util.List;
import java.util.Locale;
+import org.apache.commons.lang3.StringUtils;
import org.apache.jempbox.xmp.XMPMetadata;
import org.apache.jempbox.xmp.XMPSchema;
+import org.apache.jempbox.xmp.XMPSchemaBasic;
import org.apache.jempbox.xmp.XMPSchemaDublinCore;
import org.apache.jempbox.xmp.pdfa.XMPSchemaPDFAId;
import org.apache.pdfbox.cos.COSArray;
@@ -38,6 +40,7 @@ import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.PDF;
import org.apache.tika.metadata.Property;
import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.metadata.XMP;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.image.xmp.JempboxExtractor;
@@ -66,21 +69,26 @@ class PDMetadataExtractor {
xmp = new XMPMetadata(dom);
}
XMPSchemaDublinCore dcSchema = null;
-
+ XMPSchemaBasic basic = null;
if (xmp != null) {
try {
dcSchema = xmp.getDublinCoreSchema();
} catch (IOException e) {
}
-
+ try {
+ basic = xmp.getBasicSchema();
+ } catch (IOException e) {
+ //swallow
+ }
JempboxExtractor.extractXMPMM(xmp, metadata);
}
-
extractMultilingualItems(metadata, TikaCoreProperties.DESCRIPTION, null, dcSchema);
extractDublinCoreListItems(metadata, TikaCoreProperties.CONTRIBUTOR, dcSchema);
extractDublinCoreListItems(metadata, TikaCoreProperties.CREATOR, dcSchema);
extractMultilingualItems(metadata, TikaCoreProperties.TITLE, null, dcSchema);
+ extractBasic(basic, metadata);
+
try {
if (xmp != null) {
xmp.addXMLNSMapping(XMPSchemaPDFAId.NAMESPACE, XMPSchemaPDFAId.class);
@@ -104,6 +112,61 @@ class PDMetadataExtractor {
}
}
+ private static void extractBasic(XMPSchemaBasic basic, Metadata metadata) {
+ if (basic == null) {
+ return;
+ }
+ //add the elements from the basic schema if they haven't already
+ //been extracted from dublin core
+ setNotNull(XMP.CREATOR_TOOL, basic.getCreatorTool(), metadata);
+ setNotNull(XMP.LABEL, basic.getLabel(), metadata);
+ try {
+ setNotNull(XMP.CREATE_DATE, basic.getCreateDate(), metadata);
+ } catch (IOException e) {
+ }
+ try {
+ setNotNull(XMP.MODIFY_DATE, basic.getModifyDate(), metadata);
+ } catch (IOException e) {
+ }
+ try {
+ setNotNull(XMP.METADATA_DATE, basic.getMetadataDate(), metadata);
+ } catch (IOException e) {
+ }
+
+ List<String> identifiers = basic.getIdentifiers();
+ if (identifiers != null) {
+ for (String identifier : identifiers) {
+ metadata.add(XMP.IDENTIFIER, identifier);
+ }
+ }
+ List<String> advisories = basic.getAdvisories();
+ if (advisories != null) {
+ for (String advisory : advisories) {
+ metadata.add(XMP.ADVISORY, advisory);
+ }
+ }
+ setNotNull(XMP.NICKNAME, basic.getNickname(), metadata);
+ setNotNull(XMP.RATING, basic.getRating(), metadata);
+ }
+
+ private static void setNotNull(Property property, String value, Metadata metadata) {
+ if (metadata.get(property) == null && ! StringUtils.isEmpty(value)) {
+ metadata.set(property, value);
+ }
+ }
+
+ private static void setNotNull(Property property, Calendar value, Metadata metadata) {
+ if (metadata.get(property) == null && value != null) {
+ metadata.set(property, value);
+ }
+ }
+
+ private static void setNotNull(Property property, Integer value, Metadata metadata) {
+ if (metadata.get(property) == null && value != null) {
+ metadata.set(property, value);
+ }
+ }
+
/**
* As of this writing, XMPSchema can contain bags or sequence lists
* for some attributes...despite standards documentation.
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index 4e2e3c5..7547208 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -26,6 +26,8 @@ import static org.junit.Assert.fail;
import static org.junit.Assume.assumeTrue;
import java.io.InputStream;
+import java.nio.file.Paths;
+import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
@@ -54,6 +56,7 @@ import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.OfficeOpenXMLCore;
import org.apache.tika.metadata.PDF;
import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.metadata.XMP;
import org.apache.tika.metadata.XMPMM;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AutoDetectParser;
@@ -1579,6 +1582,15 @@ public class PDFParserTest extends TikaTest {
}
+ @Test
+ public void testXMPBasicSchema() throws Exception {
+ //TIKA-3101
+ List<Metadata> metadataList = getRecursiveMetadata("testPDF_XMPBasicSchema.pdf");
+ Metadata m = metadataList.get(0);
+ //these two fields derive from the basic schema in the XMP, not dublin core
+ assertEquals("Hewlett-Packard MFP", m.get(XMP.CREATOR_TOOL));
+ assertEquals("1998-08-29T13:53:15Z", m.get(XMP.CREATE_DATE));
+ }
/**
* Simple class to count end of document events. If functionality is useful,
* move to org.apache.tika in src/test
@@ -1607,4 +1619,6 @@ public class PDFParserTest extends TikaTest {
return true;
}
}
+
+
}
diff --git a/tika-parsers/src/test/resources/test-documents/testPDF_XMPBasicSchema.pdf b/tika-parsers/src/test/resources/test-documents/testPDF_XMPBasicSchema.pdf
new file mode 100644
index 0000000..69c912e
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testPDF_XMPBasicSchema.pdf differ