You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/03/07 16:14:10 UTC
tika git commit: TIKA-1894: Add XMPMM support to PDFParser and
JpegParser via Jempbox
Repository: tika
Updated Branches:
refs/heads/master 74e71ebd8 -> c5d4ec6c5
TIKA-1894: Add XMPMM support to PDFParser and JpegParser via Jempbox
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/c5d4ec6c
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/c5d4ec6c
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/c5d4ec6c
Branch: refs/heads/master
Commit: c5d4ec6c50824a9a40fdd2b492bf7557d8d693f3
Parents: 74e71eb
Author: tballison <ta...@mitre.org>
Authored: Mon Mar 7 10:12:55 2016 -0500
Committer: tballison <ta...@mitre.org>
Committed: Mon Mar 7 10:12:55 2016 -0500
----------------------------------------------------------------------
CHANGES.txt | 4 +-
.../java/org/apache/tika/metadata/XMPMM.java | 44 ++++++
.../tika/parser/image/xmp/JempboxExtractor.java | 138 +++++++++++++++----
.../org/apache/tika/parser/pdf/PDFParser.java | 17 ++-
.../apache/tika/parser/jpeg/JpegParserTest.java | 16 +++
.../apache/tika/parser/pdf/PDFParserTest.java | 57 ++++++++
6 files changed, 246 insertions(+), 30 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/c5d4ec6c/CHANGES.txt
----------------------------------------------------------------------
diff --git a/CHANGES.txt b/CHANGES.txt
index a451feb..91bc623 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,7 @@
Release 1.13 - ???
+ * Add XMPMM support to PDFParser and JpegParser via Jempbox (TIKA-1894).
+
* Move serialization of TikaConfig to tika-core and enable dumping
of the config file via tika-app (TIKA-1657).
@@ -17,7 +19,7 @@ Release 1.13 - ???
* Upgrade to PDFBox 1.8.11 (TIKA-1830).
* Upgrade to Jackson 2.7.1 (TIKA-1869).
-i
+
* Upgrade to Apache SIS 0.6 (TIKA-1878).
* RichTextContentHandler moved from the Server package to Core (TIKA-1870).
http://git-wip-us.apache.org/repos/asf/tika/blob/c5d4ec6c/tika-core/src/main/java/org/apache/tika/metadata/XMPMM.java
----------------------------------------------------------------------
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/XMPMM.java b/tika-core/src/main/java/org/apache/tika/metadata/XMPMM.java
index 3fc4dfa..1a5ef6d 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/XMPMM.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/XMPMM.java
@@ -72,4 +72,48 @@ public interface XMPMM {
Property RENDITION_PARAMS = Property.externalText(
PREFIX_ + "RenditionParams");
+ /**
+ * Instance ids of the events in the XMPMM history section; declared as a text bag.
+ */
+ Property HISTORY_EVENT_INSTANCEID = Property.externalTextBag(
+ PREFIX_+"History:InstanceID"
+ );
+
+ /**
+ * Actions of the events in the XMPMM history section; declared as a text bag.
+ */
+ Property HISTORY_ACTION = Property.externalTextBag(
+ PREFIX_+"History:Action"
+ );
+ /**
+ * When each action in the XMPMM history section occurred; declared as a text bag.
+ */
+ Property HISTORY_WHEN = Property.externalTextBag(
+ PREFIX_+"History:When"
+ );
+
+ /**
+ * Software agents that created the actions in the XMPMM history
+ * section; declared as a text bag.
+ */
+ Property HISTORY_SOFTWARE_AGENT = Property.externalTextBag(
+ PREFIX_+"History:SoftwareAgent"
+ );
+
+ /**
+ * Document id of the document that this document was derived
+ * from; declared as external text (not a bag).
+ */
+ Property DERIVED_FROM_DOCUMENTID = Property.externalText(
+ PREFIX_+"DerivedFrom:DocumentID"
+ );
+
+ /**
+ * Instance id of the document instance that this document was
+ * derived from; declared as external text (not a bag).
+ */
+ Property DERIVED_FROM_INSTANCEID = Property.externalText(
+ PREFIX_+"DerivedFrom:InstanceID"
+ );
+
}
http://git-wip-us.apache.org/repos/asf/tika/blob/c5d4ec6c/tika-parsers/src/main/java/org/apache/tika/parser/image/xmp/JempboxExtractor.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/image/xmp/JempboxExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/image/xmp/JempboxExtractor.java
index 10692b8..cd18907 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/image/xmp/JempboxExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/image/xmp/JempboxExtractor.java
@@ -16,23 +16,30 @@
*/
package org.apache.tika.parser.image.xmp;
+import static java.nio.charset.StandardCharsets.UTF_8;
+
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
+import java.util.Calendar;
import java.util.List;
+import org.apache.jempbox.xmp.ResourceEvent;
+import org.apache.jempbox.xmp.ResourceRef;
import org.apache.jempbox.xmp.XMPMetadata;
import org.apache.jempbox.xmp.XMPSchemaDublinCore;
+import org.apache.jempbox.xmp.XMPSchemaMediaManagement;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.metadata.XMPMM;
+import org.apache.tika.utils.DateUtils;
import org.xml.sax.InputSource;
-import static java.nio.charset.StandardCharsets.UTF_8;
-
public class JempboxExtractor {
// The XMP spec says it must be unicode, but for most file formats it specifies "must be encoded in UTF-8"
@@ -53,32 +60,41 @@ public class JempboxExtractor {
Reader decoded = new InputStreamReader(
new ByteArrayInputStream(xmpraw.toByteArray()),
DEFAULT_XMP_CHARSET);
+ XMPMetadata xmp = null;
try {
- XMPMetadata xmp = XMPMetadata.load(new InputSource(decoded));
- XMPSchemaDublinCore dc = xmp.getDublinCoreSchema();
- if (dc != null) {
- if (dc.getTitle() != null) {
- metadata.set(TikaCoreProperties.TITLE, dc.getTitle());
- }
- if (dc.getDescription() != null) {
- metadata.set(TikaCoreProperties.DESCRIPTION, dc.getDescription());
- }
- if (dc.getCreators() != null && dc.getCreators().size() > 0) {
- metadata.set(TikaCoreProperties.CREATOR, joinCreators(dc.getCreators()));
- }
- if (dc.getSubjects() != null && dc.getSubjects().size() > 0) {
- for (String keyword : dc.getSubjects()) {
- metadata.add(TikaCoreProperties.KEYWORDS, keyword);
- }
- // TODO should we set KEYWORDS too?
- // All tested photo managers set the same in Iptc.Application2.Keywords and Xmp.dc.subject
+ xmp = XMPMetadata.load(new InputSource(decoded));
+ } catch (IOException e) {
+ //
+ }
+
+ if (xmp == null) {
+ return;
+ }
+ XMPSchemaDublinCore dc = null;
+ try {
+ dc = xmp.getDublinCoreSchema();
+ } catch (IOException e) {
+ }
+
+ if (dc != null) {
+ if (dc.getTitle() != null) {
+ metadata.set(TikaCoreProperties.TITLE, dc.getTitle());
+ }
+ if (dc.getDescription() != null) {
+ metadata.set(TikaCoreProperties.DESCRIPTION, dc.getDescription());
+ }
+ if (dc.getCreators() != null && dc.getCreators().size() > 0) {
+ metadata.set(TikaCoreProperties.CREATOR, joinCreators(dc.getCreators()));
+ }
+ if (dc.getSubjects() != null && dc.getSubjects().size() > 0) {
+ for (String keyword : dc.getSubjects()) {
+ metadata.add(TikaCoreProperties.KEYWORDS, keyword);
}
+ // TODO should we set KEYWORDS too?
+ // All tested photo managers set the same in Iptc.Application2.Keywords and Xmp.dc.subject
}
- } catch (IOException e) {
- // Could not parse embedded XMP metadata. That's not a serious
- // problem, so we'll just ignore the issue for now.
- // TODO: Make error handling like this configurable.
}
+ extractXMPMM(xmp, metadata);
}
protected String joinCreators(List<String> creators) {
@@ -94,4 +110,78 @@ public class JempboxExtractor {
}
return c.substring(2);
}
+
+ /**
+ * Copies XMP Media Management (xmpMM) values into {@code metadata}: the document id, the
+ * DerivedFrom document/instance ids and one entry per History event that has an instance id.
+ * Best effort: IOException/NullPointerException raised by the schema getters are swallowed.
+ * @param xmp parsed XMP to read from; it is dereferenced without a null check
+ * @param metadata target that the extracted values are added to
+ */
+ public static void extractXMPMM(XMPMetadata xmp, Metadata metadata) {
+ XMPSchemaMediaManagement mmSchema = null;
+ try {
+ mmSchema = xmp.getMediaManagementSchema();
+ } catch (IOException e) {
+ //the schema could not be read: nothing to extract, so give up quietly
+ return;
+ }
+ if (mmSchema != null) {
+ addMetadata(metadata, XMPMM.DOCUMENTID, mmSchema.getDocumentID());
+ //InstanceID is not currently supported by JempBox, so this stays disabled:
+// metadata.set(XMPMM.INSTANCEID, mmSchema.getInstanceID());
+
+ ResourceRef derivedFrom = mmSchema.getDerivedFrom();
+ if (derivedFrom != null) {
+ try {
+ addMetadata(metadata, XMPMM.DERIVED_FROM_DOCUMENTID, derivedFrom.getDocumentID());
+ } catch (NullPointerException e) {} //swallowed so that the remaining fields are still attempted
+
+ try {
+ addMetadata(metadata, XMPMM.DERIVED_FROM_INSTANCEID, derivedFrom.getInstanceID());
+ } catch (NullPointerException e) {} //swallowed: extraction is best effort
+
+ //TODO: extract OriginalDocumentID from the DerivedFrom section; the original
+ //note says "not yet supported by XMPBox" (presumably meaning JempBox -- confirm)
+ }
+ if (mmSchema.getHistory() != null) {
+ for (ResourceEvent stevt : mmSchema.getHistory()) {
+ String instanceId = null;
+ String action = null;
+ Calendar when = null;
+ String softwareAgent = null;
+ try {
+ instanceId = stevt.getInstanceID();
+ action = stevt.getAction();
+ when = stevt.getWhen();
+ softwareAgent = stevt.getSoftwareAgent();
+
+ //getInstanceID can throw NPE and getWhen can throw IOException; a throw skips the getters after it, leaving those locals null
+ } catch (NullPointerException|IOException e) {
+ //swallow: whatever was read before the failure is still used below
+ }
+ if (instanceId != null && instanceId.trim().length() > 0) {
+ //events without a usable instance id are skipped; for the others, absent
+ //elements are added as empty strings so that the four parallel history
+ //properties keep matching offsets
+
+ action = (action == null) ? "" : action;
+ String dateString = (when == null) ? "" : DateUtils.formatDate(when);
+ softwareAgent = (softwareAgent == null) ? "" : softwareAgent;
+
+ metadata.add(XMPMM.HISTORY_EVENT_INSTANCEID, instanceId);
+ metadata.add(XMPMM.HISTORY_ACTION, action);
+ metadata.add(XMPMM.HISTORY_WHEN, dateString);
+ metadata.add(XMPMM.HISTORY_SOFTWARE_AGENT, softwareAgent);
+ }
+ }
+ }
+ }
+ }
+
+ private static void addMetadata(Metadata m, Property p, String value) {
+ if (value != null) { //null values are skipped; only non-null values are added to m under p
+ m.add(p, value);
+ }
+ }
}
http://git-wip-us.apache.org/repos/asf/tika/blob/c5d4ec6c/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
index 29ebddf..8cb1b98 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
@@ -30,6 +30,7 @@ import java.util.Set;
import org.apache.commons.io.input.CloseShieldInputStream;
import org.apache.jempbox.xmp.XMPSchema;
import org.apache.jempbox.xmp.XMPSchemaDublinCore;
+import org.apache.jempbox.xmp.XMPSchemaMediaManagement;
import org.apache.jempbox.xmp.pdfa.XMPSchemaPDFAId;
import org.apache.pdfbox.cos.COSArray;
import org.apache.pdfbox.cos.COSBase;
@@ -58,6 +59,7 @@ import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.PasswordProvider;
+import org.apache.tika.parser.image.xmp.JempboxExtractor;
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
@@ -223,19 +225,24 @@ public class PDFParser extends AbstractParser {
Boolean.toString(ap.canPrintDegraded()));
- //now go for the XMP stuff
+ //now go for the XMP
org.apache.jempbox.xmp.XMPMetadata xmp = null;
XMPSchemaDublinCore dcSchema = null;
+ XMPSchemaMediaManagement mmSchema = null;
try {
if (document.getDocumentCatalog().getMetadata() != null) {
xmp = document.getDocumentCatalog().getMetadata().exportXMPMetadata();
}
- if (xmp != null) {
+ } catch (IOException e) {}
+
+ if (xmp != null) {
+ try {
dcSchema = xmp.getDublinCoreSchema();
- }
- } catch (IOException e) {
- //swallow
+ } catch (IOException e) {}
+
+ JempboxExtractor.extractXMPMM(xmp, metadata);
}
+
PDDocumentInformation info = document.getDocumentInformation();
metadata.set(PagedText.N_PAGES, document.getNumberOfPages());
extractMultilingualItems(metadata, TikaCoreProperties.TITLE, info.getTitle(), dcSchema);
http://git-wip-us.apache.org/repos/asf/tika/blob/c5d4ec6c/tika-parsers/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java
index fd7ee29..6c90680 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java
@@ -27,6 +27,7 @@ import java.util.List;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TIFF;
import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.metadata.XMPMM;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.junit.Test;
@@ -247,4 +248,19 @@ public class JpegParserTest {
assertEquals("300.0", metadata.get(TIFF.RESOLUTION_HORIZONTAL));
assertEquals("300.0", metadata.get(TIFF.RESOLUTION_VERTICAL));
}
+
+ @Test
+ public void testJPEGXMPMM() throws Exception {
+ Metadata metadata = new Metadata();
+ metadata.set(Metadata.CONTENT_TYPE, "image/jpeg");
+ InputStream stream =
+ getClass().getResourceAsStream("/test-documents/testJPEG_EXIF_emptyDateTime.jpg");
+ parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
+
+ //TODO: once Jempbox is fixed (or XMPBox is used instead), add assertions for the
+ //history section; it is currently not extracted, so only the document id is checked
+ assertEquals("xmp.did:49E997348D4911E1AB62EBF9B374B234",
+ metadata.get(XMPMM.DOCUMENTID));
+ }
+
}
http://git-wip-us.apache.org/repos/asf/tika/blob/c5d4ec6c/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index 04d9f2b..47f3e0a 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -16,6 +16,7 @@
*/
package org.apache.tika.parser.pdf;
+import static org.junit.Assert.assertArrayEquals;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertNull;
@@ -31,6 +32,7 @@ import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
+
import org.apache.commons.io.IOUtils;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
@@ -45,6 +47,7 @@ import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.OfficeOpenXMLCore;
import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.metadata.XMPMM;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
@@ -1358,6 +1361,60 @@ public class PDFParserTest extends TikaTest {
assertNotContained("Mount Rushmore National Memorial", xml);
}
+ @Test
+ public void testXMPMM() throws Exception {
+ //first file: only the XMPMM document id is asserted
+ Metadata m = getXML("testPDF_twoAuthors.pdf").metadata;
+ assertEquals("uuid:0e46913c-72b9-40c0-8232-69e362abcd1e",
+ m.get(XMPMM.DOCUMENTID));
+
+ m = getXML("testPDF_Version.11.x.PDFA-1b.pdf").metadata;
+ assertEquals("uuid:cccee1fc-51b3-4b52-ac86-672af3974d25",
+ m.get(XMPMM.DOCUMENTID));
+
+ //second file: its history section should yield 7 events, so each of the four
+ //parallel arrays (instance id, action, software agent, when) must hold 7 values
+ assertArrayEquals(new String[]{
+ "uuid:0313504b-a0b0-4dac-a9f0-357221f2eadf",
+ "uuid:edc4279e-0d5f-465e-b13e-1298402fd11c",
+ "uuid:f565b775-43f3-4a9a-8541-e98c4115db6d",
+ "uuid:9fd5e0a8-14a5-4920-ad7f-870c0b8ee65f",
+ "uuid:09b6cfba-efde-4e07-a77f-70de858cc0aa",
+ "uuid:1e4ffbd7-dabc-4aae-801c-15b3404ade36",
+ "uuid:c1669773-a6ca-4bdd-aade-519030d0af00"
+ }, m.getValues(XMPMM.HISTORY_EVENT_INSTANCEID));
+
+ assertArrayEquals(new String[]{
+ "converted",
+ "converted",
+ "converted",
+ "converted",
+ "converted",
+ "converted",
+ "converted"
+ }, m.getValues(XMPMM.HISTORY_ACTION));
+
+ assertArrayEquals(new String[]{
+ "Preflight",
+ "Preflight",
+ "Preflight",
+ "Preflight",
+ "Preflight",
+ "Preflight",
+ "Preflight"
+ }, m.getValues(XMPMM.HISTORY_SOFTWARE_AGENT));
+
+ assertArrayEquals(new String[]{
+ "2014-03-04T23:50:41Z",
+ "2014-03-04T23:50:42Z",
+ "2014-03-04T23:51:34Z",
+ "2014-03-04T23:51:36Z",
+ "2014-03-04T23:51:37Z",
+ "2014-03-04T23:52:22Z",
+ "2014-03-04T23:54:48Z"
+ }, m.getValues(XMPMM.HISTORY_WHEN));
+ }
+
private void assertException(String path, Parser parser, ParseContext context, Class expected) {
boolean noEx = false;
InputStream is = getResourceAsStream(path);