You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/06/08 15:45:40 UTC
tika git commit: TIKA-1999 add limit to number of events extracted
from the XMPMM section by the JempboxExtractor
Repository: tika
Updated Branches:
refs/heads/master 1af1078ad -> 3e1450538
TIKA-1999 add limit to number of events extracted from the XMPMM section by the JempboxExtractor
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/3e145053
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/3e145053
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/3e145053
Branch: refs/heads/master
Commit: 3e14505381eefa603adabe61171c0c19fc685b2f
Parents: 1af1078
Author: tballison <ta...@mitre.org>
Authored: Wed Jun 8 11:45:30 2016 -0400
Committer: tballison <ta...@mitre.org>
Committed: Wed Jun 8 11:45:30 2016 -0400
----------------------------------------------------------------------
.../tika/parser/image/xmp/JempboxExtractor.java | 31 ++++
.../parser/image/xmp/JempboxExtractorTest.java | 29 ++-
.../test/resources/test-documents/testXMP.xmp | 178 +++++++++++++++++++
3 files changed, 237 insertions(+), 1 deletion(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/3e145053/tika-parsers/src/main/java/org/apache/tika/parser/image/xmp/JempboxExtractor.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/image/xmp/JempboxExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/image/xmp/JempboxExtractor.java
index 0f326a8..d9ae71d 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/image/xmp/JempboxExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/image/xmp/JempboxExtractor.java
@@ -42,15 +42,21 @@ import org.xml.sax.SAXException;
public class JempboxExtractor {
+
+ private static int MAX_EVENT_HISTORY_IN_XMPMM = 1024;
+
// The XMP spec says it must be unicode, but for most file formats it specifies "must be encoded in UTF-8"
private static final String DEFAULT_XMP_CHARSET = UTF_8.name();
+
private XMPPacketScanner scanner = new XMPPacketScanner();
private Metadata metadata;
+ private static int maxXMPMMHistory;
public JempboxExtractor(Metadata metadata) {
this.metadata = metadata;
}
+
public void parse(InputStream file) throws IOException, TikaException {
ByteArrayOutputStream xmpraw = new ByteArrayOutputStream();
if (!scanner.parse(file, xmpraw)) {
@@ -160,7 +166,11 @@ public class JempboxExtractor {
//in DerivedFrom section
}
if (mmSchema.getHistory() != null) {
+ int eventsAdded = 0;
for (ResourceEvent stevt : mmSchema.getHistory()) {
+ if (eventsAdded >= MAX_EVENT_HISTORY_IN_XMPMM) {
+ break;
+ }
String instanceId = null;
String action = null;
Calendar when = null;
@@ -188,6 +198,7 @@ public class JempboxExtractor {
metadata.add(XMPMM.HISTORY_ACTION, action);
metadata.add(XMPMM.HISTORY_WHEN, dateString);
metadata.add(XMPMM.HISTORY_SOFTWARE_AGENT, softwareAgent);
+ eventsAdded++;
}
}
}
@@ -199,4 +210,24 @@ public class JempboxExtractor {
m.add(p, value);
}
}
+
+ /**
+ * Sets the maximum number of events to extract from the
+ * event history in the XMP Media Management (XMPMM) section.
+ * The extractor will silently stop adding events after it
+ * has reached this threshold. The default is 1024. The limit is
+ * held in a static field, so it is shared by every extractor.
+ * @param maxEvents maximum number of history events to extract
+ */
+ public static void setMaxXMPMMHistory(int maxEvents) {
+ MAX_EVENT_HISTORY_IN_XMPMM = maxEvents;
+ }
+
+ /**
+ * Returns the limit last set via {@link #setMaxXMPMMHistory(int)} (1024 by default).
+ * @return maximum number of events to extract from the XMPMM history.
+ */
+ public static int getMaxXMPMMHistory() {
+ return MAX_EVENT_HISTORY_IN_XMPMM;
+ }
}
http://git-wip-us.apache.org/repos/asf/tika/blob/3e145053/tika-parsers/src/test/java/org/apache/tika/parser/image/xmp/JempboxExtractorTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/image/xmp/JempboxExtractorTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/image/xmp/JempboxExtractorTest.java
index 4718539..cdbf5eb 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/image/xmp/JempboxExtractorTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/image/xmp/JempboxExtractorTest.java
@@ -19,17 +19,24 @@ package org.apache.tika.parser.image.xmp;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
+import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;
import java.util.Collection;
+import org.apache.tika.TikaTest;
+import org.apache.tika.config.TikaConfig;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.metadata.XMPMM;
+import org.apache.tika.parser.ParseContext;
import org.junit.Test;
-public class JempboxExtractorTest {
+import javax.xml.parsers.DocumentBuilder;
+
+public class JempboxExtractorTest extends TikaTest {
@Test
public void testParseJpeg() throws IOException, TikaException {
@@ -104,4 +111,24 @@ public class JempboxExtractorTest {
Arrays.asList("Mr B", "Mr A")));
}
+ @Test
+ public void testMaxXMPMMHistory() throws Exception {
+ int maxHistory = JempboxExtractor.getMaxXMPMMHistory();
+ try {
+ Metadata m = new Metadata();
+ JempboxExtractor ex = new JempboxExtractor(m);
+ ex.parse(getResourceAsStream("/test-documents/testXMP.xmp"));
+ assertEquals(7, m.getValues(XMPMM.HISTORY_EVENT_INSTANCEID).length);
+
+ JempboxExtractor.setMaxXMPMMHistory(5);
+ m = new Metadata();
+ ex = new JempboxExtractor(m);
+ ex.parse(getResourceAsStream("/test-documents/testXMP.xmp"));
+ assertEquals(5, m.getValues(XMPMM.HISTORY_EVENT_INSTANCEID).length);
+ } finally {
+ //the limit is set through a static setter, so always restore the value read at the start of this test
+ JempboxExtractor.setMaxXMPMMHistory(maxHistory);
+ }
+ }
+
}
http://git-wip-us.apache.org/repos/asf/tika/blob/3e145053/tika-parsers/src/test/resources/test-documents/testXMP.xmp
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/resources/test-documents/testXMP.xmp b/tika-parsers/src/test/resources/test-documents/testXMP.xmp
new file mode 100644
index 0000000..00fe0f9
--- /dev/null
+++ b/tika-parsers/src/test/resources/test-documents/testXMP.xmp
@@ -0,0 +1,178 @@
+<?xpacket begin="\ufeff" id="W5M0MpCehiHzreSzNTczkc9d"?>
+<x:xmpmeta xmlns:x="adobe:ns:meta/" x:xmptk="Adobe XMP Core 5.4-c005 78.147326, 2012/08/23-13:03:03 ">
+ <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
+ <rdf:Description rdf:about=""
+ xmlns:xmp="http://ns.adobe.com/xap/1.0/"
+ xmlns:dc="http://purl.org/dc/elements/1.1/"
+ xmlns:xmpMM="http://ns.adobe.com/xap/1.0/mm/"
+ xmlns:stEvt="http://ns.adobe.com/xap/1.0/sType/ResourceEvent#"
+ xmlns:pdf="http://ns.adobe.com/pdf/1.3/"
+ xmlns:pdfaid="http://www.aiim.org/pdfa/ns/id/"
+ xmlns:pdfaExtension="http://www.aiim.org/pdfa/ns/extension/"
+ xmlns:pdfaSchema="http://www.aiim.org/pdfa/ns/schema#"
+ xmlns:pdfaProperty="http://www.aiim.org/pdfa/ns/property#">
+ <xmp:CreateDate>2014-03-04T21:56:45+01:00</xmp:CreateDate>
+ <xmp:CreatorTool>Adobe Acrobat 10.0</xmp:CreatorTool>
+ <xmp:ModifyDate>2014-03-04T23:54:48+01:00</xmp:ModifyDate>
+ <xmp:MetadataDate>2014-03-04T23:54:48+01:00</xmp:MetadataDate>
+ <dc:format>application/pdf</dc:format>
+ <dc:title>
+ <rdf:Alt>
+ <rdf:li xml:lang="x-default">Sample Acrobat 4.x (PDF Version 1.3)</rdf:li>
+ </rdf:Alt>
+ </dc:title>
+ <dc:creator>
+ <rdf:Bag/>
+ </dc:creator>
+ <xmpMM:DocumentID>uuid:cccee1fc-51b3-4b52-ac86-672af3974d25</xmpMM:DocumentID>
+ <xmpMM:InstanceID>uuid:afa71b09-7cc5-48ac-8664-ac6dcf8b5ab4</xmpMM:InstanceID>
+ <xmpMM:RenditionClass>default</xmpMM:RenditionClass>
+ <xmpMM:VersionID>1</xmpMM:VersionID>
+ <xmpMM:History>
+ <rdf:Seq>
+ <rdf:li rdf:parseType="Resource">
+ <stEvt:action>converted</stEvt:action>
+ <stEvt:instanceID>uuid:0313504b-a0b0-4dac-a9f0-357221f2eadf</stEvt:instanceID>
+ <stEvt:parameters>converted to PDF/A-1a</stEvt:parameters>
+ <stEvt:softwareAgent>Preflight</stEvt:softwareAgent>
+ <stEvt:when>2014-03-04T23:50:41+01:00</stEvt:when>
+ </rdf:li>
+ <rdf:li rdf:parseType="Resource">
+ <stEvt:action>converted</stEvt:action>
+ <stEvt:instanceID>uuid:edc4279e-0d5f-465e-b13e-1298402fd11c</stEvt:instanceID>
+ <stEvt:parameters>PDF/A conversion failed; Version and conformance level identification removed</stEvt:parameters>
+ <stEvt:softwareAgent>Preflight</stEvt:softwareAgent>
+ <stEvt:when>2014-03-04T23:50:42+01:00</stEvt:when>
+ </rdf:li>
+ <rdf:li rdf:parseType="Resource">
+ <stEvt:action>converted</stEvt:action>
+ <stEvt:instanceID>uuid:f565b775-43f3-4a9a-8541-e98c4115db6d</stEvt:instanceID>
+ <stEvt:parameters>converted to PDF/A-1a</stEvt:parameters>
+ <stEvt:softwareAgent>Preflight</stEvt:softwareAgent>
+ <stEvt:when>2014-03-04T23:51:34+01:00</stEvt:when>
+ </rdf:li>
+ <rdf:li rdf:parseType="Resource">
+ <stEvt:action>converted</stEvt:action>
+ <stEvt:instanceID>uuid:9fd5e0a8-14a5-4920-ad7f-870c0b8ee65f</stEvt:instanceID>
+ <stEvt:parameters>converted to PDF/A-1a</stEvt:parameters>
+ <stEvt:softwareAgent>Preflight</stEvt:softwareAgent>
+ <stEvt:when>2014-03-04T23:51:36+01:00</stEvt:when>
+ </rdf:li>
+ <rdf:li rdf:parseType="Resource">
+ <stEvt:action>converted</stEvt:action>
+ <stEvt:instanceID>uuid:09b6cfba-efde-4e07-a77f-70de858cc0aa</stEvt:instanceID>
+ <stEvt:parameters>PDF/A conversion failed; Version and conformance level identification removed</stEvt:parameters>
+ <stEvt:softwareAgent>Preflight</stEvt:softwareAgent>
+ <stEvt:when>2014-03-04T23:51:37+01:00</stEvt:when>
+ </rdf:li>
+ <rdf:li rdf:parseType="Resource">
+ <stEvt:action>converted</stEvt:action>
+ <stEvt:instanceID>uuid:1e4ffbd7-dabc-4aae-801c-15b3404ade36</stEvt:instanceID>
+ <stEvt:parameters>converted to PDF/A-1b</stEvt:parameters>
+ <stEvt:softwareAgent>Preflight</stEvt:softwareAgent>
+ <stEvt:when>2014-03-04T23:52:22+01:00</stEvt:when>
+ </rdf:li>
+ <rdf:li rdf:parseType="Resource">
+ <stEvt:action>converted</stEvt:action>
+ <stEvt:instanceID>uuid:c1669773-a6ca-4bdd-aade-519030d0af00</stEvt:instanceID>
+ <stEvt:parameters>converted to PDF/A-1b</stEvt:parameters>
+ <stEvt:softwareAgent>Preflight</stEvt:softwareAgent>
+ <stEvt:when>2014-03-04T23:54:48+01:00</stEvt:when>
+ </rdf:li>
+ </rdf:Seq>
+ </xmpMM:History>
+ <pdf:Producer>Acrobat Web Capture 10.0</pdf:Producer>
+ <pdfaid:part>1</pdfaid:part>
+ <pdfaid:conformance>B</pdfaid:conformance>
+ <pdfaExtension:schemas>
+ <rdf:Bag>
+ <rdf:li rdf:parseType="Resource">
+ <pdfaSchema:namespaceURI>http://ns.adobe.com/pdf/1.3/</pdfaSchema:namespaceURI>
+ <pdfaSchema:prefix>pdf</pdfaSchema:prefix>
+ <pdfaSchema:schema>Adobe PDF Schema</pdfaSchema:schema>
+ <pdfaSchema:property>
+ <rdf:Seq>
+ <rdf:li rdf:parseType="Resource">
+ <pdfaProperty:category>internal</pdfaProperty:category>
+ <pdfaProperty:description>A name object indicating whether the document has been modified to include trapping information</pdfaProperty:description>
+ <pdfaProperty:name>Trapped</pdfaProperty:name>
+ <pdfaProperty:valueType>Text</pdfaProperty:valueType>
+ </rdf:li>
+ </rdf:Seq>
+ </pdfaSchema:property>
+ </rdf:li>
+ <rdf:li rdf:parseType="Resource">
+ <pdfaSchema:namespaceURI>http://ns.adobe.com/xap/1.0/mm/</pdfaSchema:namespaceURI>
+ <pdfaSchema:prefix>xmpMM</pdfaSchema:prefix>
+ <pdfaSchema:schema>XMP Media Management Schema</pdfaSchema:schema>
+ <pdfaSchema:property>
+ <rdf:Seq>
+ <rdf:li rdf:parseType="Resource">
+ <pdfaProperty:category>internal</pdfaProperty:category>
+ <pdfaProperty:description>UUID based identifier for specific incarnation of a document</pdfaProperty:description>
+ <pdfaProperty:name>InstanceID</pdfaProperty:name>
+ <pdfaProperty:valueType>URI</pdfaProperty:valueType>
+ </rdf:li>
+ <rdf:li rdf:parseType="Resource">
+ <pdfaProperty:category>internal</pdfaProperty:category>
+ <pdfaProperty:description>The common identifier for all versions and renditions of a document.</pdfaProperty:description>
+ <pdfaProperty:name>OriginalDocumentID</pdfaProperty:name>
+ <pdfaProperty:valueType>URI</pdfaProperty:valueType>
+ </rdf:li>
+ </rdf:Seq>
+ </pdfaSchema:property>
+ </rdf:li>
+ <rdf:li rdf:parseType="Resource">
+ <pdfaSchema:namespaceURI>http://www.aiim.org/pdfa/ns/id/</pdfaSchema:namespaceURI>
+ <pdfaSchema:prefix>pdfaid</pdfaSchema:prefix>
+ <pdfaSchema:schema>PDF/A ID Schema</pdfaSchema:schema>
+ <pdfaSchema:property>
+ <rdf:Seq>
+ <rdf:li rdf:parseType="Resource">
+ <pdfaProperty:category>internal</pdfaProperty:category>
+ <pdfaProperty:description>Part of PDF/A standard</pdfaProperty:description>
+ <pdfaProperty:name>part</pdfaProperty:name>
+ <pdfaProperty:valueType>Integer</pdfaProperty:valueType>
+ </rdf:li>
+ <rdf:li rdf:parseType="Resource">
+ <pdfaProperty:category>internal</pdfaProperty:category>
+ <pdfaProperty:description>Amendment of PDF/A standard</pdfaProperty:description>
+ <pdfaProperty:name>amd</pdfaProperty:name>
+ <pdfaProperty:valueType>Text</pdfaProperty:valueType>
+ </rdf:li>
+ <rdf:li rdf:parseType="Resource">
+ <pdfaProperty:category>internal</pdfaProperty:category>
+ <pdfaProperty:description>Conformance level of PDF/A standard</pdfaProperty:description>
+ <pdfaProperty:name>conformance</pdfaProperty:name>
+ <pdfaProperty:valueType>Text</pdfaProperty:valueType>
+ </rdf:li>
+ </rdf:Seq>
+ </pdfaSchema:property>
+ </rdf:li>
+ </rdf:Bag>
+ </pdfaExtension:schemas>
+ </rdf:Description>
+ </rdf:RDF>
+</x:xmpmeta>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+<?xpacket end="w"?>
\ No newline at end of file