You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/06/08 15:45:40 UTC

tika git commit: TIKA-1999 add limit to number of events extracted from the XMPMM section by the JempboxExtractor

Repository: tika
Updated Branches:
  refs/heads/master 1af1078ad -> 3e1450538


TIKA-1999 add limit to number of events extracted from the XMPMM section by the JempboxExtractor


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/3e145053
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/3e145053
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/3e145053

Branch: refs/heads/master
Commit: 3e14505381eefa603adabe61171c0c19fc685b2f
Parents: 1af1078
Author: tballison <ta...@mitre.org>
Authored: Wed Jun 8 11:45:30 2016 -0400
Committer: tballison <ta...@mitre.org>
Committed: Wed Jun 8 11:45:30 2016 -0400

----------------------------------------------------------------------
 .../tika/parser/image/xmp/JempboxExtractor.java |  31 ++++
 .../parser/image/xmp/JempboxExtractorTest.java  |  29 ++-
 .../test/resources/test-documents/testXMP.xmp   | 178 +++++++++++++++++++
 3 files changed, 237 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/3e145053/tika-parsers/src/main/java/org/apache/tika/parser/image/xmp/JempboxExtractor.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/image/xmp/JempboxExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/image/xmp/JempboxExtractor.java
index 0f326a8..d9ae71d 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/image/xmp/JempboxExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/image/xmp/JempboxExtractor.java
@@ -42,15 +42,21 @@ import org.xml.sax.SAXException;
 
 public class JempboxExtractor {
 
+
+    private static int MAX_EVENT_HISTORY_IN_XMPMM = 1024;
+
     // The XMP spec says it must be unicode, but for most file formats it specifies "must be encoded in UTF-8"
     private static final String DEFAULT_XMP_CHARSET = UTF_8.name();
+
     private XMPPacketScanner scanner = new XMPPacketScanner();
     private Metadata metadata;
+    private static int maxXMPMMHistory;
 
     public JempboxExtractor(Metadata metadata) {
         this.metadata = metadata;
     }
 
+
     public void parse(InputStream file) throws IOException, TikaException {
         ByteArrayOutputStream xmpraw = new ByteArrayOutputStream();
         if (!scanner.parse(file, xmpraw)) {
@@ -160,7 +166,11 @@ public class JempboxExtractor {
                 //in DerivedFrom section
             }
             if (mmSchema.getHistory() != null) {
+                int eventsAdded = 0;
                 for (ResourceEvent stevt : mmSchema.getHistory()) {
+                    if (eventsAdded >= MAX_EVENT_HISTORY_IN_XMPMM) {
+                        break;
+                    }
                     String instanceId = null;
                     String action = null;
                     Calendar when = null;
@@ -188,6 +198,7 @@ public class JempboxExtractor {
                         metadata.add(XMPMM.HISTORY_ACTION, action);
                         metadata.add(XMPMM.HISTORY_WHEN, dateString);
                         metadata.add(XMPMM.HISTORY_SOFTWARE_AGENT, softwareAgent);
+                        eventsAdded++;
                     }
                 }
             }
@@ -199,4 +210,24 @@ public class JempboxExtractor {
             m.add(p, value);
         }
     }
+
+    /**
+     * Sets the maximum number of events to extract from the event
+     * history in the XMP Media Management (XMPMM) section. The extractor
+     * will silently stop adding events after it has reached this threshold.
+     * The limit is held in a static field, so it is shared by all instances.
+     * The default is 1024.
+     * @param maxEvents maximum number of history events to extract
+     */
+    public static void setMaxXMPMMHistory(int maxEvents) {
+        MAX_EVENT_HISTORY_IN_XMPMM = maxEvents;
+    }
+
+    /**
+     * Returns the limit that the extractor currently enforces.
+     * @return maximum number of events to extract from the XMPMM history.
+     */
+    public static int getMaxXMPMMHistory() {
+        return MAX_EVENT_HISTORY_IN_XMPMM;
+    }
 }

http://git-wip-us.apache.org/repos/asf/tika/blob/3e145053/tika-parsers/src/test/java/org/apache/tika/parser/image/xmp/JempboxExtractorTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/image/xmp/JempboxExtractorTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/image/xmp/JempboxExtractorTest.java
index 4718539..cdbf5eb 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/image/xmp/JempboxExtractorTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/image/xmp/JempboxExtractorTest.java
@@ -19,17 +19,24 @@ package org.apache.tika.parser.image.xmp;
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertTrue;
 
+import java.io.ByteArrayOutputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.util.Arrays;
 import java.util.Collection;
 
+import org.apache.tika.TikaTest;
+import org.apache.tika.config.TikaConfig;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.metadata.XMPMM;
+import org.apache.tika.parser.ParseContext;
 import org.junit.Test;
 
-public class JempboxExtractorTest {
+import javax.xml.parsers.DocumentBuilder;
+
+public class JempboxExtractorTest extends TikaTest {
 
     @Test
     public void testParseJpeg() throws IOException, TikaException {
@@ -104,4 +111,24 @@ public class JempboxExtractorTest {
                 Arrays.asList("Mr B", "Mr A")));
     }
 
+    @Test
+    public void testMaxXMPMMHistory() throws Exception {
+        int maxHistory = JempboxExtractor.getMaxXMPMMHistory();
+        try {
+            Metadata m = new Metadata();
+            JempboxExtractor ex = new JempboxExtractor(m);
+            ex.parse(getResourceAsStream("/test-documents/testXMP.xmp"));
+            assertEquals(7, m.getValues(XMPMM.HISTORY_EVENT_INSTANCEID).length);
+
+            JempboxExtractor.setMaxXMPMMHistory(5);
+            m = new Metadata();
+            ex = new JempboxExtractor(m);
+            ex.parse(getResourceAsStream("/test-documents/testXMP.xmp"));
+            assertEquals(5, m.getValues(XMPMM.HISTORY_EVENT_INSTANCEID).length);
+        } finally {
+            // the limit is static: restore the value captured above even if an assertion fails
+            JempboxExtractor.setMaxXMPMMHistory(maxHistory);
+        }
+    }
+
 }

http://git-wip-us.apache.org/repos/asf/tika/blob/3e145053/tika-parsers/src/test/resources/test-documents/testXMP.xmp
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/resources/test-documents/testXMP.xmp b/tika-parsers/src/test/resources/test-documents/testXMP.xmp
new file mode 100644
index 0000000..00fe0f9
--- /dev/null
+++ b/tika-parsers/src/test/resources/test-documents/testXMP.xmp
@@ -0,0 +1,178 @@
+<?xpacket begin="\ufeff" id="W5M0MpCehiHzreSzNTczkc9d"?>
+<x:xmpmeta xmlns:x="adobe:ns:meta/" x:xmptk="Adobe XMP Core 5.4-c005 78.147326, 2012/08/23-13:03:03        ">
+   <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
+      <rdf:Description rdf:about=""
+            xmlns:xmp="http://ns.adobe.com/xap/1.0/"
+            xmlns:dc="http://purl.org/dc/elements/1.1/"
+            xmlns:xmpMM="http://ns.adobe.com/xap/1.0/mm/"
+            xmlns:stEvt="http://ns.adobe.com/xap/1.0/sType/ResourceEvent#"
+            xmlns:pdf="http://ns.adobe.com/pdf/1.3/"
+            xmlns:pdfaid="http://www.aiim.org/pdfa/ns/id/"
+            xmlns:pdfaExtension="http://www.aiim.org/pdfa/ns/extension/"
+            xmlns:pdfaSchema="http://www.aiim.org/pdfa/ns/schema#"
+            xmlns:pdfaProperty="http://www.aiim.org/pdfa/ns/property#">
+         <xmp:CreateDate>2014-03-04T21:56:45+01:00</xmp:CreateDate>
+         <xmp:CreatorTool>Adobe Acrobat 10.0</xmp:CreatorTool>
+         <xmp:ModifyDate>2014-03-04T23:54:48+01:00</xmp:ModifyDate>
+         <xmp:MetadataDate>2014-03-04T23:54:48+01:00</xmp:MetadataDate>
+         <dc:format>application/pdf</dc:format>
+         <dc:title>
+            <rdf:Alt>
+               <rdf:li xml:lang="x-default">Sample Acrobat 4.x (PDF Version 1.3)</rdf:li>
+            </rdf:Alt>
+         </dc:title>
+         <dc:creator>
+            <rdf:Bag/>
+         </dc:creator>
+         <xmpMM:DocumentID>uuid:cccee1fc-51b3-4b52-ac86-672af3974d25</xmpMM:DocumentID>
+         <xmpMM:InstanceID>uuid:afa71b09-7cc5-48ac-8664-ac6dcf8b5ab4</xmpMM:InstanceID>
+         <xmpMM:RenditionClass>default</xmpMM:RenditionClass>
+         <xmpMM:VersionID>1</xmpMM:VersionID>
+         <xmpMM:History>
+            <rdf:Seq>
+               <rdf:li rdf:parseType="Resource">
+                  <stEvt:action>converted</stEvt:action>
+                  <stEvt:instanceID>uuid:0313504b-a0b0-4dac-a9f0-357221f2eadf</stEvt:instanceID>
+                  <stEvt:parameters>converted to PDF/A-1a</stEvt:parameters>
+                  <stEvt:softwareAgent>Preflight</stEvt:softwareAgent>
+                  <stEvt:when>2014-03-04T23:50:41+01:00</stEvt:when>
+               </rdf:li>
+               <rdf:li rdf:parseType="Resource">
+                  <stEvt:action>converted</stEvt:action>
+                  <stEvt:instanceID>uuid:edc4279e-0d5f-465e-b13e-1298402fd11c</stEvt:instanceID>
+                  <stEvt:parameters>PDF/A conversion failed; Version and conformance level identification removed</stEvt:parameters>
+                  <stEvt:softwareAgent>Preflight</stEvt:softwareAgent>
+                  <stEvt:when>2014-03-04T23:50:42+01:00</stEvt:when>
+               </rdf:li>
+               <rdf:li rdf:parseType="Resource">
+                  <stEvt:action>converted</stEvt:action>
+                  <stEvt:instanceID>uuid:f565b775-43f3-4a9a-8541-e98c4115db6d</stEvt:instanceID>
+                  <stEvt:parameters>converted to PDF/A-1a</stEvt:parameters>
+                  <stEvt:softwareAgent>Preflight</stEvt:softwareAgent>
+                  <stEvt:when>2014-03-04T23:51:34+01:00</stEvt:when>
+               </rdf:li>
+               <rdf:li rdf:parseType="Resource">
+                  <stEvt:action>converted</stEvt:action>
+                  <stEvt:instanceID>uuid:9fd5e0a8-14a5-4920-ad7f-870c0b8ee65f</stEvt:instanceID>
+                  <stEvt:parameters>converted to PDF/A-1a</stEvt:parameters>
+                  <stEvt:softwareAgent>Preflight</stEvt:softwareAgent>
+                  <stEvt:when>2014-03-04T23:51:36+01:00</stEvt:when>
+               </rdf:li>
+               <rdf:li rdf:parseType="Resource">
+                  <stEvt:action>converted</stEvt:action>
+                  <stEvt:instanceID>uuid:09b6cfba-efde-4e07-a77f-70de858cc0aa</stEvt:instanceID>
+                  <stEvt:parameters>PDF/A conversion failed; Version and conformance level identification removed</stEvt:parameters>
+                  <stEvt:softwareAgent>Preflight</stEvt:softwareAgent>
+                  <stEvt:when>2014-03-04T23:51:37+01:00</stEvt:when>
+               </rdf:li>
+               <rdf:li rdf:parseType="Resource">
+                  <stEvt:action>converted</stEvt:action>
+                  <stEvt:instanceID>uuid:1e4ffbd7-dabc-4aae-801c-15b3404ade36</stEvt:instanceID>
+                  <stEvt:parameters>converted to PDF/A-1b</stEvt:parameters>
+                  <stEvt:softwareAgent>Preflight</stEvt:softwareAgent>
+                  <stEvt:when>2014-03-04T23:52:22+01:00</stEvt:when>
+               </rdf:li>
+               <rdf:li rdf:parseType="Resource">
+                  <stEvt:action>converted</stEvt:action>
+                  <stEvt:instanceID>uuid:c1669773-a6ca-4bdd-aade-519030d0af00</stEvt:instanceID>
+                  <stEvt:parameters>converted to PDF/A-1b</stEvt:parameters>
+                  <stEvt:softwareAgent>Preflight</stEvt:softwareAgent>
+                  <stEvt:when>2014-03-04T23:54:48+01:00</stEvt:when>
+               </rdf:li>
+            </rdf:Seq>
+         </xmpMM:History>
+         <pdf:Producer>Acrobat Web Capture 10.0</pdf:Producer>
+         <pdfaid:part>1</pdfaid:part>
+         <pdfaid:conformance>B</pdfaid:conformance>
+         <pdfaExtension:schemas>
+            <rdf:Bag>
+               <rdf:li rdf:parseType="Resource">
+                  <pdfaSchema:namespaceURI>http://ns.adobe.com/pdf/1.3/</pdfaSchema:namespaceURI>
+                  <pdfaSchema:prefix>pdf</pdfaSchema:prefix>
+                  <pdfaSchema:schema>Adobe PDF Schema</pdfaSchema:schema>
+                  <pdfaSchema:property>
+                     <rdf:Seq>
+                        <rdf:li rdf:parseType="Resource">
+                           <pdfaProperty:category>internal</pdfaProperty:category>
+                           <pdfaProperty:description>A name object indicating whether the document has been modified to include trapping information</pdfaProperty:description>
+                           <pdfaProperty:name>Trapped</pdfaProperty:name>
+                           <pdfaProperty:valueType>Text</pdfaProperty:valueType>
+                        </rdf:li>
+                     </rdf:Seq>
+                  </pdfaSchema:property>
+               </rdf:li>
+               <rdf:li rdf:parseType="Resource">
+                  <pdfaSchema:namespaceURI>http://ns.adobe.com/xap/1.0/mm/</pdfaSchema:namespaceURI>
+                  <pdfaSchema:prefix>xmpMM</pdfaSchema:prefix>
+                  <pdfaSchema:schema>XMP Media Management Schema</pdfaSchema:schema>
+                  <pdfaSchema:property>
+                     <rdf:Seq>
+                        <rdf:li rdf:parseType="Resource">
+                           <pdfaProperty:category>internal</pdfaProperty:category>
+                           <pdfaProperty:description>UUID based identifier for specific incarnation of a document</pdfaProperty:description>
+                           <pdfaProperty:name>InstanceID</pdfaProperty:name>
+                           <pdfaProperty:valueType>URI</pdfaProperty:valueType>
+                        </rdf:li>
+                        <rdf:li rdf:parseType="Resource">
+                           <pdfaProperty:category>internal</pdfaProperty:category>
+                           <pdfaProperty:description>The common identifier for all versions and renditions of a document.</pdfaProperty:description>
+                           <pdfaProperty:name>OriginalDocumentID</pdfaProperty:name>
+                           <pdfaProperty:valueType>URI</pdfaProperty:valueType>
+                        </rdf:li>
+                     </rdf:Seq>
+                  </pdfaSchema:property>
+               </rdf:li>
+               <rdf:li rdf:parseType="Resource">
+                  <pdfaSchema:namespaceURI>http://www.aiim.org/pdfa/ns/id/</pdfaSchema:namespaceURI>
+                  <pdfaSchema:prefix>pdfaid</pdfaSchema:prefix>
+                  <pdfaSchema:schema>PDF/A ID Schema</pdfaSchema:schema>
+                  <pdfaSchema:property>
+                     <rdf:Seq>
+                        <rdf:li rdf:parseType="Resource">
+                           <pdfaProperty:category>internal</pdfaProperty:category>
+                           <pdfaProperty:description>Part of PDF/A standard</pdfaProperty:description>
+                           <pdfaProperty:name>part</pdfaProperty:name>
+                           <pdfaProperty:valueType>Integer</pdfaProperty:valueType>
+                        </rdf:li>
+                        <rdf:li rdf:parseType="Resource">
+                           <pdfaProperty:category>internal</pdfaProperty:category>
+                           <pdfaProperty:description>Amendment of PDF/A standard</pdfaProperty:description>
+                           <pdfaProperty:name>amd</pdfaProperty:name>
+                           <pdfaProperty:valueType>Text</pdfaProperty:valueType>
+                        </rdf:li>
+                        <rdf:li rdf:parseType="Resource">
+                           <pdfaProperty:category>internal</pdfaProperty:category>
+                           <pdfaProperty:description>Conformance level of PDF/A standard</pdfaProperty:description>
+                           <pdfaProperty:name>conformance</pdfaProperty:name>
+                           <pdfaProperty:valueType>Text</pdfaProperty:valueType>
+                        </rdf:li>
+                     </rdf:Seq>
+                  </pdfaSchema:property>
+               </rdf:li>
+            </rdf:Bag>
+         </pdfaExtension:schemas>
+      </rdf:Description>
+   </rdf:RDF>
+</x:xmpmeta>
+                                                                                                    
+                                                                                                    
+                                                                                                    
+                                                                                                    
+                                                                                                    
+                                                                                                    
+                                                                                                    
+                                                                                                    
+                                                                                                    
+                                                                                                    
+                                                                                                    
+                                                                                                    
+                                                                                                    
+                                                                                                    
+                                                                                                    
+                                                                                                    
+                                                                                                    
+                                                                                                    
+                                                                                                    
+                                                                                                    
+                           
+<?xpacket end="w"?>
\ No newline at end of file