You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/06/13 15:17:36 UTC

[1/7] tika git commit: TIKA-1999 add limit to number of events extracted from the XMPMM section by the JempboxExtractor

Repository: tika
Updated Branches:
  refs/heads/TIKA-1508 e48d19156 -> ef1f7b9ec


TIKA-1999 add limit to number of events extracted from the XMPMM section by the JempboxExtractor


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/3e145053
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/3e145053
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/3e145053

Branch: refs/heads/TIKA-1508
Commit: 3e14505381eefa603adabe61171c0c19fc685b2f
Parents: 1af1078
Author: tballison <ta...@mitre.org>
Authored: Wed Jun 8 11:45:30 2016 -0400
Committer: tballison <ta...@mitre.org>
Committed: Wed Jun 8 11:45:30 2016 -0400

----------------------------------------------------------------------
 .../tika/parser/image/xmp/JempboxExtractor.java |  31 ++++
 .../parser/image/xmp/JempboxExtractorTest.java  |  29 ++-
 .../test/resources/test-documents/testXMP.xmp   | 178 +++++++++++++++++++
 3 files changed, 237 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/3e145053/tika-parsers/src/main/java/org/apache/tika/parser/image/xmp/JempboxExtractor.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/image/xmp/JempboxExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/image/xmp/JempboxExtractor.java
index 0f326a8..d9ae71d 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/image/xmp/JempboxExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/image/xmp/JempboxExtractor.java
@@ -42,15 +42,21 @@ import org.xml.sax.SAXException;
 
 public class JempboxExtractor {
 
+
+    private static int MAX_EVENT_HISTORY_IN_XMPMM = 1024;
+
     // The XMP spec says it must be unicode, but for most file formats it specifies "must be encoded in UTF-8"
     private static final String DEFAULT_XMP_CHARSET = UTF_8.name();
+
     private XMPPacketScanner scanner = new XMPPacketScanner();
     private Metadata metadata;
+    private static int maxXMPMMHistory;
 
     public JempboxExtractor(Metadata metadata) {
         this.metadata = metadata;
     }
 
+
     public void parse(InputStream file) throws IOException, TikaException {
         ByteArrayOutputStream xmpraw = new ByteArrayOutputStream();
         if (!scanner.parse(file, xmpraw)) {
@@ -160,7 +166,11 @@ public class JempboxExtractor {
                 //in DerivedFrom section
             }
             if (mmSchema.getHistory() != null) {
+                int eventsAdded = 0;
                 for (ResourceEvent stevt : mmSchema.getHistory()) {
+                    if (eventsAdded >= MAX_EVENT_HISTORY_IN_XMPMM) {
+                        break;
+                    }
                     String instanceId = null;
                     String action = null;
                     Calendar when = null;
@@ -188,6 +198,7 @@ public class JempboxExtractor {
                         metadata.add(XMPMM.HISTORY_ACTION, action);
                         metadata.add(XMPMM.HISTORY_WHEN, dateString);
                         metadata.add(XMPMM.HISTORY_SOFTWARE_AGENT, softwareAgent);
+                        eventsAdded++;
                     }
                 }
             }
@@ -199,4 +210,24 @@ public class JempboxExtractor {
             m.add(p, value);
         }
     }
+
+    /**
+     * Maximum number of events to extract from the
+     * event history in the XMP Media Management (XMPMM) section.
+     * The extractor will silently stop adding events after it
+     * has reached this threshold.
+     * <p>
+     * The default is 1024.
+     */
+    public static void setMaxXMPMMHistory(int maxEvents) {
+        MAX_EVENT_HISTORY_IN_XMPMM = maxEvents;
+    }
+
+    /**
+     *
+     * @return maximum number of events to extract from the XMPMM history.
+     */
+    public static int getMaxXMPMMHistory() {
+        return maxXMPMMHistory;
+    }
 }

http://git-wip-us.apache.org/repos/asf/tika/blob/3e145053/tika-parsers/src/test/java/org/apache/tika/parser/image/xmp/JempboxExtractorTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/image/xmp/JempboxExtractorTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/image/xmp/JempboxExtractorTest.java
index 4718539..cdbf5eb 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/image/xmp/JempboxExtractorTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/image/xmp/JempboxExtractorTest.java
@@ -19,17 +19,24 @@ package org.apache.tika.parser.image.xmp;
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertTrue;
 
+import java.io.ByteArrayOutputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.util.Arrays;
 import java.util.Collection;
 
+import org.apache.tika.TikaTest;
+import org.apache.tika.config.TikaConfig;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.metadata.XMPMM;
+import org.apache.tika.parser.ParseContext;
 import org.junit.Test;
 
-public class JempboxExtractorTest {
+import javax.xml.parsers.DocumentBuilder;
+
+public class JempboxExtractorTest extends TikaTest {
 
     @Test
     public void testParseJpeg() throws IOException, TikaException {
@@ -104,4 +111,24 @@ public class JempboxExtractorTest {
                 Arrays.asList("Mr B", "Mr A")));
     }
 
+    @Test
+    public void testMaxXMPMMHistory() throws Exception {
+        int maxHistory = JempboxExtractor.getMaxXMPMMHistory();
+        try {
+            Metadata m = new Metadata();
+            JempboxExtractor ex = new JempboxExtractor(m);
+            ex.parse(getResourceAsStream("/test-documents/testXMP.xmp"));
+            assertEquals(7, m.getValues(XMPMM.HISTORY_EVENT_INSTANCEID).length);
+
+            JempboxExtractor.setMaxXMPMMHistory(5);
+            m = new Metadata();
+            ex = new JempboxExtractor(m);
+            ex.parse(getResourceAsStream("/test-documents/testXMP.xmp"));
+            assertEquals(5, m.getValues(XMPMM.HISTORY_EVENT_INSTANCEID).length);
+        } finally {
+            //if something goes wrong, make sure to set this back to what it was
+            JempboxExtractor.setMaxXMPMMHistory(maxHistory);
+        }
+    }
+
 }

http://git-wip-us.apache.org/repos/asf/tika/blob/3e145053/tika-parsers/src/test/resources/test-documents/testXMP.xmp
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/resources/test-documents/testXMP.xmp b/tika-parsers/src/test/resources/test-documents/testXMP.xmp
new file mode 100644
index 0000000..00fe0f9
--- /dev/null
+++ b/tika-parsers/src/test/resources/test-documents/testXMP.xmp
@@ -0,0 +1,178 @@
+<?xpacket begin="\ufeff" id="W5M0MpCehiHzreSzNTczkc9d"?>
+<x:xmpmeta xmlns:x="adobe:ns:meta/" x:xmptk="Adobe XMP Core 5.4-c005 78.147326, 2012/08/23-13:03:03        ">
+   <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
+      <rdf:Description rdf:about=""
+            xmlns:xmp="http://ns.adobe.com/xap/1.0/"
+            xmlns:dc="http://purl.org/dc/elements/1.1/"
+            xmlns:xmpMM="http://ns.adobe.com/xap/1.0/mm/"
+            xmlns:stEvt="http://ns.adobe.com/xap/1.0/sType/ResourceEvent#"
+            xmlns:pdf="http://ns.adobe.com/pdf/1.3/"
+            xmlns:pdfaid="http://www.aiim.org/pdfa/ns/id/"
+            xmlns:pdfaExtension="http://www.aiim.org/pdfa/ns/extension/"
+            xmlns:pdfaSchema="http://www.aiim.org/pdfa/ns/schema#"
+            xmlns:pdfaProperty="http://www.aiim.org/pdfa/ns/property#">
+         <xmp:CreateDate>2014-03-04T21:56:45+01:00</xmp:CreateDate>
+         <xmp:CreatorTool>Adobe Acrobat 10.0</xmp:CreatorTool>
+         <xmp:ModifyDate>2014-03-04T23:54:48+01:00</xmp:ModifyDate>
+         <xmp:MetadataDate>2014-03-04T23:54:48+01:00</xmp:MetadataDate>
+         <dc:format>application/pdf</dc:format>
+         <dc:title>
+            <rdf:Alt>
+               <rdf:li xml:lang="x-default">Sample Acrobat 4.x (PDF Version 1.3)</rdf:li>
+            </rdf:Alt>
+         </dc:title>
+         <dc:creator>
+            <rdf:Bag/>
+         </dc:creator>
+         <xmpMM:DocumentID>uuid:cccee1fc-51b3-4b52-ac86-672af3974d25</xmpMM:DocumentID>
+         <xmpMM:InstanceID>uuid:afa71b09-7cc5-48ac-8664-ac6dcf8b5ab4</xmpMM:InstanceID>
+         <xmpMM:RenditionClass>default</xmpMM:RenditionClass>
+         <xmpMM:VersionID>1</xmpMM:VersionID>
+         <xmpMM:History>
+            <rdf:Seq>
+               <rdf:li rdf:parseType="Resource">
+                  <stEvt:action>converted</stEvt:action>
+                  <stEvt:instanceID>uuid:0313504b-a0b0-4dac-a9f0-357221f2eadf</stEvt:instanceID>
+                  <stEvt:parameters>converted to PDF/A-1a</stEvt:parameters>
+                  <stEvt:softwareAgent>Preflight</stEvt:softwareAgent>
+                  <stEvt:when>2014-03-04T23:50:41+01:00</stEvt:when>
+               </rdf:li>
+               <rdf:li rdf:parseType="Resource">
+                  <stEvt:action>converted</stEvt:action>
+                  <stEvt:instanceID>uuid:edc4279e-0d5f-465e-b13e-1298402fd11c</stEvt:instanceID>
+                  <stEvt:parameters>PDF/A conversion failed; Version and conformance level identification removed</stEvt:parameters>
+                  <stEvt:softwareAgent>Preflight</stEvt:softwareAgent>
+                  <stEvt:when>2014-03-04T23:50:42+01:00</stEvt:when>
+               </rdf:li>
+               <rdf:li rdf:parseType="Resource">
+                  <stEvt:action>converted</stEvt:action>
+                  <stEvt:instanceID>uuid:f565b775-43f3-4a9a-8541-e98c4115db6d</stEvt:instanceID>
+                  <stEvt:parameters>converted to PDF/A-1a</stEvt:parameters>
+                  <stEvt:softwareAgent>Preflight</stEvt:softwareAgent>
+                  <stEvt:when>2014-03-04T23:51:34+01:00</stEvt:when>
+               </rdf:li>
+               <rdf:li rdf:parseType="Resource">
+                  <stEvt:action>converted</stEvt:action>
+                  <stEvt:instanceID>uuid:9fd5e0a8-14a5-4920-ad7f-870c0b8ee65f</stEvt:instanceID>
+                  <stEvt:parameters>converted to PDF/A-1a</stEvt:parameters>
+                  <stEvt:softwareAgent>Preflight</stEvt:softwareAgent>
+                  <stEvt:when>2014-03-04T23:51:36+01:00</stEvt:when>
+               </rdf:li>
+               <rdf:li rdf:parseType="Resource">
+                  <stEvt:action>converted</stEvt:action>
+                  <stEvt:instanceID>uuid:09b6cfba-efde-4e07-a77f-70de858cc0aa</stEvt:instanceID>
+                  <stEvt:parameters>PDF/A conversion failed; Version and conformance level identification removed</stEvt:parameters>
+                  <stEvt:softwareAgent>Preflight</stEvt:softwareAgent>
+                  <stEvt:when>2014-03-04T23:51:37+01:00</stEvt:when>
+               </rdf:li>
+               <rdf:li rdf:parseType="Resource">
+                  <stEvt:action>converted</stEvt:action>
+                  <stEvt:instanceID>uuid:1e4ffbd7-dabc-4aae-801c-15b3404ade36</stEvt:instanceID>
+                  <stEvt:parameters>converted to PDF/A-1b</stEvt:parameters>
+                  <stEvt:softwareAgent>Preflight</stEvt:softwareAgent>
+                  <stEvt:when>2014-03-04T23:52:22+01:00</stEvt:when>
+               </rdf:li>
+               <rdf:li rdf:parseType="Resource">
+                  <stEvt:action>converted</stEvt:action>
+                  <stEvt:instanceID>uuid:c1669773-a6ca-4bdd-aade-519030d0af00</stEvt:instanceID>
+                  <stEvt:parameters>converted to PDF/A-1b</stEvt:parameters>
+                  <stEvt:softwareAgent>Preflight</stEvt:softwareAgent>
+                  <stEvt:when>2014-03-04T23:54:48+01:00</stEvt:when>
+               </rdf:li>
+            </rdf:Seq>
+         </xmpMM:History>
+         <pdf:Producer>Acrobat Web Capture 10.0</pdf:Producer>
+         <pdfaid:part>1</pdfaid:part>
+         <pdfaid:conformance>B</pdfaid:conformance>
+         <pdfaExtension:schemas>
+            <rdf:Bag>
+               <rdf:li rdf:parseType="Resource">
+                  <pdfaSchema:namespaceURI>http://ns.adobe.com/pdf/1.3/</pdfaSchema:namespaceURI>
+                  <pdfaSchema:prefix>pdf</pdfaSchema:prefix>
+                  <pdfaSchema:schema>Adobe PDF Schema</pdfaSchema:schema>
+                  <pdfaSchema:property>
+                     <rdf:Seq>
+                        <rdf:li rdf:parseType="Resource">
+                           <pdfaProperty:category>internal</pdfaProperty:category>
+                           <pdfaProperty:description>A name object indicating whether the document has been modified to include trapping information</pdfaProperty:description>
+                           <pdfaProperty:name>Trapped</pdfaProperty:name>
+                           <pdfaProperty:valueType>Text</pdfaProperty:valueType>
+                        </rdf:li>
+                     </rdf:Seq>
+                  </pdfaSchema:property>
+               </rdf:li>
+               <rdf:li rdf:parseType="Resource">
+                  <pdfaSchema:namespaceURI>http://ns.adobe.com/xap/1.0/mm/</pdfaSchema:namespaceURI>
+                  <pdfaSchema:prefix>xmpMM</pdfaSchema:prefix>
+                  <pdfaSchema:schema>XMP Media Management Schema</pdfaSchema:schema>
+                  <pdfaSchema:property>
+                     <rdf:Seq>
+                        <rdf:li rdf:parseType="Resource">
+                           <pdfaProperty:category>internal</pdfaProperty:category>
+                           <pdfaProperty:description>UUID based identifier for specific incarnation of a document</pdfaProperty:description>
+                           <pdfaProperty:name>InstanceID</pdfaProperty:name>
+                           <pdfaProperty:valueType>URI</pdfaProperty:valueType>
+                        </rdf:li>
+                        <rdf:li rdf:parseType="Resource">
+                           <pdfaProperty:category>internal</pdfaProperty:category>
+                           <pdfaProperty:description>The common identifier for all versions and renditions of a document.</pdfaProperty:description>
+                           <pdfaProperty:name>OriginalDocumentID</pdfaProperty:name>
+                           <pdfaProperty:valueType>URI</pdfaProperty:valueType>
+                        </rdf:li>
+                     </rdf:Seq>
+                  </pdfaSchema:property>
+               </rdf:li>
+               <rdf:li rdf:parseType="Resource">
+                  <pdfaSchema:namespaceURI>http://www.aiim.org/pdfa/ns/id/</pdfaSchema:namespaceURI>
+                  <pdfaSchema:prefix>pdfaid</pdfaSchema:prefix>
+                  <pdfaSchema:schema>PDF/A ID Schema</pdfaSchema:schema>
+                  <pdfaSchema:property>
+                     <rdf:Seq>
+                        <rdf:li rdf:parseType="Resource">
+                           <pdfaProperty:category>internal</pdfaProperty:category>
+                           <pdfaProperty:description>Part of PDF/A standard</pdfaProperty:description>
+                           <pdfaProperty:name>part</pdfaProperty:name>
+                           <pdfaProperty:valueType>Integer</pdfaProperty:valueType>
+                        </rdf:li>
+                        <rdf:li rdf:parseType="Resource">
+                           <pdfaProperty:category>internal</pdfaProperty:category>
+                           <pdfaProperty:description>Amendment of PDF/A standard</pdfaProperty:description>
+                           <pdfaProperty:name>amd</pdfaProperty:name>
+                           <pdfaProperty:valueType>Text</pdfaProperty:valueType>
+                        </rdf:li>
+                        <rdf:li rdf:parseType="Resource">
+                           <pdfaProperty:category>internal</pdfaProperty:category>
+                           <pdfaProperty:description>Conformance level of PDF/A standard</pdfaProperty:description>
+                           <pdfaProperty:name>conformance</pdfaProperty:name>
+                           <pdfaProperty:valueType>Text</pdfaProperty:valueType>
+                        </rdf:li>
+                     </rdf:Seq>
+                  </pdfaSchema:property>
+               </rdf:li>
+            </rdf:Bag>
+         </pdfaExtension:schemas>
+      </rdf:Description>
+   </rdf:RDF>
+</x:xmpmeta>
+                                                                                                    
+                                                                                                    
+                                                                                                    
+                                                                                                    
+                                                                                                    
+                                                                                                    
+                                                                                                    
+                                                                                                    
+                                                                                                    
+                                                                                                    
+                                                                                                    
+                                                                                                    
+                                                                                                    
+                                                                                                    
+                                                                                                    
+                                                                                                    
+                                                                                                    
+                                                                                                    
+                                                                                                    
+                                                                                                    
+                           
+<?xpacket end="w"?>
\ No newline at end of file


[7/7] tika git commit: fix conflict

Posted by ta...@apache.org.
fix conflict


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/ef1f7b9e
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/ef1f7b9e
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/ef1f7b9e

Branch: refs/heads/TIKA-1508
Commit: ef1f7b9ec1b39d957450f3b8a11d045579068e6d
Parents: 2140858
Author: tballison <ta...@mitre.org>
Authored: Mon Jun 13 11:17:27 2016 -0400
Committer: tballison <ta...@mitre.org>
Committed: Mon Jun 13 11:17:27 2016 -0400

----------------------------------------------------------------------
 tika-core/src/main/java/org/apache/tika/config/TikaConfig.java | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/ef1f7b9e/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
----------------------------------------------------------------------
diff --git a/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java b/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
index 84fd636..e76b6e6 100644
--- a/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
+++ b/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
@@ -564,10 +564,10 @@ public class TikaConfig {
                     // See the thread "Configuring parsers and translators" for details 
                 }
 
-                //if the instance is configurable, then call configure()
                 Map<String, Param<?>> params = getParams(element);
                 //Assigning the params to bean fields/setters
                 AnnotationUtils.assignFieldParams(loaded, params);
+
                 // Have any decoration performed, eg explicit mimetypes
                 loaded = decorate(loaded, element);
                 // All done with setup


[3/7] tika git commit: TIKA-1996 -- upgrade to PDFBox 2.0.2

Posted by ta...@apache.org.
TIKA-1996 -- upgrade to PDFBox 2.0.2


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/06633cc1
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/06633cc1
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/06633cc1

Branch: refs/heads/TIKA-1508
Commit: 06633cc18df73c1cf4d19092a641a5355e19ac4c
Parents: 99aa587
Author: tballison <ta...@mitre.org>
Authored: Mon Jun 13 09:21:27 2016 -0400
Committer: tballison <ta...@mitre.org>
Committed: Mon Jun 13 09:21:27 2016 -0400

----------------------------------------------------------------------
 CHANGES.txt          | 2 ++
 tika-parsers/pom.xml | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/06633cc1/CHANGES.txt
----------------------------------------------------------------------
diff --git a/CHANGES.txt b/CHANGES.txt
index 0387bd6..6008b51 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,7 @@
 Release 1.14 - ???
 
+  * Upgrade to PDFBox 2.0.2 (TIKA-1996).
+
   * Add configurable maximum threshold for number of events extracted
     from the XMP Media Management Schema in JempboxExtractor (TIKA-1999).
 

http://git-wip-us.apache.org/repos/asf/tika/blob/06633cc1/tika-parsers/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parsers/pom.xml b/tika-parsers/pom.xml
index fec6449..a126eed 100644
--- a/tika-parsers/pom.xml
+++ b/tika-parsers/pom.xml
@@ -42,7 +42,7 @@
     <tukaani.version>1.5</tukaani.version>
     <mime4j.version>0.7.2</mime4j.version>
     <vorbis.version>0.8</vorbis.version>
-    <pdfbox.version>2.0.1</pdfbox.version>
+    <pdfbox.version>2.0.2</pdfbox.version>
     <jempbox.version>1.8.12</jempbox.version>
     <netcdf-java.version>4.5.5</netcdf-java.version>
     <cxf.version>3.0.3</cxf.version>


[2/7] tika git commit: TIKA-1999 small fix and update CHANGES.txt

Posted by ta...@apache.org.
TIKA-1999 small fix and update CHANGES.txt


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/99aa587d
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/99aa587d
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/99aa587d

Branch: refs/heads/TIKA-1508
Commit: 99aa587d171207c0c557ce65397f767d6a42cdfd
Parents: 3e14505
Author: tballison <ta...@mitre.org>
Authored: Wed Jun 8 13:46:29 2016 -0400
Committer: tballison <ta...@mitre.org>
Committed: Wed Jun 8 13:46:29 2016 -0400

----------------------------------------------------------------------
 CHANGES.txt                                                     | 3 +++
 .../java/org/apache/tika/parser/image/xmp/JempboxExtractor.java | 5 ++---
 2 files changed, 5 insertions(+), 3 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/99aa587d/CHANGES.txt
----------------------------------------------------------------------
diff --git a/CHANGES.txt b/CHANGES.txt
index 08cd8ff..0387bd6 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,8 @@
 Release 1.14 - ???
 
+  * Add configurable maximum threshold for number of events extracted
+    from the XMP Media Management Schema in JempboxExtractor (TIKA-1999).
+
   * Integrate TesseractOCR with full page image rendering for PDFs (TIKA-1994).
 
   * Add mime detection via Nick C and parser for DBF files (TIKA-1513).

http://git-wip-us.apache.org/repos/asf/tika/blob/99aa587d/tika-parsers/src/main/java/org/apache/tika/parser/image/xmp/JempboxExtractor.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/image/xmp/JempboxExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/image/xmp/JempboxExtractor.java
index d9ae71d..6d5038a 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/image/xmp/JempboxExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/image/xmp/JempboxExtractor.java
@@ -43,14 +43,13 @@ import org.xml.sax.SAXException;
 public class JempboxExtractor {
 
 
-    private static int MAX_EVENT_HISTORY_IN_XMPMM = 1024;
+    private static volatile int MAX_EVENT_HISTORY_IN_XMPMM = 1024;
 
     // The XMP spec says it must be unicode, but for most file formats it specifies "must be encoded in UTF-8"
     private static final String DEFAULT_XMP_CHARSET = UTF_8.name();
 
     private XMPPacketScanner scanner = new XMPPacketScanner();
     private Metadata metadata;
-    private static int maxXMPMMHistory;
 
     public JempboxExtractor(Metadata metadata) {
         this.metadata = metadata;
@@ -228,6 +227,6 @@ public class JempboxExtractor {
      * @return maximum number of events to extract from the XMPMM history.
      */
     public static int getMaxXMPMMHistory() {
-        return maxXMPMMHistory;
+        return MAX_EVENT_HISTORY_IN_XMPMM;
     }
 }


[5/7] tika git commit: Start factoring out "configurable"; change signature of ParseContext's setParam to (Class, Param); add check for illegal field being specified in TikaConfig.

Posted by ta...@apache.org.
Start factoring out "configurable"; change signature of ParseContext's setParam to (Class, Param); add check for illegal field being specified in TikaConfig.


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/338db905
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/338db905
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/338db905

Branch: refs/heads/TIKA-1508
Commit: 338db905d4e203d4df4582d5511242eaa922af6b
Parents: ecdc403
Author: tballison <ta...@mitre.org>
Authored: Mon Jun 13 11:14:27 2016 -0400
Committer: tballison <ta...@mitre.org>
Committed: Mon Jun 13 11:14:27 2016 -0400

----------------------------------------------------------------------
 .../java/org/apache/tika/config/TikaConfig.java | 12 ++---
 .../org/apache/tika/parser/AbstractParser.java  | 24 +---------
 .../org/apache/tika/parser/ParseContext.java    | 46 +++++++++++++-------
 .../org/apache/tika/utils/AnnotationUtils.java  | 24 +++++++---
 .../tika/parser/ConfigurableParserTest.java     |  3 ++
 .../tika/parser/DummyConfigurableParser.java    |  6 +--
 .../tika/parser/DummyParameterizedParser.java   |  3 +-
 .../tika/parser/ParameterizedParserTest.java    |  1 -
 .../org/apache/tika/parser/pdf/PDFParser.java   | 16 ++++---
 .../apache/tika/parser/pdf/PDFParserTest.java   | 19 +++++++-
 10 files changed, 86 insertions(+), 68 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/338db905/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
----------------------------------------------------------------------
diff --git a/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java b/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
index 853cdf0..692b007 100644
--- a/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
+++ b/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
@@ -567,15 +567,9 @@ public class TikaConfig {
                 // Have any decoration performed, eg explicit mimetypes
                 loaded = decorate(loaded, element);
                 //if the instance is configurable, then call configure()
-                if (loaded instanceof Configurable){
-                    Map<String, Param<?>> params = getParams(element);
-                    //Assigning the params to bean fields/setters
-                    AnnotationUtils.assignFieldParams(loaded, params);
-                    //invoking the configure() hook
-                    ParseContext context = new ParseContext();
-                    context.getParams().putAll(params);
-                    ((Configurable) loaded).configure(context); // initialize here
-                }
+                Map<String, Param<?>> params = getParams(element);
+                //Assigning the params to bean fields/setters
+                AnnotationUtils.assignFieldParams(loaded, params);
                 // All done with setup
                 return loaded;
             } catch (ClassNotFoundException e) {

http://git-wip-us.apache.org/repos/asf/tika/blob/338db905/tika-core/src/main/java/org/apache/tika/parser/AbstractParser.java
----------------------------------------------------------------------
diff --git a/tika-core/src/main/java/org/apache/tika/parser/AbstractParser.java b/tika-core/src/main/java/org/apache/tika/parser/AbstractParser.java
index 5c045db..51687e7 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/AbstractParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/AbstractParser.java
@@ -34,7 +34,7 @@ import org.xml.sax.SAXException;
  *
  * @since Apache Tika 0.10
  */
-public abstract class AbstractParser implements ConfigurableParser {
+public abstract class AbstractParser implements Parser {
 
     /**
      * Configuration supplied at runtime
@@ -62,27 +62,5 @@ public abstract class AbstractParser implements ConfigurableParser {
         parse(stream, handler, metadata, new ParseContext());
     }
 
-    /**
-     * called by the framework to supply runtime parameters which may be
-     * required for initialization
-     * @param context the parser context at runtime
-     * @since Apache Tika 1.14
-     */
-    @Override
-    public void configure(ParseContext context) throws TikaConfigException {
-        this.context = context;
-    }
-
-
-    /**
-     * Gets Parameters of this configurable instance
-     * @return a map of key value pairs
-     *
-     * @since Apache Tika 1.14
-     */
-    @Override
-    public Map<String, Param<?>> getParams() {
-        return this.context.getParams();
-    }
 }
 

http://git-wip-us.apache.org/repos/asf/tika/blob/338db905/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java
----------------------------------------------------------------------
diff --git a/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java b/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java
index dc03099..68d5038 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java
@@ -29,6 +29,7 @@ import java.io.IOException;
 import java.io.Serializable;
 import java.io.StringReader;
 import java.lang.reflect.Method;
+import java.util.Collections;
 import java.util.HashMap;
 import java.util.Map;
 
@@ -55,10 +56,12 @@ public class ParseContext implements Serializable {
     /** Map of objects in this context */
     private final Map<String, Object> context = new HashMap<String, Object>();
 
+    private final static Map<String, Param<?>> EMPTY_PARAMS = Collections.EMPTY_MAP;
+
     /**
      * Map of configurable arguments
      */
-    private final Map<String, Param<?>> params = new HashMap<>();
+    private final Map<String, Map<String, Param<?>>> params = new HashMap<>();
 
     private static final EntityResolver IGNORING_SAX_ENTITY_RESOLVER = new EntityResolver() {
         public InputSource resolveEntity(String publicId, String systemId) throws SAXException, IOException {
@@ -202,29 +205,42 @@ public class ParseContext implements Serializable {
     }
 
     /**
-     * Stores a key=value parameter
-     * @param key parameter name
+     * @param clazz class associated with given param name
      * @param value value
      */
-    public void setParam(String key, Param<?> value){
-        this.params.put(key, value);
+    public void setParam(Class clazz, Param<?> value){
+        Map<String, Param<?>> classParams = this.params.get(clazz.getName());
+        if (classParams == null) {
+            classParams = new HashMap<>();
+        }
+        classParams.put(value.getName(), value);
+        this.params.put(clazz.getName(), classParams);
     }
 
     /**
-     * Gets the value associated with given parameter
+     * Gets the value associated with given class and parameter
+     * @param clazz class
      * @param key parameter name
-     * @return param value
+     * @return param value or null if the clazz or key doesn't exist
      */
-    public Param<?> getParam(String key){
-        return this.params.get(key);
+    public Param<?> getParam(Class clazz, String key) {
+        Map<String, Param<?>> classParams = this.params.get(clazz.getName());
+        if (classParams != null) {
+            return classParams.get(key);
+        }
+        return null;
     }
 
     /**
-     * Gets all the params
-     * @return map of key values
+     * Gets all the params for the specified class
+     * @param clazz class for which to grab the params
+     * @return map of key values, or an empty map if nothing has been specified
      */
-    public Map<String, Param<?>> getParams() {
-        return params;
+    public Map<String, Param<?>> getParams(Class clazz) {
+        if (params.containsKey(clazz.getName())) {
+            return params.get(clazz.getName());
+        }
+        return EMPTY_PARAMS;
     }
 
     /**
@@ -232,8 +248,8 @@ public class ParseContext implements Serializable {
      * @param key parameter name
      * @return true if parameter is available, false otherwise
      */
-    public boolean hasParam(String key){
-       return params.containsKey(key);
+    public boolean hasParam(Class clazz, String key){
+       return params.containsKey(clazz) && params.get(clazz.getName()).containsKey(key);
     }
     /**
      * Returns the DOM builder factory specified in this parsing context.

http://git-wip-us.apache.org/repos/asf/tika/blob/338db905/tika-core/src/main/java/org/apache/tika/utils/AnnotationUtils.java
----------------------------------------------------------------------
diff --git a/tika-core/src/main/java/org/apache/tika/utils/AnnotationUtils.java b/tika-core/src/main/java/org/apache/tika/utils/AnnotationUtils.java
index 08e004b..1f56bc7 100644
--- a/tika-core/src/main/java/org/apache/tika/utils/AnnotationUtils.java
+++ b/tika-core/src/main/java/org/apache/tika/utils/AnnotationUtils.java
@@ -26,11 +26,7 @@ import java.lang.annotation.Annotation;
 import java.lang.reflect.AccessibleObject;
 import java.security.AccessController;
 import java.security.PrivilegedAction;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
+import java.util.*;
 
 /**
  * This class contains utilities for dealing with tika annotations
@@ -100,7 +96,11 @@ public class AnnotationUtils {
         }
 
         List<ParamField> fields = PARAM_INFO.get(beanClass);
+
+        Set<String> validFieldNames = new HashSet<>();
+
         for (ParamField field : fields) {
+            validFieldNames.add(field.getName());
             Param<?> param = params.get(field.getName());
             if (param != null){
                 if (field.getType().isAssignableFrom(param.getType())) {
@@ -110,7 +110,7 @@ public class AnnotationUtils {
                         throw new TikaConfigException(e.getMessage(), e);
                     }
                 } else {
-                    String msg = String.format("Value '%s' of type '%s' cant be" +
+                    String msg = String.format(Locale.ROOT, "Value '%s' of type '%s' cant be" +
                             " assigned to field '%s' of defined type '%s'",
                             param.getValue(), param.getValue().getClass(),
                             field.getName(), field.getType());
@@ -118,7 +118,7 @@ public class AnnotationUtils {
                 }
             } else if (field.isRequired()){
                 //param not supplied but field is declared as required?
-                String msg = String.format("Param %s is required for %s," +
+                String msg = String.format(Locale.ROOT, "Param %s is required for %s," +
                         " but it is not given in config.", field.getName(),
                         bean.getClass().getName());
                 throw new TikaConfigException(msg);
@@ -127,5 +127,15 @@ public class AnnotationUtils {
                 //LOG.debug("Param not supplied, field is not mandatory");
             }
         }
+        //now test that params doesn't contain a field
+        //not allowed by this object
+        for (String fieldName : params.keySet()) {
+            if (! validFieldNames.contains(fieldName)) {
+                String msg = String.format(Locale.ROOT,
+                        "No field '%s' exists for %s",
+                        fieldName, bean.getClass().getName());
+                throw new TikaConfigException(msg);
+            }
+        }
     }
 }
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/tika/blob/338db905/tika-core/src/test/java/org/apache/tika/parser/ConfigurableParserTest.java
----------------------------------------------------------------------
diff --git a/tika-core/src/test/java/org/apache/tika/parser/ConfigurableParserTest.java b/tika-core/src/test/java/org/apache/tika/parser/ConfigurableParserTest.java
index dcf188d..ffb632c 100644
--- a/tika-core/src/test/java/org/apache/tika/parser/ConfigurableParserTest.java
+++ b/tika-core/src/test/java/org/apache/tika/parser/ConfigurableParserTest.java
@@ -20,6 +20,7 @@ import org.apache.tika.Tika;
 import org.apache.tika.config.TikaConfig;
 import org.apache.tika.metadata.Metadata;
 import org.junit.Assert;
+import org.junit.Ignore;
 import org.junit.Test;
 
 import java.io.File;
@@ -36,6 +37,7 @@ public class ConfigurableParserTest {
     public static final String TEST_PARAM_VAL = "testparamval";
 
     @Test
+    @Ignore
     public void testConfigurableParser() throws Exception {
         URL configFileUrl = getClass().getClassLoader().getResource(TIKA_CFG_FILE);
         assert configFileUrl != null;
@@ -48,6 +50,7 @@ public class ConfigurableParserTest {
     }
 
     @Test
+    @Ignore
     public void testConfigurableParserTypes() throws Exception {
         URL configFileUrl = getClass().getClassLoader().getResource(TIKA_CFG_FILE);
         assert configFileUrl != null;

http://git-wip-us.apache.org/repos/asf/tika/blob/338db905/tika-core/src/test/java/org/apache/tika/parser/DummyConfigurableParser.java
----------------------------------------------------------------------
diff --git a/tika-core/src/test/java/org/apache/tika/parser/DummyConfigurableParser.java b/tika-core/src/test/java/org/apache/tika/parser/DummyConfigurableParser.java
index 3914b01..15fe060 100644
--- a/tika-core/src/test/java/org/apache/tika/parser/DummyConfigurableParser.java
+++ b/tika-core/src/test/java/org/apache/tika/parser/DummyConfigurableParser.java
@@ -39,8 +39,8 @@ import java.util.Set;
  * 3. parameters were available at parse
  *
  */
-public class DummyConfigurableParser extends AbstractParser {
-
+public class DummyConfigurableParser {
+/*
     private static Set<MediaType> MIMES = new HashSet<>();
     static {
         MIMES.add(MediaType.TEXT_PLAIN);
@@ -63,5 +63,5 @@ public class DummyConfigurableParser extends AbstractParser {
             metadata.add(entry.getKey()+"-type", param.getValue().getClass().getName());
         }
     }
-
+*/
 }

http://git-wip-us.apache.org/repos/asf/tika/blob/338db905/tika-core/src/test/java/org/apache/tika/parser/DummyParameterizedParser.java
----------------------------------------------------------------------
diff --git a/tika-core/src/test/java/org/apache/tika/parser/DummyParameterizedParser.java b/tika-core/src/test/java/org/apache/tika/parser/DummyParameterizedParser.java
index 848b774..801d65e 100644
--- a/tika-core/src/test/java/org/apache/tika/parser/DummyParameterizedParser.java
+++ b/tika-core/src/test/java/org/apache/tika/parser/DummyParameterizedParser.java
@@ -40,8 +40,7 @@ import static org.osgi.util.measurement.Unit.s;
  * A test Parsers to test {@link Field}
  * @since Apache Tika 1.14
  */
-public class DummyParameterizedParser extends AbstractParser
-        implements ConfigurableParser {
+public class DummyParameterizedParser extends AbstractParser {
 
     private static Set<MediaType> MIMES = new HashSet<>();
     static {

http://git-wip-us.apache.org/repos/asf/tika/blob/338db905/tika-core/src/test/java/org/apache/tika/parser/ParameterizedParserTest.java
----------------------------------------------------------------------
diff --git a/tika-core/src/test/java/org/apache/tika/parser/ParameterizedParserTest.java b/tika-core/src/test/java/org/apache/tika/parser/ParameterizedParserTest.java
index e0c3b53..a048f29 100644
--- a/tika-core/src/test/java/org/apache/tika/parser/ParameterizedParserTest.java
+++ b/tika-core/src/test/java/org/apache/tika/parser/ParameterizedParserTest.java
@@ -77,7 +77,6 @@ public class ParameterizedParserTest {
     }
 
     @Test
-    @Ignore("can we get this to work, somehow?")
     public void testBadParam() throws Exception {
         try {
             Metadata m = getMetadata("TIKA-1986-bad-parameters.xml");

http://git-wip-us.apache.org/repos/asf/tika/blob/338db905/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
index bacc901..dd03177 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
@@ -21,12 +21,7 @@ import javax.xml.stream.XMLStreamException;
 import java.io.ByteArrayInputStream;
 import java.io.IOException;
 import java.io.InputStream;
-import java.util.Arrays;
-import java.util.Calendar;
-import java.util.Collections;
-import java.util.List;
-import java.util.Locale;
-import java.util.Set;
+import java.util.*;
 
 import org.apache.commons.io.input.CloseShieldInputStream;
 import org.apache.jempbox.xmp.XMPMetadata;
@@ -44,6 +39,7 @@ import org.apache.pdfbox.pdmodel.common.PDMetadata;
 import org.apache.pdfbox.pdmodel.encryption.AccessPermission;
 import org.apache.pdfbox.pdmodel.encryption.InvalidPasswordException;
 import org.apache.tika.config.Field;
+import org.apache.tika.config.Param;
 import org.apache.tika.exception.EncryptedDocumentException;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.extractor.EmbeddedDocumentExtractor;
@@ -86,7 +82,7 @@ import static org.bouncycastle.asn1.x500.style.RFC4519Style.name;
  * turn this feature on, see
  * {@link PDFParserConfig#setExtractInlineImages(boolean)}.
  */
-public class PDFParser extends AbstractParser implements ConfigurableParser {
+public class PDFParser extends AbstractParser {
 
 
     /**
@@ -123,6 +119,12 @@ public class PDFParser extends AbstractParser implements ConfigurableParser {
         PDFParserConfig localConfig = context.get(PDFParserConfig.class, defaultConfig);
         //TODO: get rid of this after dev of TIKA-1508!!!
         localConfig.setSortByPosition(sortByPosition);
+
+        //TODO: this is just a mockup...move elsewhere
+        Map<String, Param<?>> params = context.getParams(PDFParser.class);
+        if (params != null && params.containsKey("sortByPosition")) {
+            localConfig.setSortByPosition((Boolean)params.get("sortByPosition").getValue());
+        }
         String password = "";
         try {
             // PDFBox can process entirely in memory, or can use a temp file

http://git-wip-us.apache.org/repos/asf/tika/blob/338db905/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index ac54b11..2ef29f3 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -35,6 +35,7 @@ import org.apache.commons.io.IOUtils;
 import org.apache.log4j.Level;
 import org.apache.log4j.Logger;
 import org.apache.tika.TikaTest;
+import org.apache.tika.config.Param;
 import org.apache.tika.config.TikaConfig;
 import org.apache.tika.exception.AccessPermissionException;
 import org.apache.tika.exception.EncryptedDocumentException;
@@ -469,7 +470,7 @@ public class PDFParserTest extends TikaTest {
         content = content.replaceAll("\\s+", " ");
         assertContains("Left column line 1 Left column line 2 Right column line 1 Right column line 2", content);
 
-        parser.getPDFParserConfig().setSortByPosition(true);
+        parser.setSortByPosition(true);
         stream = getResourceAsStream("/test-documents/testPDFTwoTextBoxes.pdf");
         content = getText(stream, parser);
         content = content.replaceAll("\\s+", " ");
@@ -1229,6 +1230,22 @@ public class PDFParserTest extends TikaTest {
 
     }
 
+    @Test
+    public void testParameterizationViaContext() throws Exception {
+        ParseContext context = new ParseContext();
+
+        Param<Boolean> paramVal = new Param<>("sortByPosition", new Boolean(true));
+        context.setParam(PDFParser.class, paramVal);
+
+        Parser p = new AutoDetectParser();
+        String text = getText(getResourceAsStream("/test-documents/testPDFTwoTextBoxes.pdf"), p, context);
+        text = text.replaceAll("\\s+", " ");
+
+        // Column text is now interleaved:
+        assertContains("Left column line 1 Right column line 1 Left colu mn line 2 Right column line 2", text);
+
+    }
+
     private void assertException(String path, Parser parser, ParseContext context, Class expected) {
         boolean noEx = false;
         InputStream is = getResourceAsStream(path);


[6/7] tika git commit: Merge remote-tracking branch 'origin/TIKA-1508' into TIKA-1508

Posted by ta...@apache.org.
Merge remote-tracking branch 'origin/TIKA-1508' into TIKA-1508

# Conflicts:
#	tika-core/src/main/java/org/apache/tika/config/TikaConfig.java


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/21408588
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/21408588
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/21408588

Branch: refs/heads/TIKA-1508
Commit: 2140858840af8f1c015f3570dc4ac8d2bb4405cf
Parents: 338db90 e48d191
Author: tballison <ta...@mitre.org>
Authored: Mon Jun 13 11:16:34 2016 -0400
Committer: tballison <ta...@mitre.org>
Committed: Mon Jun 13 11:16:34 2016 -0400

----------------------------------------------------------------------
 .../java/org/apache/tika/config/TikaConfig.java |  6 +--
 .../tika/parser/ParameterizedParserTest.java    |  9 +++++
 .../TIKA-1986-parameterized-decorated.xml       | 39 ++++++++++++++++++++
 3 files changed, 51 insertions(+), 3 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/21408588/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
----------------------------------------------------------------------
diff --cc tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
index 692b007,1163d84..84fd636
--- a/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
+++ b/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
@@@ -563,13 -563,19 +563,13 @@@ public class TikaConfig 
                      // TODO Support arguments, needed for Translators etc
                      // See the thread "Configuring parsers and translators" for details 
                  }
-                 
-                 // Have any decoration performed, eg explicit mimetypes
-                 loaded = decorate(loaded, element);
+ 
                  //if the instance is configurable, then call configure()
 -                if (loaded instanceof Configurable){
 -                    Map<String, Param<?>> params = getParams(element);
 -                    //Assigning the params to bean fields/setters
 -                    AnnotationUtils.assignFieldParams(loaded, params);
 -                    //invoking the configure() hook
 -                    ParseContext context = new ParseContext();
 -                    context.getParams().putAll(params);
 -                    ((Configurable) loaded).configure(context); // initialize here
 -                }
 +                Map<String, Param<?>> params = getParams(element);
 +                //Assigning the params to bean fields/setters
 +                AnnotationUtils.assignFieldParams(loaded, params);
+                 // Have any decoration performed, eg explicit mimetypes
+                 loaded = decorate(loaded, element);
                  // All done with setup
                  return loaded;
              } catch (ClassNotFoundException e) {

http://git-wip-us.apache.org/repos/asf/tika/blob/21408588/tika-core/src/test/java/org/apache/tika/parser/ParameterizedParserTest.java
----------------------------------------------------------------------


[4/7] tika git commit: Merge remote-tracking branch 'origin/master' into TIKA-1508

Posted by ta...@apache.org.
Merge remote-tracking branch 'origin/master' into TIKA-1508


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/ecdc4035
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/ecdc4035
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/ecdc4035

Branch: refs/heads/TIKA-1508
Commit: ecdc403578d2a2b8fb70f66b7df1ece96b5efa9c
Parents: 853750d 06633cc
Author: tballison <ta...@mitre.org>
Authored: Mon Jun 13 09:25:24 2016 -0400
Committer: tballison <ta...@mitre.org>
Committed: Mon Jun 13 09:25:24 2016 -0400

----------------------------------------------------------------------
 CHANGES.txt                                     |   5 +
 tika-parsers/pom.xml                            |   2 +-
 .../tika/parser/image/xmp/JempboxExtractor.java |  30 ++++
 .../parser/image/xmp/JempboxExtractorTest.java  |  29 ++-
 .../test/resources/test-documents/testXMP.xmp   | 178 +++++++++++++++++++
 5 files changed, 242 insertions(+), 2 deletions(-)
----------------------------------------------------------------------