You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2020/02/24 18:51:18 UTC

[tika] branch branch_1x updated (d5af2cf -> 3dc7db7)

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a change to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git.


    from d5af2cf  TIKA-3045 -- Added XMLProfiler as an optional parser to profile XFA and XMP in PDFs
     new 750d779  TIKA-3050 -- add xmp extraction from PSD files
     new d9d42aa  bump spring to avoid vulnerable code: https://ossindex.sonatype.org/vuln/fe1be8c0-575d-49bc-906d-582e1dd589dd
     new e8627e5  Upgrade to POI 4.1.2 (TIKA-3047).
     new 5c13f69  Upgrade to PDFBox 2.0.19 (TIKA-3033).
     new 2f56935  TIKA-2952 -- Upgrade metadata-extractor to 2.13.0
     new 3dc7db7  TIKA-3056 -- general upgrades for the 1.24 release.

The 6 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 CHANGES.txt                                        |   8 ++-
 tika-bundle/pom.xml                                |   3 +
 .../apache/tika/metadata/TikaCoreProperties.java   |   4 +-
 tika-dl/pom.xml                                    |  10 ++--
 tika-eval/pom.xml                                  |   6 ++
 tika-example/pom.xml                               |   6 +-
 tika-nlp/pom.xml                                   |   2 +-
 tika-parent/pom.xml                                |  19 +++----
 tika-parsers/pom.xml                               |  58 ++++++++++++++-----
 .../org/apache/tika/parser/image/PSDParser.java    |  63 +++++++++++++++++----
 .../ooxml/xwpf/XWPFEventBasedWordExtractor.java    |   2 +-
 .../apache/tika/parser/image/PSDParserTest.java    |  15 ++++-
 .../test/resources/test-documents/testPSD_xmp.psd  | Bin 0 -> 114796 bytes
 tika-translate/pom.xml                             |   4 +-
 tika-xmp/pom.xml                                   |   6 +-
 15 files changed, 156 insertions(+), 50 deletions(-)
 create mode 100644 tika-parsers/src/test/resources/test-documents/testPSD_xmp.psd


[tika] 04/06: Upgrade to PDFBox 2.0.19 (TIKA-3033).

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 5c13f69c0b31ccc08b48f0360fe9b2fc0df9086e
Author: tallison <ta...@apache.org>
AuthorDate: Mon Feb 24 12:00:56 2020 -0500

    Upgrade to PDFBox 2.0.19 (TIKA-3033).
---
 CHANGES.txt          | 2 +-
 tika-parsers/pom.xml | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index 2e7216e..b6a1bf1 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -8,7 +8,7 @@ Release 1.24 - ???
 
    * Extract inline images that rely on the DCT filter from PDFs (TIKA-3041).
 
-   * Upgrade to PDFBox 2.0.18 (TIKA-3021).
+   * Upgrade to PDFBox 2.0.19 (TIKA-3033).
 
    * Fix bug in ASM parser configuration (TIKA-2992).
    
diff --git a/tika-parsers/pom.xml b/tika-parsers/pom.xml
index 0a3d2a6..7874f4b 100644
--- a/tika-parsers/pom.xml
+++ b/tika-parsers/pom.xml
@@ -43,7 +43,7 @@
     <brotli.version>0.1.2</brotli.version>
     <mime4j.version>0.8.3</mime4j.version>
     <vorbis.version>0.8</vorbis.version>
-    <pdfbox.version>2.0.18</pdfbox.version>
+    <pdfbox.version>2.0.19</pdfbox.version>
     <jempbox.version>1.8.16</jempbox.version>
     <netcdf-java.version>4.5.5</netcdf-java.version>
     <sis.version>1.0</sis.version>
@@ -880,7 +880,7 @@
     <dependency>
       <groupId>org.apache.pdfbox</groupId>
       <artifactId>jbig2-imageio</artifactId>
-      <version>3.0.2</version>
+      <version>3.0.3</version>
     </dependency>
 
     <!-- jai-imageio-core is allowed since LEGAL-304 -->


[tika] 03/06: Upgrade to POI 4.1.2 (TIKA-3047).

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git

commit e8627e5081a9bc94657d2b20c5cdd51f5f14c105
Author: tallison <ta...@apache.org>
AuthorDate: Mon Feb 24 11:56:45 2020 -0500

    Upgrade to POI 4.1.2 (TIKA-3047).
---
 CHANGES.txt                                                             | 1 +
 tika-bundle/pom.xml                                                     | 1 +
 tika-parent/pom.xml                                                     | 2 +-
 .../tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java   | 2 +-
 4 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index c17ba5d..2e7216e 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,4 +1,5 @@
 Release 1.24 - ???
+   * Upgrade to POI 4.1.2 (TIKA-3047).
 
    * Extract XMP from PSD files (TIKA-3050).
 
diff --git a/tika-bundle/pom.xml b/tika-bundle/pom.xml
index 9550fd7..1c0932a 100644
--- a/tika-bundle/pom.xml
+++ b/tika-bundle/pom.xml
@@ -255,6 +255,7 @@
               com.sun.msv.datatype;resolution:=optional,
               com.sun.msv.datatype.xsd;resolution:=optional,
               com.sun.tools.javadoc;resolution:=optional,
+              com.zaxxer.sparsebits;resolution:=optional,
               edu.mit.ll.mitie;resolution:=optional,
               edu.stanford.nlp.*;resolution:=optional,
               edu.wisc.ssec.mcidas;resolution:=optional,
diff --git a/tika-parent/pom.xml b/tika-parent/pom.xml
index 0f42b2d..147ef44 100644
--- a/tika-parent/pom.xml
+++ b/tika-parent/pom.xml
@@ -334,7 +334,7 @@
     <maven.shade.version>3.2.1</maven.shade.version>
     <rat.version>0.13</rat.version>
     <!-- NOTE: sync tukaani version with commons-compress in tika-parsers -->
-    <poi.version>4.1.1</poi.version>
+    <poi.version>4.1.2</poi.version>
     <commons.compress.version>1.19</commons.compress.version>
     <commons.io.version>2.6</commons.io.version>
     <commons.lang3.version>3.9</commons.lang3.version>
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java
index ec63704..03d64f5 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java
@@ -234,7 +234,7 @@ public class XWPFEventBasedWordExtractor extends POIXMLTextExtractor {
                 }
                 return new XWPFNumbering(numberingPart);
             }
-        } catch (IOException | OpenXML4JException e) {
+        } catch (OpenXML4JException e) {
             LOG.warn("Couldn't load numbering", e);
         }
         return null;


[tika] 02/06: bump spring to avoid vulnerable code: https://ossindex.sonatype.org/vuln/fe1be8c0-575d-49bc-906d-582e1dd589dd

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git

commit d9d42aa116699967106ecf0e83ae5d2e355570c6
Author: tallison <ta...@apache.org>
AuthorDate: Fri Feb 21 13:33:47 2020 -0500

    bump spring to avoid vulnerable code: https://ossindex.sonatype.org/vuln/fe1be8c0-575d-49bc-906d-582e1dd589dd
---
 tika-example/pom.xml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tika-example/pom.xml b/tika-example/pom.xml
index dd813a9..422000d 100644
--- a/tika-example/pom.xml
+++ b/tika-example/pom.xml
@@ -152,7 +152,7 @@
     <dependency>
       <groupId>org.springframework</groupId>
       <artifactId>spring-context</artifactId>
-      <version>5.2.1.RELEASE</version>
+      <version>5.2.3.RELEASE</version>
       <exclusions>
         <exclusion>
           <groupId>commons-logging</groupId>


[tika] 06/06: TIKA-3056 -- general upgrades for the 1.24 release.

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 3dc7db726f2bdca4753f7486ca450266940554e2
Author: tallison <ta...@apache.org>
AuthorDate: Mon Feb 24 13:32:28 2020 -0500

    TIKA-3056 -- general upgrades for the 1.24 release.
---
 tika-dl/pom.xml        | 10 +++++-----
 tika-eval/pom.xml      |  6 ++++++
 tika-example/pom.xml   |  4 ++++
 tika-nlp/pom.xml       |  2 +-
 tika-parent/pom.xml    | 17 ++++++++---------
 tika-parsers/pom.xml   | 46 +++++++++++++++++++++++++++++++++++++---------
 tika-translate/pom.xml |  4 ++--
 7 files changed, 63 insertions(+), 26 deletions(-)

diff --git a/tika-dl/pom.xml b/tika-dl/pom.xml
index dcf1d9b..dff0d99 100644
--- a/tika-dl/pom.xml
+++ b/tika-dl/pom.xml
@@ -36,8 +36,8 @@
 
   <properties>
     <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
-    <dl4j.version>1.0.0-beta5</dl4j.version>
-    <twelvemonkeys.version>3.4.2</twelvemonkeys.version>
+    <dl4j.version>1.0.0-beta6</dl4j.version>
+    <twelvemonkeys.version>3.5</twelvemonkeys.version>
   </properties>
 
   <dependencies>
@@ -282,7 +282,7 @@
     <dependency>
       <groupId>org.objenesis</groupId>
       <artifactId>objenesis</artifactId>
-      <version>3.0.1</version>
+      <version>3.1</version>
     </dependency>
     <dependency>
       <groupId>org.nd4j</groupId>
@@ -341,7 +341,7 @@
     <dependency>
       <groupId>org.projectlombok</groupId>
       <artifactId>lombok</artifactId>
-      <version>1.18.8</version>
+      <version>1.18.12</version>
     </dependency>
     <dependency>
       <groupId>com.google.guava</groupId>
@@ -351,7 +351,7 @@
     <dependency>
       <groupId>joda-time</groupId>
       <artifactId>joda-time</artifactId>
-      <version>2.10.3</version>
+      <version>2.10.5</version>
     </dependency>
     <dependency>
       <groupId>commons-io</groupId>
diff --git a/tika-eval/pom.xml b/tika-eval/pom.xml
index 3b3f4f7..d3b7206 100644
--- a/tika-eval/pom.xml
+++ b/tika-eval/pom.xml
@@ -114,6 +114,12 @@
             <groupId>org.apache.poi</groupId>
             <artifactId>poi-ooxml</artifactId>
             <version>${poi.version}</version>
+            <exclusions>
+                <exclusion>
+                    <groupId>org.apache.commons</groupId>
+                    <artifactId>commons-compress</artifactId>
+                </exclusion>
+            </exclusions>
         </dependency>
         <dependency>
             <groupId>org.apache.poi</groupId>
diff --git a/tika-example/pom.xml b/tika-example/pom.xml
index 422000d..5215ffc 100644
--- a/tika-example/pom.xml
+++ b/tika-example/pom.xml
@@ -118,6 +118,10 @@
           <groupId>org.apache.httpcomponents</groupId>
           <artifactId>httpclient</artifactId>
         </exclusion>
+        <exclusion>
+          <groupId>org.apache.httpcomponents</groupId>
+          <artifactId>httpcore</artifactId>
+        </exclusion>
       </exclusions>
     </dependency>
     <dependency>
diff --git a/tika-nlp/pom.xml b/tika-nlp/pom.xml
index eeedd78..d8acb4f 100644
--- a/tika-nlp/pom.xml
+++ b/tika-nlp/pom.xml
@@ -202,7 +202,7 @@
     <dependency>
       <groupId>com.fasterxml.jackson.core</groupId>
       <artifactId>jackson-databind</artifactId>
-      <version>${jackson.databind.version}</version>
+      <version>${jackson.version}</version>
       <exclusions>
         <exclusion>
           <groupId>com.fasterxml.jackson.core</groupId>
diff --git a/tika-parent/pom.xml b/tika-parent/pom.xml
index 147ef44..95c9d47 100644
--- a/tika-parent/pom.xml
+++ b/tika-parent/pom.xml
@@ -269,7 +269,7 @@
       <dependency>
         <groupId>junit</groupId>
         <artifactId>junit</artifactId>
-        <version>4.13-rc-1</version>
+        <version>4.13</version>
         <scope>test</scope>
       </dependency>
       <dependency>
@@ -335,24 +335,23 @@
     <rat.version>0.13</rat.version>
     <!-- NOTE: sync tukaani version with commons-compress in tika-parsers -->
     <poi.version>4.1.2</poi.version>
-    <commons.compress.version>1.19</commons.compress.version>
+    <commons.compress.version>1.20</commons.compress.version>
     <commons.io.version>2.6</commons.io.version>
     <commons.lang3.version>3.9</commons.lang3.version>
     <gson.version>2.8.6</gson.version>
-    <guava.version>28.1-jre</guava.version>
+    <guava.version>28.2-jre</guava.version>
     <osgi.core.version>6.0.0</osgi.core.version>
 
-    <cxf.version>3.3.4</cxf.version>
+    <cxf.version>3.3.5</cxf.version>
     <slf4j.version>1.7.28</slf4j.version>
-    <jackson.version>2.10.1</jackson.version>
-    <jackson.databind.version>2.10.1</jackson.databind.version>
+    <jackson.version>2.10.2</jackson.version>
     <!-- when this is next upgraded, see if we can get rid of
          javax.activation dependency in tika-server -->
     <jaxb.version>2.3.2</jaxb.version>
     <cli.version>1.4</cli.version>
-    <lucene.version>8.3.0</lucene.version>
-    <mockito.version>3.1.0</mockito.version>
-    <opennlp.version>1.9.1</opennlp.version>
+    <lucene.version>8.4.1</lucene.version>
+    <mockito.version>3.3.0</mockito.version>
+    <opennlp.version>1.9.2</opennlp.version>
   </properties>
 
   <build>
diff --git a/tika-parsers/pom.xml b/tika-parsers/pom.xml
index 6ea2463..3a2498f 100644
--- a/tika-parsers/pom.xml
+++ b/tika-parsers/pom.xml
@@ -36,6 +36,10 @@
 
   <properties>
     <!-- NOTE: sync codec version with POI -->
+    <!-- 1.14 changed base64 decoding to "strict", which caused a failure
+    in XML2003ParserTest#testBasicWord; see:
+    https://issues.apache.org/jira/browse/CODEC-263?focusedCommentId=17043716&page=com.atlassian.jira.plugin.system.issuetabpanels%3Acomment-tabpanel#comment-17043716
+    -->
     <codec.version>1.13</codec.version>
     <!-- NOTE: sync tukaani version with commons-compress in tika-parent-->
     <tukaani.version>1.8</tukaani.version>
@@ -51,7 +55,7 @@
     <!-- used by POI, PDFBox and Jackcess ...try to sync -->
     <bouncycastle.version>1.64</bouncycastle.version>
     <commonsexec.version>1.3</commonsexec.version>
-    <httpcomponents.version>4.5.10</httpcomponents.version>
+    <httpcomponents.version>4.5.11</httpcomponents.version>
   </properties>
 
   <!-- Note: as of the 1.23 release, we're waiting for external changes or fixes
@@ -178,7 +182,7 @@
     <dependency>
       <groupId>com.github.luben</groupId>
       <artifactId>zstd-jni</artifactId>
-      <version>1.4.4-3</version>
+      <version>1.4.4-7</version>
       <scope>provided</scope>
     </dependency>
 
@@ -260,6 +264,10 @@
           <groupId>xml-apis</groupId>
           <artifactId>xml-apis</artifactId>
         </exclusion>
+        <exclusion>
+          <groupId>org.apache.commons</groupId>
+          <artifactId>commons-compress</artifactId>
+        </exclusion>
       </exclusions>
     </dependency>
     <dependency>
@@ -306,7 +314,7 @@
     <dependency>
       <groupId>org.ow2.asm</groupId>
       <artifactId>asm</artifactId>
-      <version>7.2</version>
+      <version>7.3.1</version>
     </dependency>
     <dependency>
       <groupId>com.googlecode.mp4parser</groupId>
@@ -408,7 +416,7 @@
     <dependency>
       <groupId>org.xerial</groupId>
       <artifactId>sqlite-jdbc</artifactId>
-      <version>3.28.0</version>
+      <version>3.30.1</version>
       <scope>provided</scope>
     </dependency>
 
@@ -543,6 +551,12 @@
         </exclusion>
       </exclusions>
     </dependency>
+    <!-- TIKA-3052 -->
+    <dependency>
+      <groupId>com.beust</groupId>
+      <artifactId>jcommander</artifactId>
+      <version>1.78</version>
+    </dependency>
     <!--TIKA 2672 include a later version of jna as a direct dependency to resolve dependency convergence with tika-dl's
     deeplearning4j-nn:1.0.0-SNAPSHOT -->
     <dependency>
@@ -556,12 +570,12 @@
     <dependency>
       <groupId>org.jsoup</groupId>
       <artifactId>jsoup</artifactId>
-      <version>1.12.1</version>
+      <version>1.12.2</version>
     </dependency>
     <dependency>
       <groupId>com.google.protobuf</groupId>
       <artifactId>protobuf-java</artifactId>
-      <version>3.11.0</version>
+      <version>3.11.4</version>
     </dependency>
     <dependency>
       <groupId>edu.ucar</groupId>
@@ -600,6 +614,10 @@
           <groupId>org.quartz-scheduler</groupId>
           <artifactId>quartz</artifactId>
         </exclusion>
+        <exclusion>
+          <groupId>com.beust</groupId>
+          <artifactId>jcommander</artifactId>
+        </exclusion>
       </exclusions>
     </dependency>
     <!--needs to be bumped for xml vulnerability -->
@@ -607,13 +625,23 @@
       <groupId>org.quartz-scheduler</groupId>
       <artifactId>quartz</artifactId>
       <version>2.3.2</version>
+      <exclusions>
+        <exclusion>
+          <groupId>com.mchange</groupId>
+          <artifactId>c3p0</artifactId>
+        </exclusion>
+        <exclusion>
+          <groupId>com.mchange</groupId>
+          <artifactId>mchange-commons-java</artifactId>
+        </exclusion>
+      </exclusions>
     </dependency>
     <!-- needs to be excluded and version bumped
          to avoid billion laughs vuln -->
     <dependency>
       <groupId>com.mchange</groupId>
       <artifactId>c3p0</artifactId>
-      <version>0.9.5.4</version>
+      <version>0.9.5.5</version>
     </dependency>
     <dependency>
       <groupId>edu.ucar</groupId>
@@ -664,7 +692,7 @@
     <dependency>
       <groupId>org.apache.commons</groupId>
       <artifactId>commons-csv</artifactId>
-      <version>1.7</version>
+      <version>1.8</version>
     </dependency>
 
     <dependency>
@@ -866,7 +894,7 @@
     <dependency>
       <groupId>com.fasterxml.jackson.core</groupId>
       <artifactId>jackson-databind</artifactId>
-      <version>${jackson.databind.version}</version>
+      <version>${jackson.version}</version>
       <exclusions>
         <exclusion>
           <groupId>com.fasterxml.jackson.core</groupId>
diff --git a/tika-translate/pom.xml b/tika-translate/pom.xml
index b65c75f..781bc2f 100644
--- a/tika-translate/pom.xml
+++ b/tika-translate/pom.xml
@@ -79,7 +79,7 @@
     <dependency>
       <groupId>com.fasterxml.jackson.jaxrs</groupId>
       <artifactId>jackson-jaxrs-json-provider</artifactId>
-      <version>${jackson.version}</version>
+      <version>2.10.2.1</version>
       <exclusions>
         <!-- exclude because, as of 2.9.5, jaxrs-json-provider
         is bringing in 2.9.0 of core's annotations
@@ -102,7 +102,7 @@
     <dependency>
       <groupId>com.fasterxml.jackson.core</groupId>
       <artifactId>jackson-databind</artifactId>
-      <version>${jackson.databind.version}</version>
+      <version>${jackson.version}</version>
       <exclusions>
         <exclusion>
           <groupId>com.fasterxml.jackson.core</groupId>


[tika] 05/06: TIKA-2952 -- Upgrade metadata-extractor to 2.13.0

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 2f5693541a7df99f9693837371bc43f51e46f702
Author: tallison <ta...@apache.org>
AuthorDate: Mon Feb 24 12:20:08 2020 -0500

    TIKA-2952 -- Upgrade metadata-extractor to 2.13.0
---
 CHANGES.txt          | 3 +++
 tika-bundle/pom.xml  | 2 ++
 tika-parsers/pom.xml | 8 ++++++--
 tika-xmp/pom.xml     | 6 +++---
 4 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index b6a1bf1..7eb54d2 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,4 +1,7 @@
 Release 1.24 - ???
+
+   * Upgrade metadata-extractor to 2.13.0 (TIKA-2952).
+
    * Upgrade to POI 4.1.2 (TIKA-3047).
 
    * Extract XMP from PSD files (TIKA-3050).
diff --git a/tika-bundle/pom.xml b/tika-bundle/pom.xml
index 1c0932a..07f43fa 100644
--- a/tika-bundle/pom.xml
+++ b/tika-bundle/pom.xml
@@ -236,6 +236,8 @@
               org.apache.tika.fork,
               android.util;resolution:=optional,
               com.adobe.xmp;resolution:=optional,
+              com.adobe.xmp.impl;resolution:=optional,
+              com.adobe.xmp.options;resolution:=optional,
               com.adobe.xmp.properties;resolution:=optional,
               com.github.luben.zstd;resolution:=optional,
               com.github.openjson;resolution:=optional,
diff --git a/tika-parsers/pom.xml b/tika-parsers/pom.xml
index 7874f4b..6ea2463 100644
--- a/tika-parsers/pom.xml
+++ b/tika-parsers/pom.xml
@@ -313,10 +313,14 @@
       <artifactId>isoparser</artifactId>
       <version>1.1.22</version>
     </dependency>
+    <!-- this is a fork of the com.drewnoakes
+      metadata-extractor that shades/relocates com.adobe.internal
+      to com.adobe for backwards compatibility
+    -->
     <dependency>
-      <groupId>com.drewnoakes</groupId>
+      <groupId>org.tallison</groupId>
       <artifactId>metadata-extractor</artifactId>
-      <version>2.11.0</version>
+      <version>2.13.0</version>
     </dependency>
     <dependency>
       <groupId>de.l3s.boilerpipe</groupId>
diff --git a/tika-xmp/pom.xml b/tika-xmp/pom.xml
index 020d182..3920801 100644
--- a/tika-xmp/pom.xml
+++ b/tika-xmp/pom.xml
@@ -90,9 +90,9 @@
       <version>${project.version}</version>
     </dependency>
     <dependency>
-      <groupId>com.adobe.xmp</groupId>
-      <artifactId>xmpcore</artifactId>
-      <version>5.1.3</version>
+      <groupId>org.tallison.xmp</groupId>
+      <artifactId>xmpcore-shaded</artifactId>
+      <version>6.1.10</version>
     </dependency>
     <dependency>
       <groupId>junit</groupId>


[tika] 01/06: TIKA-3050 -- add xmp extraction from PSD files

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 750d7790c7d8ba766f2794fac79521d5870eadfb
Author: tallison <ta...@apache.org>
AuthorDate: Fri Feb 21 13:27:13 2020 -0500

    TIKA-3050 -- add xmp extraction from PSD files
---
 CHANGES.txt                                        |   2 +
 .../apache/tika/metadata/TikaCoreProperties.java   |   4 +-
 .../org/apache/tika/parser/image/PSDParser.java    |  63 +++++++++++++++++----
 .../apache/tika/parser/image/PSDParserTest.java    |  15 ++++-
 .../test/resources/test-documents/testPSD_xmp.psd  | Bin 0 -> 114796 bytes
 5 files changed, 71 insertions(+), 13 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index 6703627..c17ba5d 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,7 @@
 Release 1.24 - ???
 
+   * Extract XMP from PSD files (TIKA-3050).
+
    * Added XMLProfiler as an optional parser to profile XFA and XMP
      in PDFs (TIKA-3045).
 
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
index 5f63cae..5d02265 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
@@ -58,8 +58,8 @@ public interface TikaCoreProperties {
         ATTACHMENT,//standard attachment as in email
         MACRO, //any code that is intended to be run by the application
         METADATA, //e.g. xmp, xfa
-        FONT;//embedded font files
-        //what else?
+        FONT,//embedded font files
+        THUMBNAIL;//TODO: set this in parsers that handle thumbnails
     };
 
     /**
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/image/PSDParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/image/PSDParser.java
index 4d0510c..b78a366 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/image/PSDParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/image/PSDParser.java
@@ -16,16 +16,23 @@
  */
 package org.apache.tika.parser.image;
 
+import java.io.ByteArrayInputStream;
+import java.io.EOFException;
 import java.io.IOException;
 import java.io.InputStream;
+import java.nio.file.Files;
+import java.nio.file.Paths;
 import java.util.Arrays;
 import java.util.Collections;
 import java.util.HashSet;
 import java.util.Set;
 
-import org.apache.poi.util.IOUtils;
+import org.apache.commons.io.IOUtils;
 import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.EmbeddedDocumentUtil;
 import org.apache.tika.io.EndianUtils;
+import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.Photoshop;
 import org.apache.tika.metadata.TIFF;
@@ -33,6 +40,7 @@ import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.AbstractParser;
 import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.image.xmp.JempboxExtractor;
 import org.apache.tika.sax.XHTMLContentHandler;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
@@ -44,6 +52,9 @@ import static java.nio.charset.StandardCharsets.US_ASCII;
  * <p/>
  * Documentation on the file format is available from
  * http://www.adobe.com/devnet-apps/photoshop/fileformatashtml/PhotoshopFileFormats.htm
+ *
+ * An MIT-licensed python parser with test files is:
+ * https://github.com/psd-tools/psd-tools
  */
 public class PSDParser extends AbstractParser {
 
@@ -56,6 +67,9 @@ public class PSDParser extends AbstractParser {
             Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
                     MediaType.image("vnd.adobe.photoshop"))));
 
+    private static final int MAX_DATA_LENGTH_BYTES = 1000000;
+    private static final int MAX_BLOCKS = 10000;
+
     public Set<MediaType> getSupportedTypes(ParseContext context) {
         return SUPPORTED_TYPES;
     }
@@ -101,19 +115,27 @@ public class PSDParser extends AbstractParser {
 
         // Colour mode, eg Bitmap or RGB
         int colorMode = EndianUtils.readUShortBE(stream);
-        metadata.set(Photoshop.COLOR_MODE, Photoshop._COLOR_MODE_CHOICES_INDEXED[colorMode]);
+        if (colorMode < Photoshop._COLOR_MODE_CHOICES_INDEXED.length) {
+            metadata.set(Photoshop.COLOR_MODE, Photoshop._COLOR_MODE_CHOICES_INDEXED[colorMode]);
+        }
 
         // Next is the Color Mode section
         // We don't care about this bit
         long colorModeSectionSize = EndianUtils.readIntBE(stream);
-        stream.skip(colorModeSectionSize);
+        IOUtils.skipFully(stream, colorModeSectionSize);
 
         // Next is the Image Resources section
         // Check for certain interesting keys here
         long imageResourcesSectionSize = EndianUtils.readIntBE(stream);
         long read = 0;
-        while (read < imageResourcesSectionSize) {
+        //if something is corrupt about this number, prevent an
+        //infinite loop by only reading 10000 blocks
+        int blocks = 0;
+        while (read < imageResourcesSectionSize && blocks < MAX_BLOCKS) {
             ResourceBlock rb = new ResourceBlock(stream);
+            if (rb.totalLength <= 0) {
+                //break;
+            }
             read += rb.totalLength;
 
             // Is it one we can do something useful with?
@@ -124,8 +146,12 @@ public class PSDParser extends AbstractParser {
             } else if (rb.id == ResourceBlock.ID_EXIF_3) {
                 // TODO Parse the EXIF info via ImageMetadataExtractor
             } else if (rb.id == ResourceBlock.ID_XMP) {
-                // TODO Parse the XMP info via ImageMetadataExtractor
+                //if there are multiple xmps in a file, this will
+                //overwrite the data from the earlier xmp
+                JempboxExtractor ex = new JempboxExtractor(metadata);
+                ex.parse(new ByteArrayInputStream(rb.data));
             }
+            blocks++;
         }
 
         // Next is the Layer and Mask Info
@@ -141,17 +167,21 @@ public class PSDParser extends AbstractParser {
     private static class ResourceBlock {
         private static final long SIGNATURE = 0x3842494d; // 8BIM
         private static final int ID_CAPTION = 0x03F0;
-        private static final int ID_URL = 0x040B;
         private static final int ID_EXIF_1 = 0x0422;
         private static final int ID_EXIF_3 = 0x0423;
         private static final int ID_XMP = 0x0424;
+        //TODO
+        private static final int ID_URL = 0x040B;
+        private static final int ID_AUTO_SAVE_FILE_PATH = 0x043E;
+        private static final int ID_THUMBNAIL_RESOURCE = 0x040C;
 
         private int id;
         private String name;
         private byte[] data;
         private int totalLength;
-
+        static int counter = 0;
         private ResourceBlock(InputStream stream) throws IOException, TikaException {
+            counter++;
             // Verify the signature
             long sig = EndianUtils.readIntBE(stream);
             if (sig != SIGNATURE) {
@@ -166,6 +196,9 @@ public class PSDParser extends AbstractParser {
             int nameLen = 0;
             while (true) {
                 int v = stream.read();
+                if (v < 0) {
+                    throw new EOFException();
+                }
                 nameLen++;
 
                 if (v == 0) {
@@ -182,16 +215,26 @@ public class PSDParser extends AbstractParser {
             }
 
             int dataLen = EndianUtils.readIntBE(stream);
+            if (dataLen < 0) {
+                throw new TikaException("data length must be >= 0: "+dataLen);
+            }
             if (dataLen % 2 == 1) {
                 // Data Length is even padded
                 dataLen = dataLen + 1;
             }
+            //protect against overflow
+            if (Integer.MAX_VALUE-dataLen < nameLen+10) {
+                throw new TikaException("data length is too long:"+dataLen);
+            }
             totalLength = 4 + 2 + nameLen + 4 + dataLen;
-
             // Do we have use for the data segment?
             if (captureData(id)) {
-               data = new byte[dataLen];
-               IOUtils.readFully(stream, data);
+                if (dataLen > MAX_DATA_LENGTH_BYTES) {
+                    throw new TikaException("data length must be < "+MAX_DATA_LENGTH_BYTES+
+                            ": "+dataLen);
+                }
+                data = new byte[dataLen];
+                IOUtils.readFully(stream, data);
             } else {
                 data = new byte[0];
                 IOUtils.skipFully(stream, dataLen);
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/image/PSDParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/image/PSDParserTest.java
index 82ebe7b..f748ec5 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/image/PSDParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/image/PSDParserTest.java
@@ -19,14 +19,20 @@ package org.apache.tika.parser.image;
 import static org.junit.Assert.assertEquals;
 
 import java.io.InputStream;
+import java.nio.file.Files;
+import java.nio.file.Paths;
+import java.util.List;
 
+import org.apache.tika.TikaTest;
+import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.XMPMM;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
 import org.junit.Test;
 import org.xml.sax.helpers.DefaultHandler;
 
-public class PSDParserTest {
+public class PSDParserTest extends TikaTest {
 
     private final Parser parser = new PSDParser();
 
@@ -61,4 +67,11 @@ public class PSDParserTest {
         assertEquals("70", metadata.get(Metadata.IMAGE_LENGTH));
         assertEquals("8", metadata.get(Metadata.BITS_PER_SAMPLE));
     }
+
+    @Test
+    public void testXMP() throws Exception {
+        Metadata metadata = getXML("testPSD_xmp.psd").metadata;
+        assertEquals("Adobe Photoshop CC 2014 (Macintosh)", metadata.get(XMPMM.HISTORY_SOFTWARE_AGENT));
+        assertEquals("xmp.iid:63681182-81a0-4035-b4b2-19bea6201c05", metadata.get(XMPMM.HISTORY_EVENT_INSTANCEID));
+    }
 }
diff --git a/tika-parsers/src/test/resources/test-documents/testPSD_xmp.psd b/tika-parsers/src/test/resources/test-documents/testPSD_xmp.psd
new file mode 100644
index 0000000..707df93
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testPSD_xmp.psd differ