You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2020/02/21 18:35:00 UTC

[tika] branch master updated (c61d0d3 -> d71d71d)

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git.


    from c61d0d3  TIKA-3045 -- Added XMLProfiler as an optional parser to profile XFA and XMP in PDFs
     new ab8a9ed  TIKA-3050 -- add xmp extraction from PSD files
     new d71d71d  bump spring to avoid vulnerable code: https://ossindex.sonatype.org/vuln/fe1be8c0-575d-49bc-906d-582e1dd589dd

The 2 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 CHANGES.txt                                        |   2 +
 .../apache/tika/metadata/TikaCoreProperties.java   |   4 +-
 tika-example/pom.xml                               |   2 +-
 .../org/apache/tika/parser/image/PSDParser.java    |  63 +++++++++++++++++----
 .../apache/tika/parser/image/PSDParserTest.java    |  15 ++++-
 .../test/resources/test-documents/testPSD_xmp.psd  | Bin 0 -> 114796 bytes
 6 files changed, 72 insertions(+), 14 deletions(-)
 create mode 100644 tika-parsers/src/test/resources/test-documents/testPSD_xmp.psd


[tika] 02/02: bump spring to avoid vulnerable code: https://ossindex.sonatype.org/vuln/fe1be8c0-575d-49bc-906d-582e1dd589dd

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git

commit d71d71dc063a3e03078c63a9582746221088f621
Author: tallison <ta...@apache.org>
AuthorDate: Fri Feb 21 13:33:47 2020 -0500

    bump spring to avoid vulnerable code: https://ossindex.sonatype.org/vuln/fe1be8c0-575d-49bc-906d-582e1dd589dd
---
 tika-example/pom.xml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tika-example/pom.xml b/tika-example/pom.xml
index 11c5035..a2b2628 100644
--- a/tika-example/pom.xml
+++ b/tika-example/pom.xml
@@ -152,7 +152,7 @@
     <dependency>
       <groupId>org.springframework</groupId>
       <artifactId>spring-context</artifactId>
-      <version>5.2.1.RELEASE</version>
+      <version>5.2.3.RELEASE</version>
       <exclusions>
         <exclusion>
           <groupId>commons-logging</groupId>


[tika] 01/02: TIKA-3050 -- add xmp extraction from PSD files

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git

commit ab8a9ed830ec710a32e4ffdf4989aea3aaea92ef
Author: tallison <ta...@apache.org>
AuthorDate: Fri Feb 21 13:27:13 2020 -0500

    TIKA-3050 -- add xmp extraction from PSD files
---
 CHANGES.txt                                        |   2 +
 .../apache/tika/metadata/TikaCoreProperties.java   |   4 +-
 .../org/apache/tika/parser/image/PSDParser.java    |  63 +++++++++++++++++----
 .../apache/tika/parser/image/PSDParserTest.java    |  15 ++++-
 .../test/resources/test-documents/testPSD_xmp.psd  | Bin 0 -> 114796 bytes
 5 files changed, 71 insertions(+), 13 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index aa46d1e..b80b750 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -7,6 +7,8 @@ Release 2.0.0 - ???
 
 Release 1.24 - ???
 
+   * Extract XMP from PSD files (TIKA-3050).
+
    * Added XMLProfiler as an optional parser to profile XFA and XMP
      in PDFs (TIKA-3045).
 
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
index 262d354..73e3901 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
@@ -58,8 +58,8 @@ public interface TikaCoreProperties {
         ATTACHMENT,//standard attachment as in email
         MACRO, //any code that is intended to be run by the application
         METADATA, //e.g. xmp, xfa
-        FONT;//embedded font files
-        //what else?
+        FONT,//embedded font files
+        THUMBNAIL;//TODO: set this in parsers that handle thumbnails
     };
 
     /**
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/image/PSDParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/image/PSDParser.java
index 4d0510c..b78a366 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/image/PSDParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/image/PSDParser.java
@@ -16,16 +16,23 @@
  */
 package org.apache.tika.parser.image;
 
+import java.io.ByteArrayInputStream;
+import java.io.EOFException;
 import java.io.IOException;
 import java.io.InputStream;
+import java.nio.file.Files;
+import java.nio.file.Paths;
 import java.util.Arrays;
 import java.util.Collections;
 import java.util.HashSet;
 import java.util.Set;
 
-import org.apache.poi.util.IOUtils;
+import org.apache.commons.io.IOUtils;
 import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.EmbeddedDocumentUtil;
 import org.apache.tika.io.EndianUtils;
+import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.Photoshop;
 import org.apache.tika.metadata.TIFF;
@@ -33,6 +40,7 @@ import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.AbstractParser;
 import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.image.xmp.JempboxExtractor;
 import org.apache.tika.sax.XHTMLContentHandler;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
@@ -44,6 +52,9 @@ import static java.nio.charset.StandardCharsets.US_ASCII;
  * <p/>
  * Documentation on the file format is available from
  * http://www.adobe.com/devnet-apps/photoshop/fileformatashtml/PhotoshopFileFormats.htm
+ *
+ * An MIT-licensed python parser with test files is:
+ * https://github.com/psd-tools/psd-tools
  */
 public class PSDParser extends AbstractParser {
 
@@ -56,6 +67,9 @@ public class PSDParser extends AbstractParser {
             Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
                     MediaType.image("vnd.adobe.photoshop"))));
 
+    private static final int MAX_DATA_LENGTH_BYTES = 1000000;
+    private static final int MAX_BLOCKS = 10000;
+
     public Set<MediaType> getSupportedTypes(ParseContext context) {
         return SUPPORTED_TYPES;
     }
@@ -101,19 +115,27 @@ public class PSDParser extends AbstractParser {
 
         // Colour mode, eg Bitmap or RGB
         int colorMode = EndianUtils.readUShortBE(stream);
-        metadata.set(Photoshop.COLOR_MODE, Photoshop._COLOR_MODE_CHOICES_INDEXED[colorMode]);
+        if (colorMode < Photoshop._COLOR_MODE_CHOICES_INDEXED.length) {
+            metadata.set(Photoshop.COLOR_MODE, Photoshop._COLOR_MODE_CHOICES_INDEXED[colorMode]);
+        }
 
         // Next is the Color Mode section
         // We don't care about this bit
         long colorModeSectionSize = EndianUtils.readIntBE(stream);
-        stream.skip(colorModeSectionSize);
+        IOUtils.skipFully(stream, colorModeSectionSize);
 
         // Next is the Image Resources section
         // Check for certain interesting keys here
         long imageResourcesSectionSize = EndianUtils.readIntBE(stream);
         long read = 0;
-        while (read < imageResourcesSectionSize) {
+        //if something is corrupt about this number, prevent an
+        //infinite loop by only reading 10000 blocks
+        int blocks = 0;
+        while (read < imageResourcesSectionSize && blocks < MAX_BLOCKS) {
             ResourceBlock rb = new ResourceBlock(stream);
+            if (rb.totalLength <= 0) {
+                //break;
+            }
             read += rb.totalLength;
 
             // Is it one we can do something useful with?
@@ -124,8 +146,12 @@ public class PSDParser extends AbstractParser {
             } else if (rb.id == ResourceBlock.ID_EXIF_3) {
                 // TODO Parse the EXIF info via ImageMetadataExtractor
             } else if (rb.id == ResourceBlock.ID_XMP) {
-                // TODO Parse the XMP info via ImageMetadataExtractor
+                //if there are multiple xmps in a file, this will
+                //overwrite the data from the earlier xmp
+                JempboxExtractor ex = new JempboxExtractor(metadata);
+                ex.parse(new ByteArrayInputStream(rb.data));
             }
+            blocks++;
         }
 
         // Next is the Layer and Mask Info
@@ -141,17 +167,21 @@ public class PSDParser extends AbstractParser {
     private static class ResourceBlock {
         private static final long SIGNATURE = 0x3842494d; // 8BIM
         private static final int ID_CAPTION = 0x03F0;
-        private static final int ID_URL = 0x040B;
         private static final int ID_EXIF_1 = 0x0422;
         private static final int ID_EXIF_3 = 0x0423;
         private static final int ID_XMP = 0x0424;
+        //TODO
+        private static final int ID_URL = 0x040B;
+        private static final int ID_AUTO_SAVE_FILE_PATH = 0x043E;
+        private static final int ID_THUMBNAIL_RESOURCE = 0x040C;
 
         private int id;
         private String name;
         private byte[] data;
         private int totalLength;
-
+        static int counter = 0;
         private ResourceBlock(InputStream stream) throws IOException, TikaException {
+            counter++;
             // Verify the signature
             long sig = EndianUtils.readIntBE(stream);
             if (sig != SIGNATURE) {
@@ -166,6 +196,9 @@ public class PSDParser extends AbstractParser {
             int nameLen = 0;
             while (true) {
                 int v = stream.read();
+                if (v < 0) {
+                    throw new EOFException();
+                }
                 nameLen++;
 
                 if (v == 0) {
@@ -182,16 +215,26 @@ public class PSDParser extends AbstractParser {
             }
 
             int dataLen = EndianUtils.readIntBE(stream);
+            if (dataLen < 0) {
+                throw new TikaException("data length must be >= 0: "+dataLen);
+            }
             if (dataLen % 2 == 1) {
                 // Data Length is even padded
                 dataLen = dataLen + 1;
             }
+            //protect against overflow
+            if (Integer.MAX_VALUE-dataLen < nameLen+10) {
+                throw new TikaException("data length is too long:"+dataLen);
+            }
             totalLength = 4 + 2 + nameLen + 4 + dataLen;
-
             // Do we have use for the data segment?
             if (captureData(id)) {
-               data = new byte[dataLen];
-               IOUtils.readFully(stream, data);
+                if (dataLen > MAX_DATA_LENGTH_BYTES) {
+                    throw new TikaException("data length must be < "+MAX_DATA_LENGTH_BYTES+
+                            ": "+dataLen);
+                }
+                data = new byte[dataLen];
+                IOUtils.readFully(stream, data);
             } else {
                 data = new byte[0];
                 IOUtils.skipFully(stream, dataLen);
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/image/PSDParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/image/PSDParserTest.java
index 82ebe7b..f748ec5 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/image/PSDParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/image/PSDParserTest.java
@@ -19,14 +19,20 @@ package org.apache.tika.parser.image;
 import static org.junit.Assert.assertEquals;
 
 import java.io.InputStream;
+import java.nio.file.Files;
+import java.nio.file.Paths;
+import java.util.List;
 
+import org.apache.tika.TikaTest;
+import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.XMPMM;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
 import org.junit.Test;
 import org.xml.sax.helpers.DefaultHandler;
 
-public class PSDParserTest {
+public class PSDParserTest extends TikaTest {
 
     private final Parser parser = new PSDParser();
 
@@ -61,4 +67,11 @@ public class PSDParserTest {
         assertEquals("70", metadata.get(Metadata.IMAGE_LENGTH));
         assertEquals("8", metadata.get(Metadata.BITS_PER_SAMPLE));
     }
+
+    @Test
+    public void testXMP() throws Exception {
+        Metadata metadata = getXML("testPSD_xmp.psd").metadata;
+        assertEquals("Adobe Photoshop CC 2014 (Macintosh)", metadata.get(XMPMM.HISTORY_SOFTWARE_AGENT));
+        assertEquals("xmp.iid:63681182-81a0-4035-b4b2-19bea6201c05", metadata.get(XMPMM.HISTORY_EVENT_INSTANCEID));
+    }
 }
diff --git a/tika-parsers/src/test/resources/test-documents/testPSD_xmp.psd b/tika-parsers/src/test/resources/test-documents/testPSD_xmp.psd
new file mode 100644
index 0000000..707df93
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testPSD_xmp.psd differ