You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2011/10/07 22:52:21 UTC

svn commit: r1180230 - in /tika/trunk/tika-parsers/src: main/java/org/apache/tika/parser/image/ main/resources/META-INF/services/ test/java/org/apache/tika/parser/image/ test/resources/test-documents/

Author: nick
Date: Fri Oct  7 20:52:20 2011
New Revision: 1180230

URL: http://svn.apache.org/viewvc?rev=1180230&view=rev
Log:
TIKA-682 Add a basic PSD metadata extracting Parser

Added:
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/PSDParser.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/PSDParserTest.java
    tika/trunk/tika-parsers/src/test/resources/test-documents/testPSD.psd   (with props)
Modified:
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageParser.java
    tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageParser.java?rev=1180230&r1=1180229&r2=1180230&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageParser.java Fri Oct  7 20:52:20 2011
@@ -55,7 +55,6 @@ public class ImageParser extends Abstrac
                 MediaType.image("png"),
                 MediaType.image("vnd.wap.wbmp"),
                 MediaType.image("x-icon"),
-                MediaType.image("x-psd"),
                 MediaType.image("x-xcf"))));
 
     public Set<MediaType> getSupportedTypes(ParseContext context) {

Added: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/PSDParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/PSDParser.java?rev=1180230&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/PSDParser.java (added)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/PSDParser.java Fri Oct  7 20:52:20 2011
@@ -0,0 +1,187 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.image;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.Charset;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.EndianUtils;
+import org.apache.poi.util.IOUtils;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TIFF;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Parser for the Adobe Photoshop PSD File Format.
+ * 
+ * Documentation on the file format is available from
+ * http://www.adobe.com/devnet-apps/photoshop/fileformatashtml/PhotoshopFileFormats.htm
+ */
+public class PSDParser extends AbstractParser {
+    private static final Set<MediaType> SUPPORTED_TYPES =
+        Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
+                MediaType.image("vnd.adobe.photoshop"))));
+
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    public void parse(
+            InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+        // Check for the magic header signature
+        byte[] signature = new byte[4];
+        IOUtils.readFully(stream, signature);
+        if(signature[0] == (byte)'8' && signature[1] == (byte)'B' &&
+           signature[2] == (byte)'P' && signature[3] == (byte)'S') {
+           // Good, signature found
+        } else {
+           throw new TikaException("PSD/PSB magic signature invalid");
+        }
+        
+        // Check the version
+        int version = EndianUtils.readUShortBE(stream);
+        if(version == 1 || version == 2) {
+           // Good, we support these two
+        } else {
+           throw new TikaException("Invalid PSD/PSB version " + version);
+        }
+        
+        // Skip the reserved block
+        IOUtils.readFully(stream, new byte[6]);
+        
+        // Number of channels in the image
+        int numChannels = EndianUtils.readUShortBE(stream);
+        // TODO Identify a suitable metadata key for this
+
+        // Width and Height
+        int height = EndianUtils.readIntBE(stream);
+        int width = EndianUtils.readIntBE(stream);
+        metadata.set(TIFF.IMAGE_LENGTH, height);
+        metadata.set(TIFF.IMAGE_WIDTH, width);
+        
+        // Depth (bits per channel)
+        int depth = EndianUtils.readUShortBE(stream);
+        metadata.set(TIFF.BITS_PER_SAMPLE, Integer.toString(depth));
+        
+        // Colour mode
+        // Bitmap = 0; Grayscale = 1; Indexed = 2; RGB = 3; CMYK = 4; Multichannel = 7; Duotone = 8; Lab = 9.
+        int colorMode = EndianUtils.readUShortBE(stream);
+        // TODO Identify a suitable metadata key for this
+        
+        // Next is the Color Mode section
+        // We don't care about this bit
+        long colorModeSectionSize = EndianUtils.readIntBE(stream);
+        stream.skip(colorModeSectionSize);
+
+        // Next is the Image Resources section
+        // Check for certain interesting keys here
+        long imageResourcesSectionSize = EndianUtils.readIntBE(stream);
+        long read = 0;
+        while(read < imageResourcesSectionSize) {
+           ResourceBlock rb = new ResourceBlock(stream);
+           read += rb.totalLength;
+           
+           // Is it one we can do something useful with?
+           if(rb.id == ResourceBlock.ID_CAPTION) {
+              metadata.add(Metadata.DESCRIPTION, rb.getDataAsString()); 
+           } else if(rb.id == ResourceBlock.ID_EXIF_1) {
+              // TODO Parse the EXIF info
+           } else if(rb.id == ResourceBlock.ID_EXIF_3) {
+              // TODO Parse the EXIF info
+           } else if(rb.id == ResourceBlock.ID_XMP) {
+              // TODO Parse the XMP info
+           }
+        }
+        
+        // Next is the Layer and Mask Info
+        // Finally we have Image Data
+        // We can't do anything with these parts
+        
+        // We don't have any helpful text, sorry...
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+        xhtml.endDocument();
+    }
+    
+    private static class ResourceBlock {
+       private static final long SIGNATURE = 0x3842494d; // 8BIM
+       private static final int ID_CAPTION = 0x03F0;
+       private static final int ID_URL = 0x040B;
+       private static final int ID_EXIF_1 = 0x0422;
+       private static final int ID_EXIF_3 = 0x0423;
+       private static final int ID_XMP = 0x0424;
+       
+       private int id;
+       private String name;
+       private byte[] data;
+       private int totalLength;
+       private ResourceBlock(InputStream stream) throws IOException, TikaException {
+          // Verify the signature
+          long sig = EndianUtils.readIntBE(stream);
+          if(sig != SIGNATURE) {
+             throw new TikaException("Invalid Image Resource Block Signature Found, got " +
+                   sig + " 0x" + Long.toHexString(sig) + " but the spec defines " + SIGNATURE);
+          }
+          
+          // Read the block
+          id = EndianUtils.readUShortBE(stream);
+          
+          StringBuffer nameB = new StringBuffer();
+          int nameLen = 0;
+          while(true) {
+             int v = stream.read();
+             nameLen++;
+             
+             if(v == 0) {
+                // Even size, may be padded
+                if(nameLen % 2 == 1) {
+                   stream.read();
+                   nameLen++;
+                }
+                break;
+             } else {
+                nameB.append((char)v);
+             }
+             name = nameB.toString();
+          }
+          
+          int dataLen = EndianUtils.readIntBE(stream);
+          totalLength = 4 + 2 + nameLen + 4 + dataLen;
+          
+          data = new byte[dataLen];
+          IOUtils.readFully(stream, data);
+       }
+       
+       private String getDataAsString() {
+          // Will be null padded
+          return new String(data, 0, data.length-1, Charset.forName("ASCII"));
+       }
+    }
+}

Modified: tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser?rev=1180230&r1=1180229&r2=1180230&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser (original)
+++ tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser Fri Oct  7 20:52:20 2011
@@ -22,6 +22,7 @@ org.apache.tika.parser.feed.FeedParser
 org.apache.tika.parser.font.TrueTypeParser
 org.apache.tika.parser.html.HtmlParser
 org.apache.tika.parser.image.ImageParser
+org.apache.tika.parser.image.PSDParser
 org.apache.tika.parser.image.TiffParser
 org.apache.tika.parser.iwork.IWorkPackageParser
 org.apache.tika.parser.jpeg.JpegParser

Added: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/PSDParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/PSDParserTest.java?rev=1180230&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/PSDParserTest.java (added)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/PSDParserTest.java Fri Oct  7 20:52:20 2011
@@ -0,0 +1,46 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.image;
+
+import java.io.InputStream;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.xml.sax.helpers.DefaultHandler;
+
+import junit.framework.TestCase;
+
+public class PSDParserTest extends TestCase {
+
+    private final Parser parser = new PSDParser();
+
+    /**
+     * Tests a very basic file, without much metadata: checks that the
+     * width, height and bits per sample are reported for testPSD.psd.
+     */
+    public void testPSD() throws Exception {
+        Metadata metadata = new Metadata();
+        metadata.set(Metadata.CONTENT_TYPE, "image/x-psd");
+        InputStream stream =
+            getClass().getResourceAsStream("/test-documents/testPSD.psd");
+        // Fail with a clear message rather than an NPE inside the parser
+        assertNotNull("Test document testPSD.psd not found", stream);
+        try {
+            parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
+        } finally {
+            // The parser does not close the stream, so the test must
+            stream.close();
+        }
+
+        assertEquals("537", metadata.get(Metadata.IMAGE_WIDTH));
+        assertEquals("51", metadata.get(Metadata.IMAGE_LENGTH));
+        assertEquals("8", metadata.get(Metadata.BITS_PER_SAMPLE));
+    }
+}

Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testPSD.psd
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testPSD.psd?rev=1180230&view=auto
==============================================================================
Binary file - no diff available.

Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testPSD.psd
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream