You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2011/10/07 22:52:21 UTC
svn commit: r1180230 - in /tika/trunk/tika-parsers/src:
main/java/org/apache/tika/parser/image/ main/resources/META-INF/services/
test/java/org/apache/tika/parser/image/ test/resources/test-documents/
Author: nick
Date: Fri Oct 7 20:52:20 2011
New Revision: 1180230
URL: http://svn.apache.org/viewvc?rev=1180230&view=rev
Log:
TIKA-682 Add a basic PSD metadata extracting Parser
Added:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/PSDParser.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/PSDParserTest.java
tika/trunk/tika-parsers/src/test/resources/test-documents/testPSD.psd (with props)
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageParser.java
tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageParser.java?rev=1180230&r1=1180229&r2=1180230&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageParser.java Fri Oct 7 20:52:20 2011
@@ -55,7 +55,6 @@ public class ImageParser extends Abstrac
MediaType.image("png"),
MediaType.image("vnd.wap.wbmp"),
MediaType.image("x-icon"),
- MediaType.image("x-psd"),
MediaType.image("x-xcf"))));
public Set<MediaType> getSupportedTypes(ParseContext context) {
Added: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/PSDParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/PSDParser.java?rev=1180230&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/PSDParser.java (added)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/PSDParser.java Fri Oct 7 20:52:20 2011
@@ -0,0 +1,274 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.image;
+
+import java.io.EOFException;
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.Charset;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.EndianUtils;
+import org.apache.poi.util.IOUtils;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TIFF;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Parser for the Adobe Photoshop PSD File Format.
+ *
+ * Only metadata is extracted: the image dimensions and bit depth from the
+ * file header, and the caption from the Image Resources section. The XHTML
+ * output is an empty document.
+ *
+ * Documentation on the file format is available from
+ * http://www.adobe.com/devnet-apps/photoshop/fileformatashtml/PhotoshopFileFormats.htm
+ */
+public class PSDParser extends AbstractParser {
+    private static final Set<MediaType> SUPPORTED_TYPES =
+        Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
+                MediaType.image("vnd.adobe.photoshop"))));
+
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    /**
+     * Reads the file header and the Image Resources section, records what it
+     * finds in <code>metadata</code>, and emits an empty XHTML document.
+     *
+     * @throws IOException if the stream ends before the expected data was read
+     * @throws TikaException if the signature, version, a section size or an
+     *         image resource block is invalid
+     */
+    public void parse(
+            InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+        // Check for the magic header signature. A short read leaves zero bytes
+        // in the array, so a truncated file fails the comparison below.
+        byte[] signature = new byte[4];
+        IOUtils.readFully(stream, signature);
+        if(signature[0] == (byte)'8' && signature[1] == (byte)'B' &&
+                signature[2] == (byte)'P' && signature[3] == (byte)'S') {
+            // Good, signature found
+        } else {
+            throw new TikaException("PSD/PSB magic signature invalid");
+        }
+
+        // Check the version
+        int version = EndianUtils.readUShortBE(stream);
+        if(version == 1 || version == 2) {
+            // Good, we support these two
+        } else {
+            throw new TikaException("Invalid PSD/PSB version " + version);
+        }
+
+        // Skip the reserved block
+        skipFully(stream, 6);
+
+        // Number of channels in the image (read only to advance the stream)
+        // TODO Identify a suitable metadata key for this
+        EndianUtils.readUShortBE(stream);
+
+        // Width and Height
+        int height = EndianUtils.readIntBE(stream);
+        int width = EndianUtils.readIntBE(stream);
+        metadata.set(TIFF.IMAGE_LENGTH, height);
+        metadata.set(TIFF.IMAGE_WIDTH, width);
+
+        // Depth (bits per channel)
+        int depth = EndianUtils.readUShortBE(stream);
+        metadata.set(TIFF.BITS_PER_SAMPLE, Integer.toString(depth));
+
+        // Colour mode (read only to advance the stream)
+        // Bitmap = 0; Grayscale = 1; Indexed = 2; RGB = 3; CMYK = 4; Multichannel = 7; Duotone = 8; Lab = 9.
+        // TODO Identify a suitable metadata key for this
+        EndianUtils.readUShortBE(stream);
+
+        // Next is the Color Mode section
+        // We don't care about this bit
+        long colorModeSectionSize = EndianUtils.readIntBE(stream);
+        if(colorModeSectionSize < 0) {
+            throw new TikaException("Invalid Color Mode section size " + colorModeSectionSize);
+        }
+        skipFully(stream, colorModeSectionSize);
+
+        // Next is the Image Resources section
+        // Check for certain interesting keys here
+        long imageResourcesSectionSize = EndianUtils.readIntBE(stream);
+        long read = 0;
+        while(read < imageResourcesSectionSize) {
+            ResourceBlock rb = new ResourceBlock(stream, imageResourcesSectionSize - read);
+            read += rb.totalLength;
+
+            // Is it one we can do something useful with?
+            if(rb.id == ResourceBlock.ID_CAPTION) {
+                metadata.add(Metadata.DESCRIPTION, rb.getDataAsString());
+            } else if(rb.id == ResourceBlock.ID_EXIF_1) {
+                // TODO Parse the EXIF info
+            } else if(rb.id == ResourceBlock.ID_EXIF_3) {
+                // TODO Parse the EXIF info
+            } else if(rb.id == ResourceBlock.ID_XMP) {
+                // TODO Parse the XMP info
+            }
+        }
+
+        // Next is the Layer and Mask Info
+        // Finally we have Image Data
+        // We can't do anything with these parts
+
+        // We don't have any helpful text, sorry...
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+        xhtml.endDocument();
+    }
+
+    /**
+     * Fills the whole buffer from the stream.
+     *
+     * @throws EOFException if the stream ends before the buffer is full
+     */
+    private static void readExactly(InputStream stream, byte[] buffer) throws IOException {
+        if(buffer.length == 0) {
+            return;
+        }
+        // POI's readFully reports a short read through its return value
+        int got = IOUtils.readFully(stream, buffer);
+        if(got < buffer.length) {
+            throw new EOFException("Unexpected end of stream, wanted " +
+                    buffer.length + " bytes but got " + Math.max(got, 0));
+        }
+    }
+
+    /**
+     * Skips exactly <code>count</code> bytes; a single skip() call may skip fewer.
+     *
+     * @throws EOFException if the stream ends before all the bytes were skipped
+     */
+    private static void skipFully(InputStream stream, long count) throws IOException {
+        long remaining = count;
+        while(remaining > 0) {
+            long skipped = stream.skip(remaining);
+            if(skipped <= 0) {
+                // skip() made no progress, so read one byte to tell EOF from a stall
+                if(stream.read() < 0) {
+                    throw new EOFException("Unexpected end of stream, " +
+                            remaining + " bytes left to skip");
+                }
+                skipped = 1;
+            }
+            remaining -= skipped;
+        }
+    }
+
+    /**
+     * One block of the Image Resources section: signature, id, name and data.
+     */
+    private static class ResourceBlock {
+        private static final long SIGNATURE = 0x3842494d; // 8BIM
+        private static final int ID_CAPTION = 0x03F0;
+        private static final int ID_URL = 0x040B;
+        private static final int ID_EXIF_1 = 0x0422;
+        private static final int ID_EXIF_3 = 0x0423;
+        private static final int ID_XMP = 0x0424;
+        private static final Charset ASCII = Charset.forName("ASCII");
+
+        private int id;
+        private String name;
+        private byte[] data;
+        private long totalLength;
+
+        /**
+         * Reads one block from the stream.
+         *
+         * @param maxLength number of bytes left in the Image Resources section
+         * @throws TikaException if the signature or a length is invalid
+         * @throws IOException if the stream ends inside the block
+         */
+        private ResourceBlock(InputStream stream, long maxLength) throws IOException, TikaException {
+            // Verify the signature
+            long sig = EndianUtils.readIntBE(stream);
+            if(sig != SIGNATURE) {
+                throw new TikaException("Invalid Image Resource Block Signature Found, got " +
+                        sig + " 0x" + Long.toHexString(sig) + " but the spec defines " + SIGNATURE);
+            }
+
+            // Read the block
+            id = EndianUtils.readUShortBE(stream);
+
+            // The name is a Pascal string: a length byte, then that many bytes,
+            // padded so the whole field is an even size (an empty name is 00 00)
+            int nameChars = stream.read();
+            if(nameChars < 0) {
+                throw new EOFException("Unexpected end of stream in Image Resource Block name");
+            }
+            byte[] rawName = new byte[nameChars];
+            readExactly(stream, rawName);
+            int nameLen = 1 + nameChars;
+            if(nameLen % 2 == 1) {
+                skipFully(stream, 1);
+                nameLen++;
+            }
+            StringBuilder nameB = new StringBuilder(nameChars);
+            for(byte b : rawName) {
+                nameB.append((char)(b & 0xFF));
+            }
+            name = nameB.toString();
+
+            int dataLen = EndianUtils.readIntBE(stream);
+            if(dataLen < 0) {
+                throw new TikaException("Invalid Image Resource Block data length " + dataLen);
+            }
+            totalLength = 4L + 2 + nameLen + 4 + dataLen;
+            if(totalLength > maxLength) {
+                // Refuse before allocating, the length comes straight from the file
+                throw new TikaException("Image Resource Block of " + totalLength +
+                        " bytes overruns the " + maxLength + " bytes left in the section");
+            }
+
+            data = new byte[dataLen];
+            readExactly(stream, data);
+
+            // Odd-sized data is followed by one padding byte to keep blocks even
+            if(dataLen % 2 == 1 && totalLength < maxLength) {
+                skipFully(stream, 1);
+                totalLength++;
+            }
+        }
+
+        /**
+         * Returns the data decoded as ASCII, without any trailing null padding.
+         */
+        private String getDataAsString() {
+            int end = data.length;
+            while(end > 0 && data[end-1] == 0) {
+                end--;
+            }
+            return new String(data, 0, end, ASCII);
+        }
+    }
+}
Modified: tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser?rev=1180230&r1=1180229&r2=1180230&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser (original)
+++ tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser Fri Oct 7 20:52:20 2011
@@ -22,6 +22,7 @@ org.apache.tika.parser.feed.FeedParser
org.apache.tika.parser.font.TrueTypeParser
org.apache.tika.parser.html.HtmlParser
org.apache.tika.parser.image.ImageParser
+org.apache.tika.parser.image.PSDParser
org.apache.tika.parser.image.TiffParser
org.apache.tika.parser.iwork.IWorkPackageParser
org.apache.tika.parser.jpeg.JpegParser
Added: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/PSDParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/PSDParserTest.java?rev=1180230&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/PSDParserTest.java (added)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/PSDParserTest.java Fri Oct 7 20:52:20 2011
@@ -0,0 +1,53 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.image;
+
+import java.io.InputStream;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.xml.sax.helpers.DefaultHandler;
+
+import junit.framework.TestCase;
+
+public class PSDParserTest extends TestCase {
+
+    private final Parser parser = new PSDParser();
+
+    /**
+     * Tests a very basic file, without much metadata
+     */
+    public void testPSD() throws Exception {
+        Metadata metadata = new Metadata();
+        // The parser is called directly, so this hint plays no part in parser selection
+        metadata.set(Metadata.CONTENT_TYPE, "image/x-psd");
+        InputStream stream =
+            getClass().getResourceAsStream("/test-documents/testPSD.psd");
+        // Fail clearly, rather than inside the parser, if the test file is missing
+        assertNotNull("Test file testPSD.psd not found", stream);
+        try {
+            parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
+        } finally {
+            stream.close();
+        }
+
+        assertEquals("537", metadata.get(Metadata.IMAGE_WIDTH));
+        assertEquals("51", metadata.get(Metadata.IMAGE_LENGTH));
+        assertEquals("8", metadata.get(Metadata.BITS_PER_SAMPLE));
+    }
+}
Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testPSD.psd
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testPSD.psd?rev=1180230&view=auto
==============================================================================
Binary file - no diff available.
Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testPSD.psd
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream