You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2010/04/14 11:38:59 UTC

svn commit: r933894 - in /lucene/tika/trunk/tika-parsers/src: main/java/org/apache/tika/parser/image/ main/resources/META-INF/services/ test/java/org/apache/tika/parser/image/

Author: jukka
Date: Wed Apr 14 09:38:59 2010
New Revision: 933894

URL: http://svn.apache.org/viewvc?rev=933894&view=rev
Log:
TIKA-92: Image metadata extraction

Patch by Dmitry Kuzmenko.

Added:
    lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/TiffExtractor.java   (with props)
    lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/TiffParser.java   (with props)
    lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/TiffParserTest.java   (with props)
Modified:
    lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageParser.java
    lucene/tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser

Modified: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageParser.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageParser.java?rev=933894&r1=933893&r2=933894&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageParser.java (original)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageParser.java Wed Apr 14 09:38:59 2010
@@ -48,7 +48,6 @@ public class ImageParser implements Pars
                 MediaType.image("bmp"),
                 MediaType.image("gif"),
                 MediaType.image("png"),
-                MediaType.image("tiff"),
                 MediaType.image("vnd.wap.wbmp"),
                 MediaType.image("x-icon"),
                 MediaType.image("x-psd"),

Added: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/TiffExtractor.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/TiffExtractor.java?rev=933894&view=auto
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/TiffExtractor.java (added)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/TiffExtractor.java Wed Apr 14 09:38:59 2010
@@ -0,0 +1,64 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.image;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Iterator;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.xml.sax.SAXException;
+
+import com.drew.imaging.tiff.TiffMetadataReader;
+import com.drew.imaging.tiff.TiffProcessingException;
+import com.drew.metadata.Directory;
+import com.drew.metadata.MetadataException;
+import com.drew.metadata.Tag;
+
+class TiffExtractor { // Package-private helper that copies the tags of a TIFF stream into a Tika Metadata object.
+    // Destination for extracted tags; supplied by the caller and written by parse().
+    private final Metadata metadata;
+    // Stores the Metadata instance that later parse() calls write into.
+    public TiffExtractor(Metadata metadata) {
+        this.metadata = metadata;
+    }
+    // Reads TIFF metadata from the stream and records every tag as (tag name -> tag description).
+    public void parse(InputStream stream)
+            throws IOException, SAXException, TikaException {
+        try {
+            com.drew.metadata.Metadata tiffMetadata = // library type, fully qualified so it does not clash with Tika's Metadata
+                TiffMetadataReader.readMetadata(stream);
+            // Visit each directory returned by the reader, then each tag inside that directory.
+            Iterator<?> directories = tiffMetadata.getDirectoryIterator();
+            while (directories.hasNext()) {
+                Directory directory = (Directory) directories.next();
+                Iterator<?> tags = directory.getTagIterator();
+
+                while (tags.hasNext()) {
+                    Tag tag = (Tag)tags.next();
+                    metadata.set(tag.getTagName(), tag.getDescription()); // NOTE(review): a tag name repeated across directories presumably replaces the earlier value - confirm
+                }
+            }
+        } catch (TiffProcessingException e) { // library failures are rethrown as TikaException with the original exception as cause
+            throw new TikaException("Can't read TIFF metadata", e);
+        } catch (MetadataException e) {
+            throw new TikaException("Can't read TIFF metadata", e);
+        }
+    }
+
+}

Propchange: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/TiffExtractor.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/TiffParser.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/TiffParser.java?rev=933894&view=auto
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/TiffParser.java (added)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/TiffParser.java Wed Apr 14 09:38:59 2010
@@ -0,0 +1,68 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.image;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.Set;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+public class TiffParser implements Parser { // Parser for image/tiff that populates metadata and writes no document body.
+
+    private static final Set<MediaType> SUPPORTED_TYPES = // the only media type this parser declares
+        Collections.singleton(MediaType.image("tiff"));
+    // Returns the fixed supported-type set; the context argument is not consulted.
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    /**
+     * @deprecated This method will be removed in Apache Tika 1.0.
+     */
+    public void parse(
+            InputStream stream, ContentHandler handler, Metadata metadata)
+            throws IOException, SAXException, TikaException {
+        parse(stream, handler, metadata, new ParseContext()); // delegates to the four-argument overload with a newly created context
+    }
+    // Extracts TIFF tags into metadata, removes "Unknown tag" entries, then emits an empty XHTML document.
+    public void parse(
+            InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+        new TiffExtractor(metadata).parse(stream); // the context argument is not used by this method
+
+        for (String s : metadata.names()) { // NOTE(review): removal inside this loop assumes names() returns a snapshot, not a live view - confirm
+            if (s.startsWith("Unknown tag")) {
+                metadata.remove(s);
+            }
+        }
+
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument(); // nothing is written between the start and the end of the document
+        xhtml.endDocument();
+    }
+
+}

Propchange: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/TiffParser.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: lucene/tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser?rev=933894&r1=933893&r2=933894&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser (original)
+++ lucene/tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser Wed Apr 14 09:38:59 2010
@@ -19,6 +19,7 @@ org.apache.tika.parser.audio.MidiParser
 org.apache.tika.parser.epub.EpubParser
 org.apache.tika.parser.html.HtmlParser
 org.apache.tika.parser.image.ImageParser
+org.apache.tika.parser.image.TiffParser
 org.apache.tika.parser.jpeg.JpegParser
 org.apache.tika.parser.mbox.MboxParser
 org.apache.tika.parser.microsoft.OfficeParser

Added: lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/TiffParserTest.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/TiffParserTest.java?rev=933894&view=auto
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/TiffParserTest.java (added)
+++ lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/TiffParserTest.java Wed Apr 14 09:38:59 2010
@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.image;
+
+import junit.framework.TestCase;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.image.TiffParser;
+import org.apache.tika.metadata.Metadata;
+import org.xml.sax.helpers.DefaultHandler;
+
+import java.io.InputStream;
+
+public class TiffParserTest extends TestCase { // Exercises TiffParser against the bundled testTIFF.tif document.
+    private final Parser parser = new TiffParser();
+    // Parses the test image and checks the value stored under the "Image Description" metadata key.
+    public void testTIFF() throws Exception {
+        Metadata metadata = new Metadata();
+        metadata.set(Metadata.CONTENT_TYPE, "image/tiff");
+        InputStream stream = // NOTE(review): never closed here, and would be null if the resource were missing - confirm this is acceptable
+            getClass().getResourceAsStream("/test-documents/testTIFF.tif");
+        parser.parse(stream, new DefaultHandler(), metadata); // three-argument parse overload; only the metadata is inspected afterwards
+
+        assertEquals("Licensed to the Apache Software Foundation (ASF) under one or " +
+        		"more contributor license agreements.  See the NOTICE file " +
+        		"distributed with this work for additional information regarding " +
+        		"copyright ownership.", metadata.get("Image Description"));
+    }
+}

Propchange: lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/TiffParserTest.java
------------------------------------------------------------------------------
    svn:eol-style = native