You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2010/04/14 11:34:55 UTC

svn commit: r933893 - in /lucene/tika/trunk/tika-parsers/src: main/java/org/apache/tika/parser/image/ImageParser.java test/java/org/apache/tika/parser/image/ImageParserTest.java

Author: jukka
Date: Wed Apr 14 09:34:54 2010
New Revision: 933893

URL: http://svn.apache.org/viewvc?rev=933893&view=rev
Log:
TIKA-92: Image metadata extraction

Patch by Dmitry Kuzmenko.

Modified:
    lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageParser.java
    lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/ImageParserTest.java

Modified: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageParser.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageParser.java?rev=933893&r1=933892&r2=933893&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageParser.java (original)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageParser.java Wed Apr 14 09:34:54 2010
@@ -16,7 +16,7 @@
  */
 package org.apache.tika.parser.image;
 
- import java.io.IOException;
+import java.io.IOException;
 import java.io.InputStream;
 import java.util.Arrays;
 import java.util.Collections;
@@ -27,6 +27,7 @@ import java.util.Set;
 import javax.imageio.IIOException;
 import javax.imageio.ImageIO;
 import javax.imageio.ImageReader;
+import javax.imageio.metadata.IIOMetadata;
 
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.io.CloseShieldInputStream;
@@ -35,6 +36,8 @@ import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.sax.XHTMLContentHandler;
+import org.w3c.dom.NamedNodeMap;
+import org.w3c.dom.Node;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 
@@ -70,6 +73,9 @@ public class ImageParser implements Pars
                             new CloseShieldInputStream(stream)));
                     metadata.set("height", Integer.toString(reader.getHeight(0)));
                     metadata.set("width", Integer.toString(reader.getWidth(0)));
+
+                    loadMetadata(reader.getImageMetadata(0), metadata);
+
                     reader.dispose();
                 }
             } catch (IIOException e) {
@@ -91,4 +97,51 @@ public class ImageParser implements Pars
         parse(stream, handler, metadata, new ParseContext());
     }
 
+    /** Copies every metadata format tree of the image into {@code metadata}. */
+    private static void loadMetadata(IIOMetadata imageMetadata, Metadata metadata) {
+        // getImageMetadata(int) and getMetadataFormatNames() may both return null.
+        String[] names = imageMetadata == null ? null : imageMetadata.getMetadataFormatNames();
+        if (names != null) {
+            for (String name : names) {
+                loadNode(metadata, imageMetadata.getAsTree(name), "", false);
+            }
+        }
+    }
+
+    /**
+     * Recursively copies the attributes of a metadata tree into {@code metadata}.
+     * The key is the space-separated path of node names; a single attribute is
+     * stored as its value, several attributes as a "name=value, ..." list.
+     *
+     * @param metadata        target the attribute values are added to
+     * @param node            tree node to process together with its children
+     * @param parents         space-separated names of the ancestor nodes
+     * @param addThisNodeName whether this node's name is appended to the key
+     */
+    private static void loadNode(
+            Metadata metadata, Node node, String parents, boolean addThisNodeName) {
+        if (addThisNodeName) {
+            parents = parents.length() > 0
+                    ? parents + " " + node.getNodeName() : node.getNodeName();
+        }
+        NamedNodeMap map = node.getAttributes();
+        int length = map == null ? 0 : map.getLength();
+        if (length == 1) {
+            metadata.add(parents, map.item(0).getNodeValue());
+        } else if (length > 1) {
+            // Local, single-threaded buffer: StringBuilder avoids needless locking.
+            StringBuilder value = new StringBuilder();
+            for (int i = 0; i < length; i++) {
+                Node attr = map.item(i);
+                value.append(i > 0 ? ", " : "");
+                value.append(attr.getNodeName()).append('=').append(attr.getNodeValue());
+            }
+            metadata.add(parents, value.toString());
+        }
+        // Descend into the children, extending the key with each child's name.
+        for (Node child = node.getFirstChild(); child != null; child = child.getNextSibling()) {
+            loadNode(metadata, child, parents, true);
+        }
+    }
+
 }

Modified: lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/ImageParserTest.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/ImageParserTest.java?rev=933893&r1=933892&r2=933893&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/ImageParserTest.java (original)
+++ lucene/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/ImageParserTest.java Wed Apr 14 09:34:54 2010
@@ -37,6 +37,12 @@ public class ImageParserTest extends Tes
 
         assertEquals("75", metadata.get("height"));
         assertEquals("100", metadata.get("width"));
+        assertEquals("8 8 8 ", metadata.get("Data BitsPerSample"));
+        assertEquals("1.0", metadata.get("Dimension PixelAspectRatio"));
+        assertEquals("0", metadata.get("Dimension VerticalPhysicalPixelSpacing"));
+        assertEquals("0", metadata.get("Dimension HorizontalPhysicalPixelSpacing"));
+        assertEquals("BI_RGB", metadata.get("Compression CompressionTypeName"));
+        assertEquals("image/bmp", metadata.get("Content-Type"));
     }
 
     public void testGIF() throws Exception {
@@ -48,6 +54,21 @@ public class ImageParserTest extends Tes
 
         assertEquals("75", metadata.get("height"));
         assertEquals("100", metadata.get("width"));
+        assertEquals("TRUE", metadata.get("Compression Lossless"));
+        assertEquals("Normal", metadata.get("Dimension ImageOrientation"));
+        assertEquals("lzw", metadata.get("Compression CompressionTypeName"));
+        assertEquals("0", metadata.get("Dimension HorizontalPixelOffset"));
+        assertEquals("imageLeftPosition=0, imageTopPosition=0, imageWidth=100, imageHeight=75, interlaceFlag=false", metadata.get("ImageDescriptor"));
+        assertEquals("Index", metadata.get("Data SampleFormat"));
+        assertEquals("3", metadata.get("Chroma NumChannels"));
+        assertEquals("1", metadata.get("Compression NumProgressiveScans"));
+        assertEquals("RGB", metadata.get("Chroma ColorSpaceType"));
+        assertEquals("Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements.  See the NOTICE file distributed with this work for additional information regarding copyright ownership.", metadata.get("CommentExtensions CommentExtension"));
+        assertEquals("value=Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements.  See the NOTICE file distributed with this work for additional information regarding copyright ownership., encoding=ISO-8859-1, compression=none", metadata.get("Text TextEntry"));
+        assertEquals("TRUE", metadata.get("Chroma BlackIsZero"));
+        assertEquals("disposalMethod=none, userInputFlag=false, transparentColorFlag=false, delayTime=0, transparentColorIndex=0", metadata.get("GraphicControlExtension"));
+        assertEquals("0", metadata.get("Dimension VerticalPixelOffset"));
+        assertEquals("image/gif", metadata.get("Content-Type"));
     }
 
     public void testJPEG() throws Exception {
@@ -59,6 +80,26 @@ public class ImageParserTest extends Tes
 
         assertEquals("75", metadata.get("height"));
         assertEquals("100", metadata.get("width"));
+        assertEquals("0.35277778", metadata.get("Dimension VerticalPixelSize"));
+        assertEquals("false", metadata.get("Compression Lossless"));
+        assertEquals("class=0, htableId=0", metadata.get("markerSequence dht dhtable"));
+        assertEquals("majorVersion=1, minorVersion=1, resUnits=1, Xdensity=72, Ydensity=72, thumbWidth=0, thumbHeight=0", metadata.get("JPEGvariety app0JFIF"));
+        assertEquals("225", metadata.get("markerSequence unknown"));
+        assertEquals("componentSelector=1, dcHuffTable=0, acHuffTable=0", metadata.get("markerSequence sos scanComponentSpec"));
+        assertEquals("normal", metadata.get("Dimension ImageOrientation"));
+        assertEquals("1.0", metadata.get("Dimension PixelAspectRatio"));
+        assertEquals("elementPrecision=0, qtableId=0", metadata.get("markerSequence dqt dqtable"));
+        assertEquals("numScanComponents=3, startSpectralSelection=0, endSpectralSelection=63, approxHigh=0, approxLow=0", metadata.get("markerSequence sos"));
+        assertEquals("componentId=1, HsamplingFactor=1, VsamplingFactor=1, QtableSelector=0", metadata.get("markerSequence sof componentSpec"));
+        assertEquals("JPEG", metadata.get("Compression CompressionTypeName"));
+        assertEquals("0.35277778", metadata.get("Dimension HorizontalPixelSize"));
+        assertEquals("Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements.  See the NOTICE file distributed with this work for additional information regarding copyright ownership.", metadata.get("markerSequence com"));
+        assertEquals("3", metadata.get("Chroma NumChannels"));
+        assertEquals("1", metadata.get("Compression NumProgressiveScans"));
+        assertEquals("YCbCr", metadata.get("Chroma ColorSpaceType"));
+        assertEquals("keyword=comment, value=Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements.  See the NOTICE file distributed with this work for additional information regarding copyright ownership.", metadata.get("Text TextEntry"));
+        assertEquals("image/jpeg", metadata.get("Content-Type"));
+        assertEquals("process=0, samplePrecision=8, numLines=75, samplesPerLine=100, numFrameComponents=3", metadata.get("markerSequence sof"));
     }
 
     public void testPNG() throws Exception {
@@ -70,6 +111,28 @@ public class ImageParserTest extends Tes
 
         assertEquals("75", metadata.get("height"));
         assertEquals("100", metadata.get("width"));
+        assertEquals("0.35273367", metadata.get("Dimension VerticalPixelSize"));
+        assertEquals("8 8 8", metadata.get("Data BitsPerSample"));
+        assertEquals("Perceptual", metadata.get("sRGB"));
+        assertEquals("true", metadata.get("Compression Lossless"));
+        assertEquals("year=2008, month=5, day=6, hour=6, minute=18, second=47", metadata.get("tIME"));
+        assertEquals("Normal", metadata.get("Dimension ImageOrientation"));
+        assertEquals("1.0", metadata.get("Dimension PixelAspectRatio"));
+        assertEquals("keyword=Comment, value=Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements.  See the NOTICE file distributed with this work for additional information regarding copyright ownership.", metadata.get("tEXt tEXtEntry"));
+        assertEquals("deflate", metadata.get("Compression CompressionTypeName"));
+        assertEquals("UnsignedIntegral", metadata.get("Data SampleFormat"));
+        assertEquals("0.35273367", metadata.get("Dimension HorizontalPixelSize"));
+        assertEquals("none", metadata.get("Transparency Alpha"));
+        assertEquals("pixelsPerUnitXAxis=2835, pixelsPerUnitYAxis=2835, unitSpecifier=meter", metadata.get("pHYs"));
+        assertEquals("3", metadata.get("Chroma NumChannels"));
+        assertEquals("1", metadata.get("Compression NumProgressiveScans"));
+        assertEquals("RGB", metadata.get("Chroma ColorSpaceType"));
+        assertEquals("keyword=Comment, value=Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements.  See the NOTICE file distributed with this work for additional information regarding copyright ownership., encoding=ISO-8859-1, compression=none", metadata.get("Text TextEntry"));
+        assertEquals("PixelInterleaved", metadata.get("Data PlanarConfiguration"));
+        assertEquals("width=100, height=75, bitDepth=8, colorType=RGB, compressionMethod=deflate, filterMethod=adaptive, interlaceMethod=none", metadata.get("IHDR"));
+        assertEquals("true", metadata.get("Chroma BlackIsZero"));
+        assertEquals("year=2008, month=5, day=6, hour=6, minute=18, second=47", metadata.get("Document ImageModificationTime"));
+        assertEquals("image/png", metadata.get("Content-Type"));
     }
 
 // TODO: Add TIFF support