You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2010/06/28 15:59:09 UTC

svn commit: r958581 - in /tika/trunk: tika-core/src/main/java/org/apache/tika/metadata/ tika-parsers/src/main/java/org/apache/tika/parser/image/ tika-parsers/src/main/java/org/apache/tika/parser/jpeg/ tika-parsers/src/test/java/org/apache/tika/parser/i...

Author: nick
Date: Mon Jun 28 13:59:08 2010
New Revision: 958581

URL: http://svn.apache.org/viewvc?rev=958581&view=rev
Log:
Use the new TIFF Metadata entries for image width/length/sampling in the TIFF, JPEG and general Image (ImageIO) parsers. This gives a small number of consistent image-related metadata entries across all formats. (TIKA-442)

Modified:
    tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/Metadata.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageParser.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/TiffExtractor.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/jpeg/JpegExtractor.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/ImageParserTest.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/TiffParserTest.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java

Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/Metadata.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/Metadata.java?rev=958581&r1=958580&r2=958581&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/Metadata.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/Metadata.java Mon Jun 28 13:59:08 2010
@@ -25,7 +25,7 @@ import java.util.Properties;
  * A multi-valued metadata container.
  */
 public class Metadata implements CreativeCommons, DublinCore, HttpHeaders,
-        Message, MSOffice, ClimateForcast, TikaMetadataKeys, TikaMimeKeys {
+        Message, MSOffice, ClimateForcast, TIFF, TikaMetadataKeys, TikaMimeKeys {
 
     /**
      * A map of all metadata attributes.

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageParser.java?rev=958581&r1=958580&r2=958581&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageParser.java Mon Jun 28 13:59:08 2010
@@ -32,6 +32,7 @@ import javax.imageio.metadata.IIOMetadat
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.io.CloseShieldInputStream;
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
@@ -70,6 +71,9 @@ public class ImageParser implements Pars
                     ImageReader reader = iterator.next();
                     reader.setInput(ImageIO.createImageInputStream(
                             new CloseShieldInputStream(stream)));
+                    
+                    metadata.set(Metadata.IMAGE_WIDTH, Integer.toString(reader.getWidth(0)));
+                    metadata.set(Metadata.IMAGE_LENGTH, Integer.toString(reader.getHeight(0)));
                     metadata.set("height", Integer.toString(reader.getHeight(0)));
                     metadata.set("width", Integer.toString(reader.getWidth(0)));
 
@@ -77,6 +81,12 @@ public class ImageParser implements Pars
 
                     reader.dispose();
                 }
+                
+                // Translate certain Metadata tags from the ImageIO
+                //  specific namespace into the general Tika one
+                setIfPresent(metadata, "CommentExtensions CommentExtension", Metadata.COMMENTS);
+                setIfPresent(metadata, "markerSequence com", Metadata.COMMENTS);
+                setIfPresent(metadata, "Data BitsPerSample", Metadata.BITS_PER_SAMPLE);
             } catch (IIOException e) {
                 throw new TikaException(type + " parse error", e);
             }
@@ -95,6 +105,21 @@ public class ImageParser implements Pars
             throws IOException, SAXException, TikaException {
         parse(stream, handler, metadata, new ParseContext());
     }
+    
+    /** Copies the value stored under the ImageIO key, if any, to the Tika key. */
+    private static void setIfPresent(Metadata metadata, String imageIOkey, String tikaKey) {
+        String value = metadata.get(imageIOkey);
+        if (value != null) {
+            metadata.set(tikaKey, value);
+        }
+    }
+    /** Same copy for a Property target, dropping one trailing space from the value. */
+    private static void setIfPresent(Metadata metadata, String imageIOkey, Property tikaProp) {
+        String value = metadata.get(imageIOkey);
+        if (value != null) {
+            metadata.set(tikaProp, value.endsWith(" ") ? value.substring(0, value.length() - 1) : value);
+        }
+    }
 
     private static void loadMetadata(IIOMetadata imageMetadata, Metadata metadata) {
         String[] names = imageMetadata.getMetadataFormatNames();

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/TiffExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/TiffExtractor.java?rev=958581&r1=958580&r2=958581&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/TiffExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/TiffExtractor.java Mon Jun 28 13:59:08 2010
@@ -19,9 +19,12 @@ package org.apache.tika.parser.image;
 import java.io.IOException;
 import java.io.InputStream;
 import java.util.Iterator;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
 
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
 import org.xml.sax.SAXException;
 
 import com.drew.imaging.tiff.TiffMetadataReader;
@@ -30,15 +33,15 @@ import com.drew.metadata.Directory;
 import com.drew.metadata.MetadataException;
 import com.drew.metadata.Tag;
 
-class TiffExtractor {
+public class TiffExtractor {
 
     private final Metadata metadata;
 
-    public TiffExtractor(Metadata metadata) {
+    protected TiffExtractor(Metadata metadata) {
         this.metadata = metadata;
     }
 
-    public void parse(InputStream stream)
+    protected void parse(InputStream stream)
             throws IOException, SAXException, TikaException {
         try {
             com.drew.metadata.Metadata tiffMetadata =
@@ -52,6 +55,7 @@ class TiffExtractor {
                 while (tags.hasNext()) {
                     Tag tag = (Tag)tags.next();
                     metadata.set(tag.getTagName(), tag.getDescription());
+                    handleCommonImageTags(metadata, tag);
                 }
             }
         } catch (TiffProcessingException e) {
@@ -61,4 +65,55 @@ class TiffExtractor {
         }
     }
 
+
+    /**
+     * Maps common TIFF and EXIF tags, matched by name or numeric type, onto
+     *  the Tika TIFF image metadata namespace. Tags with no description are skipped.
+     *
+     * @param metadata the Tika metadata that receives the mapped entries
+     * @param tag the tag to inspect
+     */
+    public static void handleCommonImageTags(Metadata metadata, Tag tag) throws MetadataException {
+        String name = tag.getTagName();
+        int type = tag.getTagType();
+        String description = tag.getDescription();
+        if (description == null) {
+            return; // nothing to map; the string handling below would throw
+        }
+        // Core tags
+        if (name.equals("Date/Time") || type == 306) {
+            // Only the date part (before the first space) has ':' turned into '/'
+            String date = description;
+            int splitAt = date.indexOf(' ');
+            if (splitAt > -1) {
+                date = date.substring(0, splitAt).replace(':', '/') + date.substring(splitAt);
+            }
+            metadata.set(Metadata.DATE, date);
+            return;
+        }
+        if (name.equals("Keywords") || type == 537) {
+            metadata.set(Metadata.KEYWORDS, description);
+        }
+        // EXIF / TIFF Tags; a later match below overrides an earlier one
+        Property key = null;
+        if (name.equals("Image Width") || type == 256) {
+            key = Metadata.IMAGE_WIDTH;
+        }
+        if (name.equals("Image Height") || type == 257) {
+            key = Metadata.IMAGE_LENGTH;
+        }
+        if (name.equals("Data Precision") || name.equals("Bits Per Sample") || type == 258) {
+            key = Metadata.BITS_PER_SAMPLE;
+        }
+        if (type == 277) {
+            key = Metadata.SAMPLES_PER_PIXEL;
+        }
+        if (key != null) {
+            Matcher m = LEADING_NUMBERS.matcher(description);
+            if (m.matches()) {
+                metadata.set(key, m.group(1));
+            }
+        }
+    }
+    private static final Pattern LEADING_NUMBERS = Pattern.compile("(\\d+)\\s*.*");
 }

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/jpeg/JpegExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/jpeg/JpegExtractor.java?rev=958581&r1=958580&r2=958581&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/jpeg/JpegExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/jpeg/JpegExtractor.java Mon Jun 28 13:59:08 2010
@@ -22,6 +22,7 @@ import java.util.Iterator;
 
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.image.TiffExtractor;
 import org.xml.sax.SAXException;
 
 import com.drew.imaging.jpeg.JpegMetadataReader;
@@ -52,6 +53,7 @@ class JpegExtractor {
                 while (tags.hasNext()) {
                     Tag tag = (Tag)tags.next();
                     metadata.set(tag.getTagName(), tag.getDescription());
+                    TiffExtractor.handleCommonImageTags(metadata, tag);
                 }
             }
         } catch (JpegProcessingException e) {
@@ -60,5 +62,4 @@ class JpegExtractor {
             throw new TikaException("Can't read JPEG metadata", e);
         }
     }
-
 }

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/ImageParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/ImageParserTest.java?rev=958581&r1=958580&r2=958581&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/ImageParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/ImageParserTest.java Mon Jun 28 13:59:08 2010
@@ -43,6 +43,10 @@ public class ImageParserTest extends Tes
         assertEquals("0", metadata.get("Dimension HorizontalPhysicalPixelSpacing"));
         assertEquals("BI_RGB", metadata.get("Compression CompressionTypeName"));
         assertEquals("image/bmp", metadata.get("Content-Type"));
+        
+        assertEquals("100", metadata.get(Metadata.IMAGE_WIDTH));
+        assertEquals("75", metadata.get(Metadata.IMAGE_LENGTH));
+        assertEquals("8 8 8", metadata.get(Metadata.BITS_PER_SAMPLE));
     }
 
     public void testGIF() throws Exception {
@@ -69,6 +73,10 @@ public class ImageParserTest extends Tes
         assertEquals("disposalMethod=none, userInputFlag=false, transparentColorFlag=false, delayTime=0, transparentColorIndex=0", metadata.get("GraphicControlExtension"));
         assertEquals("0", metadata.get("Dimension VerticalPixelOffset"));
         assertEquals("image/gif", metadata.get("Content-Type"));
+        
+        assertEquals("100", metadata.get(Metadata.IMAGE_WIDTH));
+        assertEquals("75", metadata.get(Metadata.IMAGE_LENGTH));
+        assertEquals("Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements.  See the NOTICE file distributed with this work for additional information regarding copyright ownership.", metadata.get(Metadata.COMMENTS));
     }
 
     public void testJPEG() throws Exception {
@@ -100,6 +108,10 @@ public class ImageParserTest extends Tes
         assertEquals("keyword=comment, value=Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements.  See the NOTICE file distributed with this work for additional information regarding copyright ownership.", metadata.get("Text TextEntry"));
         assertEquals("image/jpeg", metadata.get("Content-Type"));
         assertEquals("process=0, samplePrecision=8, numLines=75, samplesPerLine=100, numFrameComponents=3", metadata.get("markerSequence sof"));
+        
+        assertEquals("100", metadata.get(Metadata.IMAGE_WIDTH));
+        assertEquals("75", metadata.get(Metadata.IMAGE_LENGTH));
+        assertEquals("Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements.  See the NOTICE file distributed with this work for additional information regarding copyright ownership.", metadata.get(Metadata.COMMENTS));
     }
 
     public void testPNG() throws Exception {
@@ -133,6 +145,10 @@ public class ImageParserTest extends Tes
         assertEquals("true", metadata.get("Chroma BlackIsZero"));
         assertEquals("year=2008, month=5, day=6, hour=6, minute=18, second=47", metadata.get("Document ImageModificationTime"));
         assertEquals("image/png", metadata.get("Content-Type"));
+        
+        assertEquals("100", metadata.get(Metadata.IMAGE_WIDTH));
+        assertEquals("75", metadata.get(Metadata.IMAGE_LENGTH));
+        assertEquals("8 8 8", metadata.get(Metadata.BITS_PER_SAMPLE));
     }
 
 // TODO: Add TIFF support
@@ -145,6 +161,11 @@ public class ImageParserTest extends Tes
 //
 //        assertEquals("75", metadata.get("height"));
 //        assertEquals("100", metadata.get("width"));
+//    
+//        assertEquals("100", metadata.get(Metadata.IMAGE_WIDTH));
+//        assertEquals("75", metadata.get(Metadata.IMAGE_LENGTH));
+//        assertEquals("8 8 8", metadata.get(Metadata.BITS_PER_SAMPLE));
+//        assertEquals("Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements.  See the NOTICE file distributed with this work for additional information regarding copyright ownership.", metadata.get(Metadata.COMMENTS));
 //    }
 
 }

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/TiffParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/TiffParserTest.java?rev=958581&r1=958580&r2=958581&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/TiffParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/TiffParserTest.java Mon Jun 28 13:59:08 2010
@@ -38,5 +38,14 @@ public class TiffParserTest extends Test
         		"more contributor license agreements.  See the NOTICE file " +
         		"distributed with this work for additional information regarding " +
         		"copyright ownership.", metadata.get("Image Description"));
+        
+        // All EXIF/TIFF tags
+        assertEquals("Inch", metadata.get("Resolution Unit"));
+        
+        // Core EXIF/TIFF tags
+        assertEquals("100", metadata.get(Metadata.IMAGE_WIDTH));
+        assertEquals("75", metadata.get(Metadata.IMAGE_LENGTH));
+        assertEquals("8", metadata.get(Metadata.BITS_PER_SAMPLE));
+        assertEquals("3", metadata.get(Metadata.SAMPLES_PER_PIXEL));
     }
 }

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java?rev=958581&r1=958580&r2=958581&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java Mon Jun 28 13:59:08 2010
@@ -33,7 +33,18 @@ public class JpegParserTest extends Test
             getClass().getResourceAsStream("/test-documents/testJPEG_EXIF.jpg");
         parser.parse(stream, new DefaultHandler(), metadata);
 
+        // All EXIF/TIFF tags
         assertEquals("Canon EOS 40D", metadata.get("Model"));
+        
+        // Core EXIF/TIFF tags
+        assertEquals("100", metadata.get(Metadata.IMAGE_WIDTH));
+        assertEquals("68", metadata.get(Metadata.IMAGE_LENGTH));
+        assertEquals("8", metadata.get(Metadata.BITS_PER_SAMPLE));
+        assertEquals(null, metadata.get(Metadata.SAMPLES_PER_PIXEL));
+        
+        // Common tags
+        assertEquals("2009/10/02 23:02:49", metadata.get(Metadata.DATE));
+        assertEquals("canon-55-250 moscow-birds serbor", metadata.get(Metadata.KEYWORDS));
     }
 
 }