You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2010/11/10 16:57:34 UTC

svn commit: r1033546 - in /tika/trunk/tika-parsers/src: main/java/org/apache/tika/parser/image/ main/java/org/apache/tika/parser/jpeg/ test/java/org/apache/tika/parser/image/ test/java/org/apache/tika/parser/image/xmp/ test/java/org/apache/tika/parser/...

Author: nick
Date: Wed Nov 10 15:57:34 2010
New Revision: 1033546

URL: http://svn.apache.org/viewvc?rev=1033546&view=rev
Log:
Improved extraction of EXIF and IPTC metadata from JPEG and TIFF Images (TIKA-482)
(Applies patch from Staffan Olsson from TIKA-482)

Added:
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/MetadataFields.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/ImageMetadataExtractorTest.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/MetadataFieldsTest.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/xmp/
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/xmp/JempboxExtractorTest.java
    tika/trunk/tika-parsers/src/test/resources/test-documents/testJPEG_commented_pspcs2mac.jpg   (with props)
    tika/trunk/tika-parsers/src/test/resources/test-documents/testJPEG_commented_xnviewmp026.jpg   (with props)
Modified:
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageMetadataExtractor.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/TiffParser.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/jpeg/JpegParser.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/TiffParserTest.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java
    tika/trunk/tika-parsers/src/test/resources/test-documents/testTIFF.tif

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageMetadataExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageMetadataExtractor.java?rev=1033546&r1=1033545&r2=1033546&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageMetadataExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageMetadataExtractor.java Wed Nov 10 15:57:34 2010
@@ -41,303 +41,439 @@ import com.drew.lang.Rational;
 import com.drew.metadata.Directory;
 import com.drew.metadata.MetadataException;
 import com.drew.metadata.Tag;
-
+import com.drew.metadata.exif.ExifDirectory;
+import com.drew.metadata.exif.GpsDirectory;
+import com.drew.metadata.iptc.IptcDirectory;
+import com.drew.metadata.jpeg.JpegCommentDirectory;
+import com.drew.metadata.jpeg.JpegDirectory;
+
+/**
+ * Uses the <a href="http://www.drewnoakes.com/code/exif/">Metadata Extractor</a> library
+ * to read EXIF and IPTC image metadata and map to Tika fields.
+ * 
+ * As of 2.4.0 the library supports jpeg and tiff.
+ */
 public class ImageMetadataExtractor {
 
     private final Metadata metadata;
+    private DirectoryHandler[] handlers;
 
+    /**
+     * @param metadata to extract to, using default directory handlers
+     */
     public ImageMetadataExtractor(Metadata metadata) {
+        this(metadata,
+            new CopyUnknownFieldsHandler(),
+            new JpegCommentHandler(),
+            new ExifHandler(),
+            new DimensionsHandler(),
+            new GeotagHandler(),
+            new IptcHandler()
+        );
+    }
+    
+    /**
+     * @param metadata to extract to
+     * @param handlers handlers in order, note that handlers may override values from earlier handlers
+     */
+    public ImageMetadataExtractor(Metadata metadata, DirectoryHandler... handlers) {
         this.metadata = metadata;
+        this.handlers = handlers;
     }
 
-    public void parseTiff(InputStream stream)
+    public void parseJpeg(InputStream stream)
             throws IOException, SAXException, TikaException {
         try {
-            com.drew.metadata.Metadata tiffMetadata =
-                TiffMetadataReader.readMetadata(stream);
-            parse(tiffMetadata);
-        } catch (TiffProcessingException e) {
-            throw new TikaException("Can't read TIFF metadata", e);
+            com.drew.metadata.Metadata jpegMetadata =
+                JpegMetadataReader.readMetadata(stream);
+
+            handle(jpegMetadata);
+        } catch (JpegProcessingException e) {
+            throw new TikaException("Can't read JPEG metadata", e);
+        } catch (MetadataException e) {
+            throw new TikaException("Can't read JPEG metadata", e);
         }
     }
     
-    public void parseJpeg(InputStream stream)
-            throws IOException, SAXException, TikaException {
-       try {
-          com.drew.metadata.Metadata jpegMetadata =
-             JpegMetadataReader.readMetadata(stream);
-          parse(jpegMetadata);
-       } catch (JpegProcessingException e) {
-          throw new TikaException("Can't read JPEG metadata", e);
-       }
-    }
-    
-    protected void parse(com.drew.metadata.Metadata imageMetadata)
+    protected void parseTiff(InputStream stream)
             throws IOException, SAXException, TikaException {
-       try {
-          Iterator<?> directories = imageMetadata.getDirectoryIterator();
-          while (directories.hasNext()) {
-             Directory directory = (Directory) directories.next();
-             Iterator<?> tags = directory.getTagIterator();
+        try {
+            com.drew.metadata.Metadata tiffMetadata =
+                TiffMetadataReader.readMetadata(stream);
 
-             while (tags.hasNext()) {
-                Tag tag = (Tag)tags.next();
-                metadata.set(tag.getTagName(), tag.getDescription());
-                handleCommonImageTags(metadata, tag, directory);
-             }
-             handleGeoImageTags(metadata);
-          }
-       } catch (MetadataException e) {
-          throw new TikaException("Can't read TIFF/JPEG metadata", e);
-       }
+            handle(tiffMetadata);
+        } catch (TiffProcessingException e) {
+            throw new TikaException("Can't read TIFF metadata", e);
+        } catch (MetadataException e) {
+            throw new TikaException("Can't read TIFF metadata", e);
+        }
     }
 
     /**
-     * Maps EXIF Geo Tags onto the Tika Geo metadata namespace.
-     * Needs to be run at the end, because the GPS information
-     *  is spread across several EXIF tags.
+     * Copies extracted tags to tika metadata using registered handlers.
+     * @param metadataExtractor Tag directories from a Metadata Extractor "reader"
+     * @throws MetadataException This method does not handle exceptions from Metadata Extractor
      */
-    public static void handleGeoImageTags(Metadata metadata) {
-        String lat = metadata.get("GPS Latitude");
-        String latNS = metadata.get("GPS Latitude Ref");
-        if(lat != null) {
-            Double latitude = parseHMS(lat);
-            if(latitude != null) {
-                if(latNS != null && latNS.equalsIgnoreCase("S") &&
-                        latitude > 0) {
-                    latitude *= -1;
-                }
-                metadata.set(Metadata.LATITUDE, LAT_LONG_FORMAT.format(latitude)); 
-            }
-        }
+    @SuppressWarnings("unchecked")
+    protected void handle(com.drew.metadata.Metadata metadataExtractor) 
+            throws MetadataException {
+        handle(metadataExtractor.getDirectoryIterator());
+    }
 
-        String lng = metadata.get("GPS Longitude");
-        String lngEW = metadata.get("GPS Longitude Ref");
-        if(lng != null) {
-            Double longitude = parseHMS(lng);
-            if(longitude != null) {
-                if(lngEW != null && lngEW.equalsIgnoreCase("W") &&
-                        longitude > 0) {
-                    longitude *= -1;
+    /**
+     * Copies extracted tags to tika metadata using registered handlers.
+     * @param directories Metadata Extractor {@link com.drew.metadata.Directory} instances.
+     * @throws MetadataException This method does not handle exceptions from Metadata Extractor
+     */    
+    protected void handle(Iterator<Directory> directories) throws MetadataException {
+        while (directories.hasNext()) {
+            Directory directory = directories.next();
+            for (int i = 0; i < handlers.length; i++) {
+                if (handlers[i].supports(directory.getClass())) {
+                    handlers[i].handle(directory, metadata);
                 }
-                metadata.set(Metadata.LONGITUDE, LAT_LONG_FORMAT.format(longitude));
             }
         }
     }
-    private static Double parseHMS(String hms) {
-       Matcher m = HOURS_MINUTES_SECONDS.matcher(hms);
-       if(m.matches()) {
-          double value = 
-            Integer.parseInt(m.group(1)) +
-            (Integer.parseInt(m.group(2))/60.0) +
-            (Double.parseDouble(m.group(3))/60.0/60.0);
-          return value;
-       }
-       return null;
-    }
-    private static final Pattern HOURS_MINUTES_SECONDS = Pattern.compile("(-?\\d+)\"(\\d+)'(\\d+\\.?\\d*)");
+
     /**
-     * The decimal format used for expressing latitudes and longitudes.
-     * The basic geo vocabulary defined by W3C (@see {@link Geographic})
-     * refers to the "float" type in XML Schema as the recommended format
-     * for latitude and longitude values.
+     * Reads one or more types of Metadata Extractor fields.
      */
-    private static final DecimalFormat LAT_LONG_FORMAT =
-        new DecimalFormat("##0.0####", new DecimalFormatSymbols(Locale.US));
+    static interface DirectoryHandler {
+        /**
+         * @param directoryType A Metadata Extractor directory class
+         * @return true if the directory type is supported by this handler
+         */
+        boolean supports(Class<? extends Directory> directoryType);
+        /**
+         * @param directory extracted tags
+         * @param metadata current tika metadata
+         * @throws MetadataException typically field extraction error, aborts all further extraction
+         */
+        void handle(Directory directory, Metadata metadata) 
+                throws MetadataException;
+    }
 
     /**
-     * We normally won't know what timezone our dates belong to
+     * Mimics the behavior from TIKA-314 of copying all extracted tags
+     * to tika metadata using field names from Metadata Extractor.
      */
-    private static final SimpleDateFormat DATE_UNSPECIFIED_TZ = 
-       new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss");
-    private static synchronized void handleDate(Property property, Metadata metadata, Tag tag, Directory directory) throws MetadataException {
-       Date date = directory.getDate(tag.getTagType());
-       String dateString = DATE_UNSPECIFIED_TZ.format(date);
-       metadata.set(property, dateString);
-    }
+    static class CopyAllFieldsHandler implements DirectoryHandler {
+        public boolean supports(Class<? extends Directory> directoryType) {
+            return true;
+        }
+        public void handle(Directory directory, Metadata metadata)
+                throws MetadataException {
+            Iterator<?> tags = directory.getTagIterator();
+            while (tags.hasNext()) {
+                Tag tag = (Tag) tags.next();
+                metadata.set(tag.getTagName(), tag.getDescription());
+            }
+        }
+    }    
     
     /**
-     * Maps common TIFF and EXIF tags onto the Tika
-     *  TIFF image metadata namespace.
+     * Copies all fields regardless of directory, if the tag name
+     * is not identical to a known Metadata field name.
+     * This leads to more predictable behavior than {@link CopyAllFieldsHandler}.
      */
-    public static void handleCommonImageTags(Metadata metadata, Tag tag, Directory directory) throws MetadataException {
-        // Core tags
-        if(tag.getTagName().equals("Date/Time") ||
-                tag.getTagType() == 306) {
-            handleDate(Metadata.DATE, metadata, tag, directory);
-            metadata.set(Metadata.LAST_MODIFIED, metadata.get(Metadata.DATE));
-            return;
-        }
-        if(tag.getTagName().equals("Date/Time Original") ||
-                tag.getTagType() == 36867) {
-           handleDate(Metadata.ORIGINAL_DATE, metadata, tag, directory);
-           return;
-        }
-        
-        if(tag.getTagName().equals("Exposure Time") ||
-                tag.getTagType() == 33434) {
-           Object exposure = directory.getObject(tag.getTagType());
-           if(exposure instanceof Rational) {
-              metadata.set(Metadata.EXPOSURE_TIME, ((Rational)exposure).doubleValue());
-           } else {
-              metadata.set(Metadata.EXPOSURE_TIME, tag.getDescription());
-           }
-           return;
-        }
-        
-        if(tag.getTagName().equals("Flash") ||
-                tag.getTagType() == 37385) {
-           String flash = tag.getDescription();
-           if(flash.indexOf("Flash fired") > -1) {
-              metadata.set(Metadata.FLASH_FIRED, Boolean.TRUE.toString());
-           }
-           else if(flash.indexOf("Flash did not fire") > -1) {
-              metadata.set(Metadata.FLASH_FIRED, Boolean.FALSE.toString());
-           }
-           else {
-              metadata.set(Metadata.FLASH_FIRED, flash);
-           }
-           return;
-        }
-
-        if(tag.getTagName().equals("F-Number") ||
-                tag.getTagType() == 33437) {
-           Object fnumber = directory.getObject(tag.getTagType());
-           if(fnumber instanceof Rational) {
-              metadata.set(Metadata.F_NUMBER, ((Rational)fnumber).doubleValue());
-           } else {
-              metadata.set(Metadata.F_NUMBER, tag.getDescription());
-           }
-           return;
-        }
-        
-        if(tag.getTagName().equals("Focal Length") ||
-                tag.getTagType() == 37386) {
-           Object length = directory.getObject(tag.getTagType());
-           if(length instanceof Rational) {
-              metadata.set(Metadata.FOCAL_LENGTH, ((Rational)length).doubleValue());
-           } else {
-              metadata.set(Metadata.FOCAL_LENGTH, tag.getDescription());
-           }
-           return;
+    static class CopyUnknownFieldsHandler implements DirectoryHandler {
+        public boolean supports(Class<? extends Directory> directoryType) {
+            return true;
+        }
+        public void handle(Directory directory, Metadata metadata)
+                throws MetadataException {
+            Iterator<?> tags = directory.getTagIterator();
+            while (tags.hasNext()) {
+                Tag tag = (Tag) tags.next();
+                String name = tag.getTagName();
+                if (!MetadataFields.isMetadataField(name)) {
+                    metadata.set(name, tag.getDescription());
+                }
+            }
         }
-        
-        if(tag.getTagName().equals("ISO Speed Ratings") ||
-                tag.getTagType() == 34855) {
-           metadata.set(Metadata.ISO_SPEED_RATINGS, tag.getDescription());
-           return;
-        }
-      
-        if(tag.getTagName().equals("Make") ||
-                tag.getTagType() == 271) {
-           metadata.set(Metadata.EQUIPMENT_MAKE, tag.getDescription());
-           return;
-        }
-        if(tag.getTagName().equals("Model") ||
-                tag.getTagType() == 272) {
-           metadata.set(Metadata.EQUIPMENT_MODEL, tag.getDescription());
-           return;
-        }
-      
-        if(tag.getTagName().equals("Orientation") ||
-                tag.getTagType() == 274) {
-           Object length = directory.getObject(tag.getTagType());
-           if(length instanceof Integer) {
-              metadata.set(Metadata.ORIENTATION, Integer.toString( ((Integer)length).intValue() ));
-           } else {
-              metadata.set(Metadata.ORIENTATION, tag.getDescription());
-           }
-           return;
+    }
+    
+    /**
+     * Basic image properties for TIFF and JPEG, at least.
+     */
+    static class DimensionsHandler implements DirectoryHandler {
+        private final Pattern LEADING_NUMBERS = Pattern.compile("(\\d+)\\s*.*");
+        public boolean supports(Class<? extends Directory> directoryType) {
+            return directoryType == JpegDirectory.class || directoryType == ExifDirectory.class;
+        }
+        public void handle(Directory directory, Metadata metadata) throws MetadataException {
+            // The test TIFF has width and height stored as follows according to exiv2
+            //Exif.Image.ImageWidth                        Short       1  100
+            //Exif.Image.ImageLength                       Short       1  75
+            // and the values are found in "Thumbnail Image Width" (and Height) from Metadata Extractor
+            set(directory, metadata, ExifDirectory.TAG_THUMBNAIL_IMAGE_WIDTH, Metadata.IMAGE_WIDTH);
+            set(directory, metadata, JpegDirectory.TAG_JPEG_IMAGE_WIDTH, Metadata.IMAGE_WIDTH);
+            set(directory, metadata, ExifDirectory.TAG_THUMBNAIL_IMAGE_HEIGHT, Metadata.IMAGE_LENGTH);
+            set(directory, metadata, JpegDirectory.TAG_JPEG_IMAGE_HEIGHT, Metadata.IMAGE_LENGTH);
+            // Bits per sample, two methods of extracting, exif overrides jpeg
+            set(directory, metadata, JpegDirectory.TAG_JPEG_DATA_PRECISION, Metadata.BITS_PER_SAMPLE);
+            set(directory, metadata, ExifDirectory.TAG_BITS_PER_SAMPLE, Metadata.BITS_PER_SAMPLE);
+            // Straightforward
+            set(directory, metadata, ExifDirectory.TAG_SAMPLES_PER_PIXEL, Metadata.SAMPLES_PER_PIXEL);
+        }
+        private void set(Directory directory, Metadata metadata, int extractTag, Property metadataField) {
+            if (directory.containsTag(extractTag)) {
+                Matcher m = LEADING_NUMBERS.matcher(directory.getString(extractTag));
+                if(m.matches()) {
+                    metadata.set(metadataField, m.group(1));
+                }
+            }
         }
-        
-        if(tag.getTagName().equals("Software") ||
-                tag.getTagType() == 305) {
-           metadata.set(Metadata.SOFTWARE, tag.getDescription());
-           return;
-        }
-        
-        if(tag.getTagName().equals("X Resolution") ||
-                tag.getTagType() == 282) {
-           Object resolution = directory.getObject(tag.getTagType());
-           if(resolution instanceof Rational) {
-              metadata.set(Metadata.RESOLUTION_HORIZONTAL, ((Rational)resolution).doubleValue());
-           } else {
-              metadata.set(Metadata.RESOLUTION_HORIZONTAL, tag.getDescription());
-           }
-           return;
+    }
+    
+    static class JpegCommentHandler implements DirectoryHandler {
+        public boolean supports(Class<? extends Directory> directoryType) {
+            return directoryType == JpegCommentDirectory.class;
+        }
+        public void handle(Directory directory, Metadata metadata) throws MetadataException {
+            if (directory.containsTag(JpegCommentDirectory.TAG_JPEG_COMMENT)) {
+                metadata.add(Metadata.COMMENT, directory.getString(JpegCommentDirectory.TAG_JPEG_COMMENT));
+            }
         }
-        if(tag.getTagName().equals("Y Resolution") ||
-                tag.getTagType() == 283) {
-           Object resolution = directory.getObject(tag.getTagType());
-           if(resolution instanceof Rational) {
-              metadata.set(Metadata.RESOLUTION_VERTICAL, ((Rational)resolution).doubleValue());
-           } else {
-              metadata.set(Metadata.RESOLUTION_VERTICAL, tag.getDescription());
-           }
-           return;
+    }
+    
+    static class ExifHandler implements DirectoryHandler {
+        private static final SimpleDateFormat DATE_UNSPECIFIED_TZ = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss");
+        public boolean supports(Class<? extends Directory> directoryType) {
+            return directoryType == ExifDirectory.class;
+        }
+        public void handle(Directory directory, Metadata metadata) {
+            try {
+                handleDateTags(directory, metadata);
+                handlePhotoTags(directory, metadata);
+                handleCommentTags(directory, metadata);
+            } catch (MetadataException e) {
+                // ignore date parse errors and proceed with other tags
+            }
         }
-        if(tag.getTagName().equals("Resolution Unit") ||
-                tag.getTagType() == 296) {
-           metadata.set(Metadata.RESOLUTION_UNIT, tag.getDescription());
-           return;
+        /**
+         * EXIF may contain image description, although with undefined encoding.
+         * Use IPTC for other annotation fields, and XMP for unicode support.
+         */
+        public void handleCommentTags(Directory directory, Metadata metadata) {
+            if (metadata.get(Metadata.DESCRIPTION) == null &&
+                    directory.containsTag(ExifDirectory.TAG_IMAGE_DESCRIPTION)) {
+                metadata.set(Metadata.DESCRIPTION, directory.getString(ExifDirectory.TAG_IMAGE_DESCRIPTION));
+            }
         }
+        /**
+         * Maps common TIFF and EXIF tags onto the Tika
+         *  TIFF image metadata namespace.
+         */       
+        public void handlePhotoTags(Directory directory, Metadata metadata) {
+            if(directory.containsTag(ExifDirectory.TAG_EXPOSURE_TIME)) {
+               Object exposure = directory.getObject(ExifDirectory.TAG_EXPOSURE_TIME);
+               if(exposure instanceof Rational) {
+                  metadata.set(Metadata.EXPOSURE_TIME, ((Rational)exposure).doubleValue());
+               } else {
+                  metadata.set(Metadata.EXPOSURE_TIME, directory.getString(ExifDirectory.TAG_EXPOSURE_TIME));
+               }
+            }
+            
+            if(directory.containsTag(ExifDirectory.TAG_FLASH)) {
+               String flash = "";
+               try {
+                  flash = directory.getDescription(ExifDirectory.TAG_FLASH);
+               } catch (MetadataException e) {
+                  // ignore
+               }
+               if(flash.indexOf("Flash fired") > -1) {
+                  metadata.set(Metadata.FLASH_FIRED, Boolean.TRUE.toString());
+               }
+               else if(flash.indexOf("Flash did not fire") > -1) {
+                  metadata.set(Metadata.FLASH_FIRED, Boolean.FALSE.toString());
+               }
+               else {
+                  metadata.set(Metadata.FLASH_FIRED, flash);
+               }
+            }
 
-        if(tag.getTagName().equals("Keywords") ||
-                tag.getTagType() == 537) {
-            metadata.set(Metadata.KEYWORDS, tag.getDescription());
-            return;
+            if(directory.containsTag(ExifDirectory.TAG_FNUMBER)) {
+               Object fnumber = directory.getObject(ExifDirectory.TAG_FNUMBER);
+               if(fnumber instanceof Rational) {
+                  metadata.set(Metadata.F_NUMBER, ((Rational)fnumber).doubleValue());
+               } else {
+                  metadata.set(Metadata.F_NUMBER, directory.getString(ExifDirectory.TAG_FNUMBER));
+               }
+            }
+            
+            if(directory.containsTag(ExifDirectory.TAG_FOCAL_LENGTH)) {
+               Object length = directory.getObject(ExifDirectory.TAG_FOCAL_LENGTH);
+               if(length instanceof Rational) {
+                  metadata.set(Metadata.FOCAL_LENGTH, ((Rational)length).doubleValue());
+               } else {
+                  metadata.set(Metadata.FOCAL_LENGTH, directory.getString(ExifDirectory.TAG_FOCAL_LENGTH));
+               }
+            }
+            
+            if(directory.containsTag(ExifDirectory.TAG_ISO_EQUIVALENT)) {
+               metadata.set(Metadata.ISO_SPEED_RATINGS, directory.getString(ExifDirectory.TAG_ISO_EQUIVALENT));
+            }
+          
+            if(directory.containsTag(ExifDirectory.TAG_MAKE)) {
+               metadata.set(Metadata.EQUIPMENT_MAKE, directory.getString(ExifDirectory.TAG_MAKE));
+            }
+            if(directory.containsTag(ExifDirectory.TAG_MODEL)) {
+               metadata.set(Metadata.EQUIPMENT_MODEL, directory.getString(ExifDirectory.TAG_MODEL));
+            }
+          
+            if(directory.containsTag(ExifDirectory.TAG_ORIENTATION)) {
+               Object length = directory.getObject(ExifDirectory.TAG_ORIENTATION);
+               if(length instanceof Integer) {
+                  metadata.set(Metadata.ORIENTATION, Integer.toString( ((Integer)length).intValue() ));
+               } else {
+                  metadata.set(Metadata.ORIENTATION, directory.getString(ExifDirectory.TAG_ORIENTATION));
+               }
+            }
+            
+            if(directory.containsTag(ExifDirectory.TAG_SOFTWARE)) {
+               metadata.set(Metadata.SOFTWARE, directory.getString(ExifDirectory.TAG_SOFTWARE));
+            }
+            
+            if(directory.containsTag(ExifDirectory.TAG_X_RESOLUTION)) {
+               Object resolution = directory.getObject(ExifDirectory.TAG_X_RESOLUTION);
+               if(resolution instanceof Rational) {
+                  metadata.set(Metadata.RESOLUTION_HORIZONTAL, ((Rational)resolution).doubleValue());
+               } else {
+                  metadata.set(Metadata.RESOLUTION_HORIZONTAL, directory.getString(ExifDirectory.TAG_X_RESOLUTION));
+               }
+            }
+            if(directory.containsTag(ExifDirectory.TAG_Y_RESOLUTION)) {
+               Object resolution = directory.getObject(ExifDirectory.TAG_Y_RESOLUTION);
+               if(resolution instanceof Rational) {
+                  metadata.set(Metadata.RESOLUTION_VERTICAL, ((Rational)resolution).doubleValue());
+               } else {
+                  metadata.set(Metadata.RESOLUTION_VERTICAL, directory.getString(ExifDirectory.TAG_Y_RESOLUTION));
+               }
+            }
+            if(directory.containsTag(ExifDirectory.TAG_RESOLUTION_UNIT)) {
+               try {
+                  metadata.set(Metadata.RESOLUTION_UNIT, directory.getDescription(ExifDirectory.TAG_RESOLUTION_UNIT));
+               } catch (MetadataException e) {
+                  // ignore
+               }
+            }
         }
-        
-        if(tag.getTagName().equals("Jpeg Comment")) {
-            metadata.set(Metadata.COMMENTS, tag.getDescription());
-            return;
+        /**
+         * Maps exif dates to metadata fields.
+         */
+        public void handleDateTags(Directory directory, Metadata metadata)
+                throws MetadataException {
+            // Date/Time Original overrides value from ExifDirectory.TAG_DATETIME
+            Date original = null;
+            if (directory.containsTag(ExifDirectory.TAG_DATETIME_ORIGINAL)) {
+                original = directory.getDate(ExifDirectory.TAG_DATETIME_ORIGINAL);
+                // Unless we have GPS time we don't know the time zone so date must be set
+                // as ISO 8601 datetime without timezone suffix (no Z or +/-)
+                String datetimeNoTimeZone = DATE_UNSPECIFIED_TZ.format(original); // Same time zone as Metadata Extractor uses
+                metadata.set(Metadata.DATE, datetimeNoTimeZone);
+                metadata.set(Metadata.ORIGINAL_DATE, datetimeNoTimeZone);
+            }
+            if (directory.containsTag(ExifDirectory.TAG_DATETIME)) {
+                Date datetime = directory.getDate(ExifDirectory.TAG_DATETIME);
+                String datetimeNoTimeZone = DATE_UNSPECIFIED_TZ.format(datetime);
+                metadata.set(Metadata.LAST_MODIFIED, datetimeNoTimeZone);
+                // If Date/Time Original does not exist this might be creation date
+                if (original == null) {
+                    metadata.set(Metadata.DATE, datetimeNoTimeZone);
+                }
+            }
         }
-//      System.err.println(directory.getObject(tag.getTagType()) + " " + directory.getObject(tag.getTagType()).getClass());
-
-        // File info
-        // Metadata Extractor does not read XMP so we need to use the values from Iptc or EXIF
-        if("Iptc".equals(tag.getDirectoryName())) {
-            if("Object Name".equals(tag.getTagName())) {
-                metadata.set(Metadata.TITLE, tag.getDescription());
-                return;
+    }
+    
+    /**
+     * Reads image comments, originally TIKA-472.
+     * Metadata Extractor does not read XMP so we need to use the values from Iptc or EXIF
+     */
+    static class IptcHandler implements DirectoryHandler {
+        public boolean supports(Class<? extends Directory> directoryType) {
+            return directoryType == IptcDirectory.class;
+        }
+        public void handle(Directory directory, Metadata metadata)
+                throws MetadataException {
+            if (directory.containsTag(IptcDirectory.TAG_KEYWORDS)) {
+                String[] keywords = directory.getStringArray(IptcDirectory.TAG_KEYWORDS);
+                for (String k : keywords) {
+                    metadata.add(Metadata.KEYWORDS, k);
+                }
+            }
+            if (directory.containsTag(IptcDirectory.TAG_HEADLINE)) {
+                metadata.set(Metadata.TITLE, directory.getString(IptcDirectory.TAG_HEADLINE));
+            } else if (directory.containsTag(IptcDirectory.TAG_OBJECT_NAME)) {
+                metadata.set(Metadata.TITLE, directory.getString(IptcDirectory.TAG_OBJECT_NAME));
             }
-            if("By-line".equals(tag.getTagName())) {
-                metadata.set(Metadata.AUTHOR, tag.getDescription());
-                return;
-            }		
-            if("Caption/Abstract".equals(tag.getTagName())) {
-                // Looks like metadata extractor returns IPTC newlines as a single carriage return,
-                // but the exiv2 command does not so we change to line feed here because that is less surprising to users
-                metadata.set(Metadata.DESCRIPTION, tag.getDescription().replaceAll("\r\n?", "\n"));
-                return;
+            if (directory.containsTag(IptcDirectory.TAG_BY_LINE)) {
+                metadata.set(Metadata.AUTHOR, directory.getString(IptcDirectory.TAG_BY_LINE));
+            }
+            if (directory.containsTag(IptcDirectory.TAG_CAPTION)) {
+                metadata.set(Metadata.DESCRIPTION,
+                        // Looks like metadata extractor returns IPTC newlines as a single carriage return,
+                        // but the exiv2 command does not so we change to line feed here because that is less surprising to users                        
+                        directory.getString(IptcDirectory.TAG_CAPTION).replaceAll("\r\n?", "\n"));
             }
         }
+    }
 
-        // EXIF / TIFF Tags
-        Property key = null;
-        if(tag.getTagName().equals("Image Width") ||
-                tag.getTagType() == 256) { 
-            key = Metadata.IMAGE_WIDTH;
-        }
-        if(tag.getTagName().equals("Image Height") ||
-                tag.getTagType() == 257) {
-            key = Metadata.IMAGE_LENGTH;
-        }
-        if(tag.getTagName().equals("Data Precision") ||
-                tag.getTagName().equals("Bits Per Sample") ||
-                tag.getTagType() == 258) {
-            key = Metadata.BITS_PER_SAMPLE;
-        }
-        if(tag.getTagType() == 277) {
-            key = Metadata.SAMPLES_PER_PIXEL;
-        }
+    /**
+     * Maps EXIF Geo Tags onto the Tika Geo metadata namespace.
+     */
+    static class GeotagHandler implements DirectoryHandler {
+        public boolean supports(Class<? extends Directory> directoryType) {
+            return directoryType == GpsDirectory.class;
+        }
+        public void handle(Directory directory, Metadata metadata) throws MetadataException {
+            String lat = directory.getDescription(GpsDirectory.TAG_GPS_LATITUDE);
+            String latNS = directory.getDescription(GpsDirectory.TAG_GPS_LATITUDE_REF);
+            if(lat != null) {
+                Double latitude = parseHMS(lat);
+                if(latitude != null) {
+                    if(latNS != null && latNS.equalsIgnoreCase("S") &&
+                            latitude > 0) {
+                        latitude *= -1;
+                    }
+                    metadata.set(Metadata.LATITUDE, LAT_LONG_FORMAT.format(latitude)); 
+                }
+            }
 
-        if(key != null) {
-            Matcher m = LEADING_NUMBERS.matcher(tag.getDescription());
-            if(m.matches()) {
-                metadata.set(key, m.group(1));
+            String lng = directory.getDescription(GpsDirectory.TAG_GPS_LONGITUDE);
+            String lngEW = directory.getDescription(GpsDirectory.TAG_GPS_LONGITUDE_REF);
+            if(lng != null) {
+                Double longitude = parseHMS(lng);
+                if(longitude != null) {
+                    if(lngEW != null && lngEW.equalsIgnoreCase("W") &&
+                            longitude > 0) {
+                        longitude *= -1;
+                    }
+                    metadata.set(Metadata.LONGITUDE, LAT_LONG_FORMAT.format(longitude));
+                }
             }
         }
+        private Double parseHMS(String hms) {
+           Matcher m = HOURS_MINUTES_SECONDS.matcher(hms);
+           if(m.matches()) {
+              double value = 
+                Integer.parseInt(m.group(1)) +
+                (Integer.parseInt(m.group(2))/60.0) +
+                (Double.parseDouble(m.group(3))/60.0/60.0);
+              return value;
+           }
+           return null;
+        }
+        private static final Pattern HOURS_MINUTES_SECONDS = Pattern.compile("(-?\\d+)\"(\\d+)'(\\d+\\.?\\d*)");
+        /**
+         * The decimal format used for expressing latitudes and longitudes.
+         * The basic geo vocabulary defined by W3C (@see {@link Geographic})
+         * refers to the "float" type in XML Schema as the recommended format
+         * for latitude and longitude values.
+         */
+        private static final DecimalFormat LAT_LONG_FORMAT =
+            new DecimalFormat("##0.0####", new DecimalFormatSymbols(Locale.US));
     }
-    private static final Pattern LEADING_NUMBERS = Pattern.compile("(\\d+)\\s*.*");
+
 }

Added: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/MetadataFields.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/MetadataFields.java?rev=1033546&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/MetadataFields.java (added)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/MetadataFields.java Wed Nov 10 15:57:34 2010
@@ -0,0 +1,58 @@
+package org.apache.tika.parser.image;
+
+import java.lang.reflect.Field;
+import java.lang.reflect.Modifier;
+import java.util.HashSet;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+
+/**
+ * Knows about all declared {@link Metadata} fields.
+ * Didn't find this functionality anywhere so it was added for
+ * ImageMetadataExtractor, but it can be generalized.
+ */
+public abstract class MetadataFields {
+
+    /** Names collected from the public constants of {@link Metadata}; filled by the static block. */
+    private static HashSet<String> known = new HashSet<String>();
+
+    static {
+        for (Field candidate : Metadata.class.getFields()) {
+            if (isConstant(candidate.getModifiers())) {
+                register(candidate);
+            }
+        }
+    }
+
+    /** Tells whether the modifier bits describe a {@code public static final} member. */
+    private static boolean isConstant(int mod) {
+        return Modifier.isPublic(mod) && Modifier.isStatic(mod) && Modifier.isFinal(mod);
+    }
+
+    /** Stores the value of a String constant, or the name of a {@link Property} constant. */
+    private static void register(Field constant) {
+        Class<?> type = constant.getType();
+        boolean isString = String.class.equals(type);
+        if (!isString && !Property.class.isAssignableFrom(type)) {
+            return;
+        }
+        try {
+            // The field is static, so no receiver instance is needed.
+            Object value = constant.get(null);
+            if (value != null) {
+                known.add(isString ? (String) value : ((Property) value).getName());
+            }
+        } catch (IllegalArgumentException e) {
+            e.printStackTrace();
+        } catch (IllegalAccessException e) {
+            e.printStackTrace();
+        }
+    }
+
+    /** Returns true if {@code name} was collected from a {@link Metadata} constant. */
+    public static boolean isMetadataField(String name) {
+        return known.contains(name);
+    }
+
+}

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/TiffParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/TiffParser.java?rev=1033546&r1=1033545&r2=1033546&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/TiffParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/TiffParser.java Wed Nov 10 15:57:34 2010
@@ -16,6 +16,7 @@
  */
 package org.apache.tika.parser.image;
 
+import java.io.FilterInputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.util.Collections;
@@ -26,6 +27,7 @@ import org.apache.tika.metadata.Metadata
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.image.xmp.JempboxExtractor;
 import org.apache.tika.sax.XHTMLContentHandler;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
@@ -52,13 +54,18 @@ public class TiffParser implements Parse
             InputStream stream, ContentHandler handler,
             Metadata metadata, ParseContext context)
             throws IOException, SAXException, TikaException {
-        new ImageMetadataExtractor(metadata).parseTiff(stream);
 
-        for (String s : metadata.names()) {
-            if (s.startsWith("Unknown tag")) {
-                metadata.remove(s);
+        // read stream twice - exif and xmp extractors
+        stream.mark(Integer.MAX_VALUE);
+        FilterInputStream first = new FilterInputStream(stream) {
+            @Override
+            public void close() throws IOException {
             }
-        }
+        };
+        new ImageMetadataExtractor(metadata).parseTiff(first);
+        stream.reset();
+        
+        new JempboxExtractor(metadata).parse(stream);
 
         XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
         xhtml.startDocument();

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/jpeg/JpegParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/jpeg/JpegParser.java?rev=1033546&r1=1033545&r2=1033546&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/jpeg/JpegParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/jpeg/JpegParser.java Wed Nov 10 15:57:34 2010
@@ -16,6 +16,7 @@
  */
 package org.apache.tika.parser.jpeg;
 
+import java.io.FilterInputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.util.Collections;
@@ -27,6 +28,7 @@ import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.parser.image.ImageMetadataExtractor;
+import org.apache.tika.parser.image.xmp.JempboxExtractor;
 import org.apache.tika.sax.XHTMLContentHandler;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
@@ -53,7 +55,18 @@ public class JpegParser implements Parse
             InputStream stream, ContentHandler handler,
             Metadata metadata, ParseContext context)
             throws IOException, SAXException, TikaException {
-        new ImageMetadataExtractor(metadata).parseJpeg(stream);
+        
+        // read stream twice - exif and xmp extractors
+        stream.mark(Integer.MAX_VALUE);
+        FilterInputStream first = new FilterInputStream(stream) {
+            @Override
+            public void close() throws IOException {
+            }
+        };
+        new ImageMetadataExtractor(metadata).parseJpeg(first);
+        stream.reset();
+        
+        new JempboxExtractor(metadata).parse(stream);
 
         XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
         xhtml.startDocument();

Added: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/ImageMetadataExtractorTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/ImageMetadataExtractorTest.java?rev=1033546&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/ImageMetadataExtractorTest.java (added)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/ImageMetadataExtractorTest.java Wed Nov 10 15:57:34 2010
@@ -0,0 +1,99 @@
+package org.apache.tika.parser.image;
+
+import java.util.Arrays;
+import java.util.GregorianCalendar;
+import java.util.Iterator;
+
+import org.apache.tika.metadata.DublinCore;
+import org.apache.tika.metadata.Metadata;
+
+import com.drew.metadata.Directory;
+import com.drew.metadata.MetadataException;
+import com.drew.metadata.Tag;
+import com.drew.metadata.exif.ExifDirectory;
+import com.drew.metadata.jpeg.JpegCommentDirectory;
+
+import junit.framework.TestCase;
+
+import static org.mockito.Mockito.*;
+
+public class ImageMetadataExtractorTest extends TestCase {
+    
+    @SuppressWarnings({ "rawtypes", "unchecked" })
+    public void testHandleDirectories() throws MetadataException { // a handler that supports the directory type must be handed that directory
+        Metadata metadata = mock(Metadata.class);
+        ImageMetadataExtractor.DirectoryHandler handler1 = mock(ImageMetadataExtractor.DirectoryHandler.class);
+        ImageMetadataExtractor e = new ImageMetadataExtractor(metadata, handler1);
+        
+        Directory directory = new JpegCommentDirectory();
+        Iterator directories = mock(Iterator.class);
+        when(directories.hasNext()).thenReturn(true, false); // the mocked iterator yields exactly one element
+        when(directories.next()).thenReturn(directory);
+        when(handler1.supports(JpegCommentDirectory.class)).thenReturn(true);
+        
+        e.handle(directories);
+        verify(handler1).supports(JpegCommentDirectory.class);
+        verify(handler1).handle(directory, metadata);
+    }
+    
+    public void testExifHandlerSupports() { // ExifHandler accepts ExifDirectory but neither its base class nor JpegCommentDirectory
+        assertTrue(new ImageMetadataExtractor.ExifHandler().supports(ExifDirectory.class));
+        assertFalse(new ImageMetadataExtractor.ExifHandler().supports(Directory.class));
+        assertFalse(new ImageMetadataExtractor.ExifHandler().supports(JpegCommentDirectory.class));
+    }
+    
+    public void testExifHandlerParseDate() throws MetadataException { // TAG_DATETIME_ORIGINAL ends up in DublinCore.DATE
+        ExifDirectory exif = mock(ExifDirectory.class);
+        when(exif.containsTag(ExifDirectory.TAG_DATETIME_ORIGINAL)).thenReturn(true);
+        when(exif.getDate(ExifDirectory.TAG_DATETIME_ORIGINAL)).thenReturn(
+                new GregorianCalendar(2000, 0, 1, 0, 0, 0).getTime()); // jvm default timezone as in Metadata Extractor
+        Metadata metadata = new Metadata();
+        
+        new ImageMetadataExtractor.ExifHandler().handle(exif, metadata);
+        assertEquals("Should be ISO date without time zone", "2000-01-01T00:00:00", metadata.get(DublinCore.DATE));
+    }
+
+    public void testExifHandlerParseDateFallback() throws MetadataException { // only TAG_DATETIME is stubbed as present here
+        ExifDirectory exif = mock(ExifDirectory.class);
+        when(exif.containsTag(ExifDirectory.TAG_DATETIME)).thenReturn(true);
+        when(exif.getDate(ExifDirectory.TAG_DATETIME)).thenReturn(
+                new GregorianCalendar(1999, 0, 1, 0, 0, 0).getTime()); // jvm default timezone as in Metadata Extractor
+        Metadata metadata = new Metadata();
+        
+        new ImageMetadataExtractor.ExifHandler().handle(exif, metadata);
+        assertEquals("Should try EXIF Date/Time if Original is not set", "1999-01-01T00:00:00", metadata.get(DublinCore.DATE));
+    }
+    
+    public void testExifHandlerParseDateError() throws MetadataException { // getDate throwing must leave DublinCore.DATE unset
+        ExifDirectory exif = mock(ExifDirectory.class);
+        when(exif.containsTag(ExifDirectory.TAG_DATETIME_ORIGINAL)).thenReturn(true);
+        when(exif.getDate(ExifDirectory.TAG_DATETIME_ORIGINAL)).thenThrow(
+                new MetadataException("Tag 'X' cannot be cast to a java.util.Date."));
+        Metadata metadata = new Metadata();
+        
+        new ImageMetadataExtractor.ExifHandler().handle(exif, metadata);
+        assertEquals("Parsing should proceed without date", null, metadata.get(DublinCore.DATE));
+    }
+    
+    public void testCopyUnknownFieldsHandler() throws MetadataException { // tags named like Metadata constants must not be copied
+        Directory d = mock(Directory.class);
+        Tag t1 = mock(Tag.class);
+        when(t1.getTagName()).thenReturn("Image Description"); // the one tag expected to be copied
+        when(t1.getDescription()).thenReturn("t1");
+        Tag t2 = mock(Tag.class);
+        when(t2.getTagName()).thenReturn(Metadata.KEYWORDS);
+        when(t2.getDescription()).thenReturn("known");
+        Tag t3 = mock(Tag.class);
+        when(t3.getTagName()).thenReturn(Metadata.DESCRIPTION);
+        when(t3.getDescription()).thenReturn("known");
+        Iterator<Tag> tags = Arrays.asList(t1, t2, t3).iterator();
+        when(d.getTagIterator()).thenReturn(tags);
+        Metadata metadata = new Metadata();
+        new ImageMetadataExtractor.CopyUnknownFieldsHandler().handle(d, metadata);
+        assertEquals("t1", metadata.get("Image Description"));
+        assertNull("keywords should be excluded from bulk copy because it is a defined field",
+                metadata.get(Metadata.KEYWORDS));
+        assertNull(metadata.get(Metadata.DESCRIPTION));
+    }
+    
+}

Added: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/MetadataFieldsTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/MetadataFieldsTest.java?rev=1033546&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/MetadataFieldsTest.java (added)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/MetadataFieldsTest.java Wed Nov 10 15:57:34 2010
@@ -0,0 +1,17 @@
+package org.apache.tika.parser.image;
+
+import org.apache.tika.metadata.DublinCore;
+import org.apache.tika.metadata.TIFF;
+
+import junit.framework.TestCase;
+
+public class MetadataFieldsTest extends TestCase {
+
+    public void testIsMetadataField() { // covers two unknown names and two known constants
+        assertFalse(MetadataFields.isMetadataField("random string that is not a field"));
+        assertFalse(MetadataFields.isMetadataField("xyz"));
+        assertTrue(MetadataFields.isMetadataField(DublinCore.SUBJECT)); // constant passed directly as the name
+        assertTrue(MetadataFields.isMetadataField(TIFF.F_NUMBER.getName())); // constant whose name comes from getName()
+    }
+
+}

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/TiffParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/TiffParserTest.java?rev=1033546&r1=1033545&r2=1033546&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/TiffParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/TiffParserTest.java Wed Nov 10 15:57:34 2010
@@ -23,6 +23,8 @@ import org.apache.tika.metadata.Metadata
 import org.xml.sax.helpers.DefaultHandler;
 
 import java.io.InputStream;
+import java.util.Arrays;
+import java.util.List;
 
 public class TiffParserTest extends TestCase {
     private final Parser parser = new TiffParser();
@@ -37,15 +39,20 @@ public class TiffParserTest extends Test
         assertEquals("Licensed to the Apache Software Foundation (ASF) under one or " +
         		"more contributor license agreements.  See the NOTICE file " +
         		"distributed with this work for additional information regarding " +
-        		"copyright ownership.", metadata.get("Image Description"));
+        		"copyright ownership.", metadata.get(Metadata.DESCRIPTION));
         
         // All EXIF/TIFF tags
-        assertEquals("Inch", metadata.get("Resolution Unit"));
+        assertEquals("Inch", metadata.get(Metadata.RESOLUTION_UNIT));
         
         // Core EXIF/TIFF tags
         assertEquals("100", metadata.get(Metadata.IMAGE_WIDTH));
         assertEquals("75", metadata.get(Metadata.IMAGE_LENGTH));
         assertEquals("8", metadata.get(Metadata.BITS_PER_SAMPLE));
         assertEquals("3", metadata.get(Metadata.SAMPLES_PER_PIXEL));
+        
+        // Embedded XMP
+        List<String> subject = Arrays.asList(metadata.getValues(Metadata.SUBJECT));
+        assertTrue("got " + subject, subject.contains("cat"));
+        assertTrue("got " + subject, subject.contains("garden"));
     }
 }

Added: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/xmp/JempboxExtractorTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/xmp/JempboxExtractorTest.java?rev=1033546&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/xmp/JempboxExtractorTest.java (added)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/xmp/JempboxExtractorTest.java Wed Nov 10 15:57:34 2010
@@ -0,0 +1,79 @@
+package org.apache.tika.parser.image.xmp;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.Collection;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.image.xmp.JempboxExtractor;
+
+import junit.framework.TestCase;
+
+public class JempboxExtractorTest extends TestCase {
+   
+    public void testParseJpeg() throws IOException, TikaException { // pre-set values: single-valued ones replaced, SUBJECT appended to
+        Metadata metadata = new Metadata();
+        InputStream stream = getClass().getResourceAsStream("/test-documents/testJPEG_commented.jpg");
+        // set some values before extraction to see that they are overridden
+        metadata.set(Metadata.TITLE, "old title");
+        metadata.set(Metadata.DESCRIPTION, "old description");
+        metadata.set(Metadata.CREATOR, "previous author");
+        // ... or kept in case the field is multi-value
+        metadata.add(Metadata.SUBJECT, "oldkeyword");
+        
+        JempboxExtractor extractor = new JempboxExtractor(metadata);
+        extractor.parse(stream);
+        
+        // DublinCore fields
+        assertEquals("Tosteberga \u00C4ngar", metadata.get(Metadata.TITLE));
+        assertEquals("Bird site in north eastern Sk\u00E5ne, Sweden.\n(new line)", metadata.get(Metadata.DESCRIPTION));
+        assertEquals("Some Tourist", metadata.get(Metadata.CREATOR));
+        Collection<String> keywords = Arrays.asList(metadata.getValues(Metadata.SUBJECT));  
+        assertTrue(keywords.contains("oldkeyword")); // the value added before parsing is still present
+        assertTrue(keywords.contains("grazelands"));
+        assertTrue(keywords.contains("nature reserve"));
+        assertTrue(keywords.contains("bird watching"));
+        assertTrue(keywords.contains("coast"));
+    }
+
+    public void testParseJpegPhotoshop() throws IOException, TikaException { // checks the pspcs2mac sample, starting from empty metadata
+        Metadata metadata = new Metadata();
+        InputStream stream = getClass().getResourceAsStream("/test-documents/testJPEG_commented_pspcs2mac.jpg");
+       
+        JempboxExtractor extractor = new JempboxExtractor(metadata);
+        extractor.parse(stream);
+        
+        // DublinCore fields
+        assertEquals("Tosteberga \u00C4ngar", metadata.get(Metadata.TITLE));
+        assertEquals("Bird site in north eastern Sk\u00E5ne, Sweden.\n(new line)", metadata.get(Metadata.DESCRIPTION));
+        assertEquals("Some Tourist", metadata.get(Metadata.CREATOR));
+        Collection<String> keywords = Arrays.asList(metadata.getValues(Metadata.SUBJECT));  
+        assertTrue(keywords.contains("bird watching"));
+        assertTrue(keywords.contains("coast"));
+    }
+    
+    public void testParseJpegXnviewmp() throws IOException, TikaException { // checks the xnviewmp026 sample; title and creator are not asserted
+        Metadata metadata = new Metadata();
+        InputStream stream = getClass().getResourceAsStream("/test-documents/testJPEG_commented_xnviewmp026.jpg");
+       
+        JempboxExtractor extractor = new JempboxExtractor(metadata);
+        extractor.parse(stream);
+        
+        // XnViewMp fields not understood by Jempbox
+        assertEquals("Bird site in north eastern Sk\u00E5ne, Sweden.\n(new line)", metadata.get(Metadata.DESCRIPTION));
+        Collection<String> keywords = Arrays.asList(metadata.getValues(Metadata.SUBJECT));
+        assertTrue(keywords.contains("coast"));
+        assertTrue(keywords.contains("nature reserve"));
+    }
+    
+    public void testJoinCreators() { // the extractor is built with null metadata; only joinCreators is exercised
+        assertEquals("Mr B", new JempboxExtractor(null).joinCreators(
+                Arrays.asList("Mr B")));
+        // TODO use multi-value property instead?
+        assertEquals("Mr B, Mr A", new JempboxExtractor(null).joinCreators(
+                Arrays.asList("Mr B", "Mr A")));
+    }
+
+}

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java?rev=1033546&r1=1033545&r2=1033546&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java Wed Nov 10 15:57:34 2010
@@ -16,15 +16,18 @@
  */
 package org.apache.tika.parser.jpeg;
 
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.List;
+
 import junit.framework.TestCase;
 
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TIFF;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
-import org.apache.tika.metadata.Metadata;
 import org.xml.sax.helpers.DefaultHandler;
 
-import java.io.InputStream;
-
 public class JpegParserTest extends TestCase {
     private final Parser parser = new JpegParser();
 
@@ -35,9 +38,6 @@ public class JpegParserTest extends Test
             getClass().getResourceAsStream("/test-documents/testJPEG_EXIF.jpg");
         parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
 
-        // All EXIF/TIFF tags
-        assertEquals("Canon EOS 40D", metadata.get("Model"));
-        
         // Core EXIF/TIFF tags
         assertEquals("100", metadata.get(Metadata.IMAGE_WIDTH));
         assertEquals("68", metadata.get(Metadata.IMAGE_LENGTH));
@@ -57,12 +57,20 @@ public class JpegParserTest extends Test
         assertEquals("240.0", metadata.get(Metadata.RESOLUTION_VERTICAL));
         assertEquals("Inch", metadata.get(Metadata.RESOLUTION_UNIT));
         
+        // Check that EXIF/TIFF tags come through with their raw values too
+        // (This may be removed for Tika 1.0, as we support more of them
+        //  with explicit Metadata entries)
+        assertEquals("Canon EOS 40D", metadata.get("Model"));
+        
         // Common tags
-        assertEquals("Date/Time for when the photo was taken, unspecified time zone",
-                "2009-10-02T23:02:49", metadata.get(Metadata.DATE));
+        //assertEquals("2009-10-02T23:02:49", metadata.get(Metadata.LAST_MODIFIED));
         assertEquals("Date/Time Original for when the photo was taken, unspecified time zone",
-                "2009-08-11T09:09:45", metadata.get(Metadata.ORIGINAL_DATE));
-        assertEquals("canon-55-250 moscow-birds serbor", metadata.get(Metadata.KEYWORDS));
+                "2009-08-11T09:09:45", metadata.get(Metadata.DATE));
+        List<String> keywords = Arrays.asList(metadata.getValues(Metadata.SUBJECT));
+        assertTrue("'canon-55-250' expected in " + keywords, keywords.contains("canon-55-250"));
+        assertTrue("'moscow-birds' expected in " + keywords, keywords.contains("moscow-birds")); 
+        assertTrue("'serbor' expected in " + keywords, keywords.contains("serbor"));
+        assertFalse(keywords.contains("canon-55-250 moscow-birds serbor"));
     }
 
     public void testJPEGGeo() throws Exception {
@@ -75,9 +83,6 @@ public class JpegParserTest extends Test
         // Geo tags
         assertEquals("12.54321", metadata.get(Metadata.LATITUDE));
         assertEquals("-54.1234", metadata.get(Metadata.LONGITUDE));
-
-        // All EXIF/TIFF tags
-        assertEquals("Canon EOS 40D", metadata.get("Model"));
         
         // Core EXIF/TIFF tags
         assertEquals("100", metadata.get(Metadata.IMAGE_WIDTH));
@@ -100,10 +105,12 @@ public class JpegParserTest extends Test
         
         // Common tags
         assertEquals("Date/Time Original for when the photo was taken, unspecified time zone",
-                "2009-08-11T09:09:45", metadata.get(Metadata.ORIGINAL_DATE));
+                "2009-08-11T09:09:45", metadata.get(Metadata.DATE));
         assertEquals("This image has different Date/Time than Date/Time Original, so it is probably modification date",
-                "2009-10-02T23:02:49", metadata.get(Metadata.DATE));
-        assertEquals("canon-55-250 moscow-birds serbor", metadata.get(Metadata.KEYWORDS));
+                "2009-10-02T23:02:49", metadata.get(Metadata.LAST_MODIFIED));
+        assertEquals("Date/Time Original should be stored in EXIF field too",
+                "2009-08-11T09:09:45", metadata.get(TIFF.ORIGINAL_DATE));
+        assertEquals("canon-55-250", metadata.getValues(Metadata.KEYWORDS)[0]);
     }
     
     public void testJPEGTitleAndDescription() throws Exception {
@@ -114,13 +121,17 @@ public class JpegParserTest extends Test
         parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
           
         // embedded comments with non-ascii characters
-        //assertEquals("Tosteberga \u00C4ngar", metadata.get(Metadata.TITLE));
-        assertEquals("Tosteberga " + new String(new byte[]{-61, -124}) + "ngar", metadata.get(Metadata.TITLE));
-        //assertEquals("Bird site in north eastern Sk\u00E5ne, Sweden.\n(new line)", metadata.get(Metadata.DESCRIPTION));
-        assertEquals("Bird site in north eastern Sk" + new String(new byte[]{-61, -91}) + 
-        		"ne, Sweden.\n(new line)", metadata.get(Metadata.DESCRIPTION));
+        assertEquals("Tosteberga \u00C4ngar", metadata.get(Metadata.TITLE));
+        assertEquals("Bird site in north eastern Sk\u00E5ne, Sweden.\n(new line)", metadata.get(Metadata.DESCRIPTION));
         assertEquals("Some Tourist", metadata.get(Metadata.AUTHOR));
-        assertEquals("grazelands nature reserve bird watching coast", metadata.get(Metadata.KEYWORDS));
+        assertEquals("Some Tourist", metadata.get(Metadata.CREATOR)); // Dublin Core
+        // xmp handles spaces in keywords, returns "bird watching, nature reserve, coast, grazelands"
+        // but we have to replace them with underscore
+        
+        List<String> keywords = Arrays.asList(metadata.getValues(Metadata.KEYWORDS));
+        assertTrue(keywords.contains("coast"));
+        assertTrue(keywords.contains("bird watching"));
+        assertEquals(keywords, Arrays.asList(metadata.getValues(Metadata.SUBJECT)));
         
         // Core EXIF/TIFF tags
         assertEquals("103", metadata.get(Metadata.IMAGE_WIDTH));
@@ -138,6 +149,39 @@ public class JpegParserTest extends Test
         assertEquals("1", metadata.get(Metadata.ORIENTATION)); // Not present
         assertEquals("300.0", metadata.get(Metadata.RESOLUTION_HORIZONTAL));
         assertEquals("300.0", metadata.get(Metadata.RESOLUTION_VERTICAL));
-        assertEquals("Inch", metadata.get(Metadata.RESOLUTION_UNIT));
+        assertEquals("Inch", metadata.get(Metadata.RESOLUTION_UNIT));          
+    }
+    
+    public void testJPEGTitleAndDescriptionPhotoshop() throws Exception { // runs the parser on the pspcs2mac sample
+        Metadata metadata = new Metadata();
+        metadata.set(Metadata.CONTENT_TYPE, "image/jpeg");
+        InputStream stream =
+            getClass().getResourceAsStream("/test-documents/testJPEG_commented_pspcs2mac.jpg");
+        parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
+          
+        // embedded comments with non-ascii characters
+        assertEquals("Tosteberga \u00C4ngar", metadata.get(Metadata.TITLE));
+        assertEquals("Bird site in north eastern Sk\u00E5ne, Sweden.\n(new line)", metadata.get(Metadata.DESCRIPTION));
+        assertEquals("Some Tourist", metadata.get(Metadata.CREATOR));
+        List<String> subject = Arrays.asList(metadata.getValues(Metadata.SUBJECT));
+        assertTrue("got " + subject, subject.contains("bird watching")); 
+    }
+    
+    public void testJPEGTitleAndDescriptionXnviewmp() throws Exception { // runs the parser on the xnviewmp026 sample
+        Metadata metadata = new Metadata();
+        metadata.set(Metadata.CONTENT_TYPE, "image/jpeg");
+        InputStream stream =
+            getClass().getResourceAsStream("/test-documents/testJPEG_commented_xnviewmp026.jpg");
+        parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
+          
+        // XnViewMp's default comment dialog has only comment, not headline.
+        // Comment is embedded only if "Write comments in XMP" is enabled in settings
+        assertEquals("Bird site in north eastern Sk\u00E5ne, Sweden.\n(new line)", metadata.get(Metadata.DESCRIPTION));
+        // xmp handles spaces in keywords, returns "bird watching, nature reserve, coast, grazelands"
+        // NOTE(review): this said "but we have to replace them with underscore", yet the asserts below expect spaces - confirm
+        String[] subject = metadata.getValues(Metadata.SUBJECT);
+        List<String> keywords = Arrays.asList(subject);
+        assertTrue("'coast'" + " not in " + keywords, keywords.contains("coast"));
+        assertTrue("'nature reserve'" + " not in " + keywords, keywords.contains("nature reserve"));     
     }
 }

Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testJPEG_commented_pspcs2mac.jpg
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testJPEG_commented_pspcs2mac.jpg?rev=1033546&view=auto
==============================================================================
Binary file - no diff available.

Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testJPEG_commented_pspcs2mac.jpg
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testJPEG_commented_xnviewmp026.jpg
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testJPEG_commented_xnviewmp026.jpg?rev=1033546&view=auto
==============================================================================
Binary file - no diff available.

Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testJPEG_commented_xnviewmp026.jpg
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Modified: tika/trunk/tika-parsers/src/test/resources/test-documents/testTIFF.tif
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testTIFF.tif?rev=1033546&r1=1033545&r2=1033546&view=diff
==============================================================================
Binary files - no diff available.