You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2010/09/03 17:01:13 UTC

svn commit: r992319 - in /tika/trunk: tika-core/src/main/java/org/apache/tika/metadata/ tika-core/src/test/java/org/apache/tika/metadata/ tika-parsers/src/main/java/org/apache/tika/parser/image/ tika-parsers/src/main/java/org/apache/tika/parser/jpeg/ t...

Author: nick
Date: Fri Sep  3 15:01:12 2010
New Revision: 992319

URL: http://svn.apache.org/viewvc?rev=992319&view=rev
Log:
Apply Staffan Olsson's patch from TIKA-482 (with a few tweaks), which improves how EXIF metadata is processed from TIFF and JPEG files, and moves more of the Date properties to be real ISO8601 dates internally.

Added:
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageMetadataExtractor.java
Removed:
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/TiffExtractor.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/jpeg/JpegExtractor.java
Modified:
    tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/DublinCore.java
    tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/TIFF.java
    tika/trunk/tika-core/src/test/java/org/apache/tika/metadata/TestMetadata.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/TiffParser.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/jpeg/JpegParser.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mbox/MboxParser.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/DcXMLParser.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/ImageParserTest.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/xml/DcXMLParserTest.java
    tika/trunk/tika-parsers/src/test/resources/test-documents/testXML.xml

Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/DublinCore.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/DublinCore.java?rev=992319&r1=992318&r2=992319&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/DublinCore.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/DublinCore.java Fri Sep  3 15:01:12 2010
@@ -82,9 +82,8 @@ public interface DublinCore {
      * the resource. Recommended best practice for encoding the date value is
      * defined in a profile of ISO 8601 [W3CDTF] and follows the YYYY-MM-DD
      * format.
-     * TODO Make me a Date Property
      */
-    String DATE = "date";
+    Property DATE = Property.internalDate("date");
 
     /**
      * An account of the content of the resource. Description may include

Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/TIFF.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/TIFF.java?rev=992319&r1=992318&r2=992319&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/TIFF.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/TIFF.java Fri Sep  3 15:01:12 2010
@@ -51,4 +51,9 @@ public interface TIFF {
     Property SAMPLES_PER_PIXEL =
         Property.internalInteger("tiff:SamplesPerPixel");
 
+    /**
+     * "Date and time when original image was generated"
+     */
+    Property ORIGINAL_DATE =
+       Property.internalDate("exif:DateTimeOriginal");
 }

Modified: tika/trunk/tika-core/src/test/java/org/apache/tika/metadata/TestMetadata.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/test/java/org/apache/tika/metadata/TestMetadata.java?rev=992319&r1=992318&r2=992319&view=diff
==============================================================================
--- tika/trunk/tika-core/src/test/java/org/apache/tika/metadata/TestMetadata.java (original)
+++ tika/trunk/tika-core/src/test/java/org/apache/tika/metadata/TestMetadata.java Fri Sep  3 15:01:12 2010
@@ -309,4 +309,17 @@ public class TestMetadata extends TestCa
         meta.set(Metadata.CREATION_DATE, "1969-12-31T12:00:01-12:00");
         assertEquals(1000, meta.getDate(Metadata.CREATION_DATE).getTime());
     }
+    
+    /**
+     * Some documents, like JPEGs, might have a date in an unspecified time zone,
+     * which should be handled like a string but verified to have a parseable ISO 8601 format
+     */
+    public void testGetSetDateUnspecifiedTimezone() {
+        Metadata meta = new Metadata();    
+        
+        meta.set(Metadata.DATE, "1970-01-01T00:00:01");
+        assertEquals("should return string without time zone specifier because zone is not known",
+        		"1970-01-01T00:00:01", meta.get(Metadata.DATE));
+    }
+    
 }

Added: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageMetadataExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageMetadataExtractor.java?rev=992319&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageMetadataExtractor.java (added)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageMetadataExtractor.java Fri Sep  3 15:01:12 2010
@@ -0,0 +1,230 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.image;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.text.DecimalFormat;
+import java.text.DecimalFormatSymbols;
+import java.util.Iterator;
+import java.util.Locale;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Geographic;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+import org.xml.sax.SAXException;
+
+import com.drew.imaging.jpeg.JpegMetadataReader;
+import com.drew.imaging.jpeg.JpegProcessingException;
+import com.drew.imaging.tiff.TiffMetadataReader;
+import com.drew.imaging.tiff.TiffProcessingException;
+import com.drew.metadata.Directory;
+import com.drew.metadata.MetadataException;
+import com.drew.metadata.Tag;
+
+public class ImageMetadataExtractor {
+
+    private final Metadata metadata;
+
+    public ImageMetadataExtractor(Metadata metadata) {
+        this.metadata = metadata;
+    }
+
+    public void parseTiff(InputStream stream)
+            throws IOException, SAXException, TikaException {
+        try {
+            com.drew.metadata.Metadata tiffMetadata =
+                TiffMetadataReader.readMetadata(stream);
+            parse(tiffMetadata);
+        } catch (TiffProcessingException e) {
+            throw new TikaException("Can't read TIFF metadata", e);
+        }
+    }
+    
+    public void parseJpeg(InputStream stream)
+            throws IOException, SAXException, TikaException {
+       try {
+          com.drew.metadata.Metadata jpegMetadata =
+             JpegMetadataReader.readMetadata(stream);
+          parse(jpegMetadata);
+       } catch (JpegProcessingException e) {
+          throw new TikaException("Can't read JPEG metadata", e);
+       }
+    }
+    
+    protected void parse(com.drew.metadata.Metadata imageMetadata)
+            throws IOException, SAXException, TikaException {
+       try {
+          Iterator<?> directories = imageMetadata.getDirectoryIterator();
+          while (directories.hasNext()) {
+             Directory directory = (Directory) directories.next();
+             Iterator<?> tags = directory.getTagIterator();
+
+             while (tags.hasNext()) {
+                Tag tag = (Tag)tags.next();
+                metadata.set(tag.getTagName(), tag.getDescription());
+                handleCommonImageTags(metadata, tag);
+             }
+             handleGeoImageTags(metadata);
+          }
+       } catch (MetadataException e) {
+          throw new TikaException("Can't read TIFF/JPEG metadata", e);
+       }
+    }
+
+    /**
+     * Maps EXIF Geo Tags onto the Tika Geo metadata namespace.
+     * Needs to be run at the end, because the GPS information
+     *  is spread across several EXIF tags.
+     */
+    public static void handleGeoImageTags(Metadata metadata) {
+        String lat = metadata.get("GPS Latitude");
+        String latNS = metadata.get("GPS Latitude Ref");
+        if(lat != null) {
+            Double latitude = parseHMS(lat);
+            if(latitude != null) {
+                if(latNS != null && latNS.equalsIgnoreCase("S") &&
+                        latitude > 0) {
+                    latitude *= -1;
+                }
+                metadata.set(Metadata.LATITUDE, LAT_LONG_FORMAT.format(latitude)); 
+            }
+        }
+
+        String lng = metadata.get("GPS Longitude");
+        String lngEW = metadata.get("GPS Longitude Ref");
+        if(lng != null) {
+            Double longitude = parseHMS(lng);
+            if(longitude != null) {
+                if(lngEW != null && lngEW.equalsIgnoreCase("W") &&
+                        longitude > 0) {
+                    longitude *= -1;
+                }
+                metadata.set(Metadata.LONGITUDE, LAT_LONG_FORMAT.format(longitude));
+            }
+        }
+    }
+    private static Double parseHMS(String hms) {
+       Matcher m = HOURS_MINUTES_SECONDS.matcher(hms);
+       if(m.matches()) {
+          double value = 
+            Integer.parseInt(m.group(1)) +
+            (Integer.parseInt(m.group(2))/60.0) +
+            (Double.parseDouble(m.group(3))/60.0/60.0);
+          return value;
+       }
+       return null;
+    }
+    private static final Pattern HOURS_MINUTES_SECONDS = Pattern.compile("(-?\\d+)\"(\\d+)'(\\d+\\.?\\d*)");
+    /**
+     * The decimal format used for expressing latitudes and longitudes.
+     * The basic geo vocabulary defined by W3C (@see {@link Geographic})
+     * refers to the "float" type in XML Schema as the recommended format
+     * for latitude and longitude values.
+     */
+    private static final DecimalFormat LAT_LONG_FORMAT =
+        new DecimalFormat("##0.0####", new DecimalFormatSymbols(Locale.US));
+
+    private static void handleDate(Metadata metadata, Property property, Tag tag) throws MetadataException {
+       // Ensure it's in the right format
+       String date = tag.getDescription();
+       int splitAt = date.indexOf(' '); 
+       if(splitAt > -1) {
+           String datePart = date.substring(0, splitAt);
+           String timePart = date.substring(splitAt+1);
+           date = datePart.replace(':', '-') + 'T' + timePart;
+       }
+       metadata.set(property, date);
+    }
+
+    /**
+     * Maps common TIFF and EXIF tags onto the Tika
+     *  TIFF image metadata namespace.
+     */
+    public static void handleCommonImageTags(Metadata metadata, Tag tag) throws MetadataException {
+        // Core tags
+        if(tag.getTagName().equals("Date/Time") ||
+                tag.getTagType() == 306) {
+            handleDate(metadata, Metadata.DATE, tag);
+            handleDate(metadata, Metadata.LAST_MODIFIED, tag);
+            return;
+        }
+        if(tag.getTagName().equals("Date/Time Original") ||
+              tag.getTagType() == 36867) {
+          handleDate(metadata, Metadata.ORIGINAL_DATE, tag);
+          return;
+      }
+        if(tag.getTagName().equals("Keywords") ||
+                tag.getTagType() == 537) {
+            metadata.set(Metadata.KEYWORDS, tag.getDescription());
+            return;
+        }
+        if(tag.getTagName().equals("Jpeg Comment")) {
+            metadata.set(Metadata.COMMENTS, tag.getDescription());
+            return;
+        }
+
+        // File info
+        // Metadata Extractor does not read XMP so we need to use the values from Iptc or EXIF
+        if("Iptc".equals(tag.getDirectoryName())) {
+            if("Object Name".equals(tag.getTagName())) {
+                metadata.set(Metadata.TITLE, tag.getDescription());
+                return;
+            }
+            if("By-line".equals(tag.getTagName())) {
+                metadata.set(Metadata.AUTHOR, tag.getDescription());
+                return;
+            }		
+            if("Caption/Abstract".equals(tag.getTagName())) {
+                // Metadata Extractor appears to return IPTC newlines as a single carriage return,
+                // but the exiv2 command does not, so we convert to line feeds here because that is less surprising to users
+                metadata.set(Metadata.DESCRIPTION, tag.getDescription().replaceAll("\r\n?", "\n"));
+                return;
+            }
+        }
+
+        // EXIF / TIFF Tags
+        Property key = null;
+        if(tag.getTagName().equals("Image Width") ||
+                tag.getTagType() == 256) { 
+            key = Metadata.IMAGE_WIDTH;
+        }
+        if(tag.getTagName().equals("Image Height") ||
+                tag.getTagType() == 257) {
+            key = Metadata.IMAGE_LENGTH;
+        }
+        if(tag.getTagName().equals("Data Precision") ||
+                tag.getTagName().equals("Bits Per Sample") ||
+                tag.getTagType() == 258) {
+            key = Metadata.BITS_PER_SAMPLE;
+        }
+        if(tag.getTagType() == 277) {
+            key = Metadata.SAMPLES_PER_PIXEL;
+        }
+
+        if(key != null) {
+            Matcher m = LEADING_NUMBERS.matcher(tag.getDescription());
+            if(m.matches()) {
+                metadata.set(key, m.group(1));
+            }
+        }
+    }
+    private static final Pattern LEADING_NUMBERS = Pattern.compile("(\\d+)\\s*.*");
+}

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/TiffParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/TiffParser.java?rev=992319&r1=992318&r2=992319&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/TiffParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/TiffParser.java Fri Sep  3 15:01:12 2010
@@ -52,7 +52,7 @@ public class TiffParser implements Parse
             InputStream stream, ContentHandler handler,
             Metadata metadata, ParseContext context)
             throws IOException, SAXException, TikaException {
-        new TiffExtractor(metadata).parse(stream);
+        new ImageMetadataExtractor(metadata).parseTiff(stream);
 
         for (String s : metadata.names()) {
             if (s.startsWith("Unknown tag")) {

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/jpeg/JpegParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/jpeg/JpegParser.java?rev=992319&r1=992318&r2=992319&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/jpeg/JpegParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/jpeg/JpegParser.java Fri Sep  3 15:01:12 2010
@@ -26,6 +26,7 @@ import org.apache.tika.metadata.Metadata
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.image.ImageMetadataExtractor;
 import org.apache.tika.sax.XHTMLContentHandler;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
@@ -52,7 +53,7 @@ public class JpegParser implements Parse
             InputStream stream, ContentHandler handler,
             Metadata metadata, ParseContext context)
             throws IOException, SAXException, TikaException {
-        new JpegExtractor(metadata).parse(stream);
+        new ImageMetadataExtractor(metadata).parseJpeg(stream);
 
         XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
         xhtml.startDocument();

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mbox/MboxParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mbox/MboxParser.java?rev=992319&r1=992318&r2=992319&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mbox/MboxParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mbox/MboxParser.java Fri Sep  3 15:01:12 2010
@@ -21,7 +21,11 @@ import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.io.UnsupportedEncodingException;
+import java.text.ParseException;
+import java.text.SimpleDateFormat;
 import java.util.Collections;
+import java.util.Date;
+import java.util.Locale;
 import java.util.Set;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
@@ -213,8 +217,11 @@ public class MboxParser implements Parse
             metadata.add(Metadata.SUBJECT, headerContent);
             metadata.add(Metadata.TITLE, headerContent);
         } else if (headerTag.equalsIgnoreCase("Date")) {
-            // TODO - parse and convert to ISO format YYYY-MM-DD
-            metadata.add(Metadata.DATE, headerContent);
+            try {
+                metadata.set(Metadata.DATE, parseDate(headerContent));
+            } catch (ParseException e) {
+                // ignoring date because format was not understood
+            }
         } else if (headerTag.equalsIgnoreCase("Message-Id")) {
             metadata.add(Metadata.IDENTIFIER, headerContent);
         } else if (headerTag.equalsIgnoreCase("In-Reply-To")) {
@@ -229,6 +236,11 @@ public class MboxParser implements Parse
             metadata.add(EMAIL_HEADER_METADATA_PREFIX + headerTag, headerContent);
         }
     }
+    
+    private Date parseDate(String headerContent) throws ParseException {
+        SimpleDateFormat dateFormat = new SimpleDateFormat("EEE, d MMM yyyy HH:mm:ss Z", Locale.US);
+        return dateFormat.parse(headerContent);
+    }
 
     public void parse(
             InputStream stream, ContentHandler handler, Metadata metadata)

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java?rev=992319&r1=992318&r2=992319&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java Fri Sep  3 15:01:12 2010
@@ -61,7 +61,7 @@ public class MetadataExtractor {
         addProperty(metadata, Metadata.CONTENT_STATUS, propsHolder
                 .getContentStatusProperty());
         addProperty(metadata, Metadata.DATE, propsHolder
-                .getCreatedPropertyString());
+                .getCreatedProperty());
         addProperty(metadata, Metadata.CREATION_DATE, propsHolder
                 .getCreatedProperty());
         addProperty(metadata, Metadata.CREATOR, propsHolder

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/DcXMLParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/DcXMLParser.java?rev=992319&r1=992318&r2=992319&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/DcXMLParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/DcXMLParser.java Fri Sep  3 15:01:12 2010
@@ -51,7 +51,12 @@ public class DcXMLParser extends XMLPars
         ch = getDublinCore(ch, md, DublinCore.DESCRIPTION, "description");
         ch = getDublinCore(ch, md, DublinCore.PUBLISHER, "publisher");
         ch = getDublinCore(ch, md, DublinCore.CONTRIBUTOR, "contributor");
-        ch = getDublinCore(ch, md, DublinCore.DATE, "date");
+        try {
+            ch = getDublinCore(ch, md, DublinCore.DATE.getName(), "date");
+        } catch (Exception e) {
+            // Date format and parsing behavior were undefined and untested when the DublinCore
+            // date was converted to Property.internalDate, so we silently skip the date on a parse error
+        }
         ch = getDublinCore(ch, md, DublinCore.TYPE, "type");
         ch = getDublinCore(ch, md, DublinCore.FORMAT, "format");
         ch = getDublinCore(ch, md, DublinCore.IDENTIFIER, "identifier");

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/ImageParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/ImageParserTest.java?rev=992319&r1=992318&r2=992319&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/ImageParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/ImageParserTest.java Fri Sep  3 15:01:12 2010
@@ -151,21 +151,4 @@ public class ImageParserTest extends Tes
         assertEquals("8 8 8", metadata.get(Metadata.BITS_PER_SAMPLE));
     }
 
-// TODO: Add TIFF support
-//    public void testTIFF() throws Exception {
-//        Metadata metadata = new Metadata();
-//        metadata.set(Metadata.CONTENT_TYPE, "image/tiff");
-//        InputStream stream =
-//            getClass().getResourceAsStream("/test-documents/testTIFF.tif");
-//        parser.parse(stream, new DefaultHandler(), metadata);
-//
-//        assertEquals("75", metadata.get("height"));
-//        assertEquals("100", metadata.get("width"));
-//    
-//        assertEquals("100", metadata.get(Metadata.IMAGE_WIDTH));
-//        assertEquals("75", metadata.get(Metadata.IMAGE_LENGTH));
-//        assertEquals("8 8 8", metadata.get(Metadata.BITS_PER_SAMPLE));
-//        assertEquals("Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements.  See the NOTICE file distributed with this work for additional information regarding copyright ownership.", metadata.get(Metadata.COMMENTS));
-//    }
-
 }

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java?rev=992319&r1=992318&r2=992319&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java Fri Sep  3 15:01:12 2010
@@ -45,7 +45,10 @@ public class JpegParserTest extends Test
         assertEquals(null, metadata.get(Metadata.SAMPLES_PER_PIXEL));
         
         // Common tags
-        assertEquals("2009/10/02 23:02:49", metadata.get(Metadata.DATE));
+        assertEquals("Date/Time for when the photo was taken, unspecified time zone",
+                "2009-10-02T23:02:49", metadata.get(Metadata.DATE));
+        assertEquals("Date/Time Original for when the photo was taken, unspecified time zone",
+                "2009-08-11T09:09:45", metadata.get(Metadata.ORIGINAL_DATE));
         assertEquals("canon-55-250 moscow-birds serbor", metadata.get(Metadata.KEYWORDS));
     }
 
@@ -70,7 +73,10 @@ public class JpegParserTest extends Test
         assertEquals(null, metadata.get(Metadata.SAMPLES_PER_PIXEL));
         
         // Common tags
-        assertEquals("2009/10/02 23:02:49", metadata.get(Metadata.DATE));
+        assertEquals("Date/Time Original for when the photo was taken, unspecified time zone",
+                "2009-08-11T09:09:45", metadata.get(Metadata.ORIGINAL_DATE));
+        assertEquals("This image has different Date/Time than Date/Time Original, so it is probably modification date",
+                "2009-10-02T23:02:49", metadata.get(Metadata.DATE));
         assertEquals("canon-55-250 moscow-birds serbor", metadata.get(Metadata.KEYWORDS));
     }
     
@@ -88,9 +94,6 @@ public class JpegParserTest extends Test
         assertEquals("Bird site in north eastern Sk" + new String(new byte[]{-61, -91}) + 
         		"ne, Sweden.\n(new line)", metadata.get(Metadata.DESCRIPTION));
         assertEquals("Some Tourist", metadata.get(Metadata.AUTHOR));
-        // xmp handles spaces in keywords, returns "bird watching, nature reserve, coast, grazelands"
-        //assertEquals("bird watching nature reserve coast grazelands", metadata.get(Metadata.KEYWORDS));
-        // ordering is odd when returned from parser as one string
         assertEquals("grazelands nature reserve bird watching coast", metadata.get(Metadata.KEYWORDS));
     }
 }

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java?rev=992319&r1=992318&r2=992319&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java Fri Sep  3 15:01:12 2010
@@ -75,7 +75,8 @@ public class MboxParserTest extends Test
             assertEquals("<au...@domain.com>", metadata.get(Metadata.CREATOR));
             assertEquals(null, metadata.get(Metadata.MESSAGE_RECIPIENT_ADDRESS));
             assertEquals("<na...@domain.com>", metadata.get("MboxParser-return-path"));
-            assertEquals("Tue, 9 Jun 2009 23:58:45 -0400", metadata.get(Metadata.DATE));
+            assertEquals("Should be ISO date in UTC, converted from 'Tue, 9 Jun 2009 23:58:45 -0400'", 
+                    "2009-06-10T03:58:45Z", metadata.get(Metadata.DATE));
         } catch (Exception e) {
             fail("Exception thrown: " + e.getMessage());
         }

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/xml/DcXMLParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/xml/DcXMLParserTest.java?rev=992319&r1=992318&r2=992319&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/xml/DcXMLParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/xml/DcXMLParserTest.java Fri Sep  3 15:01:12 2010
@@ -56,6 +56,8 @@ public class DcXMLParserTest extends Tes
 
             String content = handler.toString();
             assertTrue(content.contains("Tika test document"));
+            
+            assertEquals("2000-12-01T00:00:00.000Z", metadata.get(Metadata.DATE));
         } finally {
             input.close();
         }

Modified: tika/trunk/tika-parsers/src/test/resources/test-documents/testXML.xml
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testXML.xml?rev=992319&r1=992318&r2=992319&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/resources/test-documents/testXML.xml (original)
+++ tika/trunk/tika-parsers/src/test/resources/test-documents/testXML.xml Fri Sep  3 15:01:12 2010
@@ -35,7 +35,7 @@
 
 	<dc:identifier>http://www.apache.org</dc:identifier>
 
-	<dc:date>2000-12</dc:date>
+	<dc:date>2000-12-01T00:00:00.000Z</dc:date>
 
 	<dc:type>test</dc:type>