You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2010/09/03 17:01:13 UTC
svn commit: r992319 - in /tika/trunk:
tika-core/src/main/java/org/apache/tika/metadata/
tika-core/src/test/java/org/apache/tika/metadata/
tika-parsers/src/main/java/org/apache/tika/parser/image/
tika-parsers/src/main/java/org/apache/tika/parser/jpeg/ t...
Author: nick
Date: Fri Sep 3 15:01:12 2010
New Revision: 992319
URL: http://svn.apache.org/viewvc?rev=992319&view=rev
Log:
Apply Staffan Olsson's patch from TIKA-482 (with a few tweaks), which improves how EXIF metadata is processed from TIFF and JPEG files, and moves more of the Date properties to be real ISO8601 dates internally.
Added:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageMetadataExtractor.java
Removed:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/TiffExtractor.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/jpeg/JpegExtractor.java
Modified:
tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/DublinCore.java
tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/TIFF.java
tika/trunk/tika-core/src/test/java/org/apache/tika/metadata/TestMetadata.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/TiffParser.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/jpeg/JpegParser.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mbox/MboxParser.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/DcXMLParser.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/ImageParserTest.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/xml/DcXMLParserTest.java
tika/trunk/tika-parsers/src/test/resources/test-documents/testXML.xml
Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/DublinCore.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/DublinCore.java?rev=992319&r1=992318&r2=992319&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/DublinCore.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/DublinCore.java Fri Sep 3 15:01:12 2010
@@ -82,9 +82,8 @@ public interface DublinCore {
* the resource. Recommended best practice for encoding the date value is
* defined in a profile of ISO 8601 [W3CDTF] and follows the YYYY-MM-DD
* format.
- * TODO Make me a Date Property
*/
- String DATE = "date";
+ Property DATE = Property.internalDate("date");
/**
* An account of the content of the resource. Description may include
Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/TIFF.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/TIFF.java?rev=992319&r1=992318&r2=992319&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/TIFF.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/TIFF.java Fri Sep 3 15:01:12 2010
@@ -51,4 +51,9 @@ public interface TIFF {
Property SAMPLES_PER_PIXEL =
Property.internalInteger("tiff:SamplesPerPixel");
+ /**
+ * "Date and time when original image was generated"
+ */
+ Property ORIGINAL_DATE =
+ Property.internalDate("exif:DateTimeOriginal");
}
Modified: tika/trunk/tika-core/src/test/java/org/apache/tika/metadata/TestMetadata.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/test/java/org/apache/tika/metadata/TestMetadata.java?rev=992319&r1=992318&r2=992319&view=diff
==============================================================================
--- tika/trunk/tika-core/src/test/java/org/apache/tika/metadata/TestMetadata.java (original)
+++ tika/trunk/tika-core/src/test/java/org/apache/tika/metadata/TestMetadata.java Fri Sep 3 15:01:12 2010
@@ -309,4 +309,17 @@ public class TestMetadata extends TestCa
meta.set(Metadata.CREATION_DATE, "1969-12-31T12:00:01-12:00");
assertEquals(1000, meta.getDate(Metadata.CREATION_DATE).getTime());
}
+
+ /**
+ * Some documents, like JPEGs, might have a date in an unspecified time zone,
+ * which should be handled like a string but verified to be in a parseable ISO 8601 format.
+ */
+ public void testGetSetDateUnspecifiedTimezone() {
+ Metadata meta = new Metadata();
+
+ meta.set(Metadata.DATE, "1970-01-01T00:00:01");
+ assertEquals("should return string without time zone specifier because zone is not known",
+ "1970-01-01T00:00:01", meta.get(Metadata.DATE));
+ }
+
}
Added: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageMetadataExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageMetadataExtractor.java?rev=992319&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageMetadataExtractor.java (added)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageMetadataExtractor.java Fri Sep 3 15:01:12 2010
@@ -0,0 +1,230 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.image;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.text.DecimalFormat;
+import java.text.DecimalFormatSymbols;
+import java.util.Iterator;
+import java.util.Locale;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Geographic;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+import org.xml.sax.SAXException;
+
+import com.drew.imaging.jpeg.JpegMetadataReader;
+import com.drew.imaging.jpeg.JpegProcessingException;
+import com.drew.imaging.tiff.TiffMetadataReader;
+import com.drew.imaging.tiff.TiffProcessingException;
+import com.drew.metadata.Directory;
+import com.drew.metadata.MetadataException;
+import com.drew.metadata.Tag;
+
+public class ImageMetadataExtractor {
+
+ private final Metadata metadata;
+
+ public ImageMetadataExtractor(Metadata metadata) {
+ this.metadata = metadata;
+ }
+
+ public void parseTiff(InputStream stream)
+ throws IOException, SAXException, TikaException {
+ try {
+ com.drew.metadata.Metadata tiffMetadata =
+ TiffMetadataReader.readMetadata(stream);
+ parse(tiffMetadata);
+ } catch (TiffProcessingException e) {
+ throw new TikaException("Can't read TIFF metadata", e);
+ }
+ }
+
+ public void parseJpeg(InputStream stream)
+ throws IOException, SAXException, TikaException {
+ try {
+ com.drew.metadata.Metadata jpegMetadata =
+ JpegMetadataReader.readMetadata(stream);
+ parse(jpegMetadata);
+ } catch (JpegProcessingException e) {
+ throw new TikaException("Can't read JPEG metadata", e);
+ }
+ }
+
+ protected void parse(com.drew.metadata.Metadata imageMetadata)
+ throws IOException, SAXException, TikaException {
+ try {
+ Iterator<?> directories = imageMetadata.getDirectoryIterator();
+ while (directories.hasNext()) {
+ Directory directory = (Directory) directories.next();
+ Iterator<?> tags = directory.getTagIterator();
+
+ while (tags.hasNext()) {
+ Tag tag = (Tag)tags.next();
+ metadata.set(tag.getTagName(), tag.getDescription());
+ handleCommonImageTags(metadata, tag);
+ }
+ handleGeoImageTags(metadata);
+ }
+ } catch (MetadataException e) {
+ throw new TikaException("Can't read TIFF/JPEG metadata", e);
+ }
+ }
+
+ /**
+ * Maps EXIF Geo Tags onto the Tika Geo metadata namespace.
+ * Needs to be run at the end, because the GPS information
+ * is spread across several EXIF tags.
+ */
+ public static void handleGeoImageTags(Metadata metadata) {
+ String lat = metadata.get("GPS Latitude");
+ String latNS = metadata.get("GPS Latitude Ref");
+ if(lat != null) {
+ Double latitude = parseHMS(lat);
+ if(latitude != null) {
+ if(latNS != null && latNS.equalsIgnoreCase("S") &&
+ latitude > 0) {
+ latitude *= -1;
+ }
+ metadata.set(Metadata.LATITUDE, LAT_LONG_FORMAT.format(latitude));
+ }
+ }
+
+ String lng = metadata.get("GPS Longitude");
+ String lngEW = metadata.get("GPS Longitude Ref");
+ if(lng != null) {
+ Double longitude = parseHMS(lng);
+ if(longitude != null) {
+ if(lngEW != null && lngEW.equalsIgnoreCase("W") &&
+ longitude > 0) {
+ longitude *= -1;
+ }
+ metadata.set(Metadata.LONGITUDE, LAT_LONG_FORMAT.format(longitude));
+ }
+ }
+ }
+ private static Double parseHMS(String hms) {
+ Matcher m = HOURS_MINUTES_SECONDS.matcher(hms);
+ if(m.matches()) {
+ double value =
+ Integer.parseInt(m.group(1)) +
+ (Integer.parseInt(m.group(2))/60.0) +
+ (Double.parseDouble(m.group(3))/60.0/60.0);
+ return value;
+ }
+ return null;
+ }
+ private static final Pattern HOURS_MINUTES_SECONDS = Pattern.compile("(-?\\d+)\"(\\d+)'(\\d+\\.?\\d*)");
+ /**
+ * The decimal format used for expressing latitudes and longitudes.
+ * The basic geo vocabulary defined by W3C (see {@link Geographic})
+ * refers to the "float" type in XML Schema as the recommended format
+ * for latitude and longitude values.
+ */
+ private static final DecimalFormat LAT_LONG_FORMAT =
+ new DecimalFormat("##0.0####", new DecimalFormatSymbols(Locale.US));
+
+ private static void handleDate(Metadata metadata, Property property, Tag tag) throws MetadataException {
+ // Ensure it's in the right format
+ String date = tag.getDescription();
+ int splitAt = date.indexOf(' ');
+ if(splitAt > -1) {
+ String datePart = date.substring(0, splitAt);
+ String timePart = date.substring(splitAt+1);
+ date = datePart.replace(':', '-') + 'T' + timePart;
+ }
+ metadata.set(property, date);
+ }
+
+ /**
+ * Maps common TIFF and EXIF tags onto the Tika
+ * TIFF image metadata namespace.
+ */
+ public static void handleCommonImageTags(Metadata metadata, Tag tag) throws MetadataException {
+ // Core tags
+ if(tag.getTagName().equals("Date/Time") ||
+ tag.getTagType() == 306) {
+ handleDate(metadata, Metadata.DATE, tag);
+ handleDate(metadata, Metadata.LAST_MODIFIED, tag);
+ return;
+ }
+ if(tag.getTagName().equals("Date/Time Original") ||
+ tag.getTagType() == 36867) {
+ handleDate(metadata, Metadata.ORIGINAL_DATE, tag);
+ return;
+ }
+ if(tag.getTagName().equals("Keywords") ||
+ tag.getTagType() == 537) {
+ metadata.set(Metadata.KEYWORDS, tag.getDescription());
+ return;
+ }
+ if(tag.getTagName().equals("Jpeg Comment")) {
+ metadata.set(Metadata.COMMENTS, tag.getDescription());
+ return;
+ }
+
+ // File info
+ // Metadata Extractor does not read XMP so we need to use the values from Iptc or EXIF
+ if("Iptc".equals(tag.getDirectoryName())) {
+ if("Object Name".equals(tag.getTagName())) {
+ metadata.set(Metadata.TITLE, tag.getDescription());
+ return;
+ }
+ if("By-line".equals(tag.getTagName())) {
+ metadata.set(Metadata.AUTHOR, tag.getDescription());
+ return;
+ }
+ if("Caption/Abstract".equals(tag.getTagName())) {
+ // Looks like metadata extractor returns IPTC newlines as a single carriage return,
+ // but the exiv2 command does not so we change to line feed here because that is less surprising to users
+ metadata.set(Metadata.DESCRIPTION, tag.getDescription().replaceAll("\r\n?", "\n"));
+ return;
+ }
+ }
+
+ // EXIF / TIFF Tags
+ Property key = null;
+ if(tag.getTagName().equals("Image Width") ||
+ tag.getTagType() == 256) {
+ key = Metadata.IMAGE_WIDTH;
+ }
+ if(tag.getTagName().equals("Image Height") ||
+ tag.getTagType() == 257) {
+ key = Metadata.IMAGE_LENGTH;
+ }
+ if(tag.getTagName().equals("Data Precision") ||
+ tag.getTagName().equals("Bits Per Sample") ||
+ tag.getTagType() == 258) {
+ key = Metadata.BITS_PER_SAMPLE;
+ }
+ if(tag.getTagType() == 277) {
+ key = Metadata.SAMPLES_PER_PIXEL;
+ }
+
+ if(key != null) {
+ Matcher m = LEADING_NUMBERS.matcher(tag.getDescription());
+ if(m.matches()) {
+ metadata.set(key, m.group(1));
+ }
+ }
+ }
+ private static final Pattern LEADING_NUMBERS = Pattern.compile("(\\d+)\\s*.*");
+}
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/TiffParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/TiffParser.java?rev=992319&r1=992318&r2=992319&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/TiffParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/TiffParser.java Fri Sep 3 15:01:12 2010
@@ -52,7 +52,7 @@ public class TiffParser implements Parse
InputStream stream, ContentHandler handler,
Metadata metadata, ParseContext context)
throws IOException, SAXException, TikaException {
- new TiffExtractor(metadata).parse(stream);
+ new ImageMetadataExtractor(metadata).parseTiff(stream);
for (String s : metadata.names()) {
if (s.startsWith("Unknown tag")) {
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/jpeg/JpegParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/jpeg/JpegParser.java?rev=992319&r1=992318&r2=992319&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/jpeg/JpegParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/jpeg/JpegParser.java Fri Sep 3 15:01:12 2010
@@ -26,6 +26,7 @@ import org.apache.tika.metadata.Metadata
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.image.ImageMetadataExtractor;
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
@@ -52,7 +53,7 @@ public class JpegParser implements Parse
InputStream stream, ContentHandler handler,
Metadata metadata, ParseContext context)
throws IOException, SAXException, TikaException {
- new JpegExtractor(metadata).parse(stream);
+ new ImageMetadataExtractor(metadata).parseJpeg(stream);
XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
xhtml.startDocument();
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mbox/MboxParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mbox/MboxParser.java?rev=992319&r1=992318&r2=992319&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mbox/MboxParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mbox/MboxParser.java Fri Sep 3 15:01:12 2010
@@ -21,7 +21,11 @@ import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
+import java.text.ParseException;
+import java.text.SimpleDateFormat;
import java.util.Collections;
+import java.util.Date;
+import java.util.Locale;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
@@ -213,8 +217,11 @@ public class MboxParser implements Parse
metadata.add(Metadata.SUBJECT, headerContent);
metadata.add(Metadata.TITLE, headerContent);
} else if (headerTag.equalsIgnoreCase("Date")) {
- // TODO - parse and convert to ISO format YYYY-MM-DD
- metadata.add(Metadata.DATE, headerContent);
+ try {
+ metadata.set(Metadata.DATE, parseDate(headerContent));
+ } catch (ParseException e) {
+ // ignoring date because format was not understood
+ }
} else if (headerTag.equalsIgnoreCase("Message-Id")) {
metadata.add(Metadata.IDENTIFIER, headerContent);
} else if (headerTag.equalsIgnoreCase("In-Reply-To")) {
@@ -229,6 +236,11 @@ public class MboxParser implements Parse
metadata.add(EMAIL_HEADER_METADATA_PREFIX + headerTag, headerContent);
}
}
+
+ private Date parseDate(String headerContent) throws ParseException {
+ SimpleDateFormat dateFormat = new SimpleDateFormat("EEE, d MMM yyyy HH:mm:ss Z", Locale.US);
+ return dateFormat.parse(headerContent);
+ }
public void parse(
InputStream stream, ContentHandler handler, Metadata metadata)
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java?rev=992319&r1=992318&r2=992319&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java Fri Sep 3 15:01:12 2010
@@ -61,7 +61,7 @@ public class MetadataExtractor {
addProperty(metadata, Metadata.CONTENT_STATUS, propsHolder
.getContentStatusProperty());
addProperty(metadata, Metadata.DATE, propsHolder
- .getCreatedPropertyString());
+ .getCreatedProperty());
addProperty(metadata, Metadata.CREATION_DATE, propsHolder
.getCreatedProperty());
addProperty(metadata, Metadata.CREATOR, propsHolder
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/DcXMLParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/DcXMLParser.java?rev=992319&r1=992318&r2=992319&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/DcXMLParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/xml/DcXMLParser.java Fri Sep 3 15:01:12 2010
@@ -51,7 +51,12 @@ public class DcXMLParser extends XMLPars
ch = getDublinCore(ch, md, DublinCore.DESCRIPTION, "description");
ch = getDublinCore(ch, md, DublinCore.PUBLISHER, "publisher");
ch = getDublinCore(ch, md, DublinCore.CONTRIBUTOR, "contributor");
- ch = getDublinCore(ch, md, DublinCore.DATE, "date");
+ try {
+ ch = getDublinCore(ch, md, DublinCore.DATE.getName(), "date");
+ } catch (Exception e) {
+ // Date format and parsing behavior were undefined and untested when the DublinCore
+ // date was converted to Property.internalDate, so we silently skip the date on parse error
+ }
ch = getDublinCore(ch, md, DublinCore.TYPE, "type");
ch = getDublinCore(ch, md, DublinCore.FORMAT, "format");
ch = getDublinCore(ch, md, DublinCore.IDENTIFIER, "identifier");
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/ImageParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/ImageParserTest.java?rev=992319&r1=992318&r2=992319&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/ImageParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/ImageParserTest.java Fri Sep 3 15:01:12 2010
@@ -151,21 +151,4 @@ public class ImageParserTest extends Tes
assertEquals("8 8 8", metadata.get(Metadata.BITS_PER_SAMPLE));
}
-// TODO: Add TIFF support
-// public void testTIFF() throws Exception {
-// Metadata metadata = new Metadata();
-// metadata.set(Metadata.CONTENT_TYPE, "image/tiff");
-// InputStream stream =
-// getClass().getResourceAsStream("/test-documents/testTIFF.tif");
-// parser.parse(stream, new DefaultHandler(), metadata);
-//
-// assertEquals("75", metadata.get("height"));
-// assertEquals("100", metadata.get("width"));
-//
-// assertEquals("100", metadata.get(Metadata.IMAGE_WIDTH));
-// assertEquals("75", metadata.get(Metadata.IMAGE_LENGTH));
-// assertEquals("8 8 8", metadata.get(Metadata.BITS_PER_SAMPLE));
-// assertEquals("Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the NOTICE file distributed with this work for additional information regarding copyright ownership.", metadata.get(Metadata.COMMENTS));
-// }
-
}
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java?rev=992319&r1=992318&r2=992319&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java Fri Sep 3 15:01:12 2010
@@ -45,7 +45,10 @@ public class JpegParserTest extends Test
assertEquals(null, metadata.get(Metadata.SAMPLES_PER_PIXEL));
// Common tags
- assertEquals("2009/10/02 23:02:49", metadata.get(Metadata.DATE));
+ assertEquals("Date/Time for when the photo was taken, unspecified time zone",
+ "2009-10-02T23:02:49", metadata.get(Metadata.DATE));
+ assertEquals("Date/Time Original for when the photo was taken, unspecified time zone",
+ "2009-08-11T09:09:45", metadata.get(Metadata.ORIGINAL_DATE));
assertEquals("canon-55-250 moscow-birds serbor", metadata.get(Metadata.KEYWORDS));
}
@@ -70,7 +73,10 @@ public class JpegParserTest extends Test
assertEquals(null, metadata.get(Metadata.SAMPLES_PER_PIXEL));
// Common tags
- assertEquals("2009/10/02 23:02:49", metadata.get(Metadata.DATE));
+ assertEquals("Date/Time Original for when the photo was taken, unspecified time zone",
+ "2009-08-11T09:09:45", metadata.get(Metadata.ORIGINAL_DATE));
+ assertEquals("This image has different Date/Time than Date/Time Original, so it is probably modification date",
+ "2009-10-02T23:02:49", metadata.get(Metadata.DATE));
assertEquals("canon-55-250 moscow-birds serbor", metadata.get(Metadata.KEYWORDS));
}
@@ -88,9 +94,6 @@ public class JpegParserTest extends Test
assertEquals("Bird site in north eastern Sk" + new String(new byte[]{-61, -91}) +
"ne, Sweden.\n(new line)", metadata.get(Metadata.DESCRIPTION));
assertEquals("Some Tourist", metadata.get(Metadata.AUTHOR));
- // xmp handles spaces in keywords, returns "bird watching, nature reserve, coast, grazelands"
- //assertEquals("bird watching nature reserve coast grazelands", metadata.get(Metadata.KEYWORDS));
- // ordering is odd when returned from parser as one string
assertEquals("grazelands nature reserve bird watching coast", metadata.get(Metadata.KEYWORDS));
}
}
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java?rev=992319&r1=992318&r2=992319&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java Fri Sep 3 15:01:12 2010
@@ -75,7 +75,8 @@ public class MboxParserTest extends Test
assertEquals("<au...@domain.com>", metadata.get(Metadata.CREATOR));
assertEquals(null, metadata.get(Metadata.MESSAGE_RECIPIENT_ADDRESS));
assertEquals("<na...@domain.com>", metadata.get("MboxParser-return-path"));
- assertEquals("Tue, 9 Jun 2009 23:58:45 -0400", metadata.get(Metadata.DATE));
+ assertEquals("Should be ISO date in UTC, converted from 'Tue, 9 Jun 2009 23:58:45 -0400'",
+ "2009-06-10T03:58:45Z", metadata.get(Metadata.DATE));
} catch (Exception e) {
fail("Exception thrown: " + e.getMessage());
}
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/xml/DcXMLParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/xml/DcXMLParserTest.java?rev=992319&r1=992318&r2=992319&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/xml/DcXMLParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/xml/DcXMLParserTest.java Fri Sep 3 15:01:12 2010
@@ -56,6 +56,8 @@ public class DcXMLParserTest extends Tes
String content = handler.toString();
assertTrue(content.contains("Tika test document"));
+
+ assertEquals("2000-12-01T00:00:00.000Z", metadata.get(Metadata.DATE));
} finally {
input.close();
}
Modified: tika/trunk/tika-parsers/src/test/resources/test-documents/testXML.xml
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testXML.xml?rev=992319&r1=992318&r2=992319&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/resources/test-documents/testXML.xml (original)
+++ tika/trunk/tika-parsers/src/test/resources/test-documents/testXML.xml Fri Sep 3 15:01:12 2010
@@ -35,7 +35,7 @@
<dc:identifier>http://www.apache.org</dc:identifier>
- <dc:date>2000-12</dc:date>
+ <dc:date>2000-12-01T00:00:00.000Z</dc:date>
<dc:type>test</dc:type>