You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2010/11/10 16:57:34 UTC
svn commit: r1033546 - in /tika/trunk/tika-parsers/src:
main/java/org/apache/tika/parser/image/
main/java/org/apache/tika/parser/jpeg/
test/java/org/apache/tika/parser/image/
test/java/org/apache/tika/parser/image/xmp/
test/java/org/apache/tika/parser/...
Author: nick
Date: Wed Nov 10 15:57:34 2010
New Revision: 1033546
URL: http://svn.apache.org/viewvc?rev=1033546&view=rev
Log:
Improved extraction of EXIF and IPTC metadata from JPEG and TIFF Images (TIKA-482)
(Applies patch from Staffan Olsson from TIKA-482)
Added:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/MetadataFields.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/ImageMetadataExtractorTest.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/MetadataFieldsTest.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/xmp/
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/xmp/JempboxExtractorTest.java
tika/trunk/tika-parsers/src/test/resources/test-documents/testJPEG_commented_pspcs2mac.jpg (with props)
tika/trunk/tika-parsers/src/test/resources/test-documents/testJPEG_commented_xnviewmp026.jpg (with props)
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageMetadataExtractor.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/TiffParser.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/jpeg/JpegParser.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/TiffParserTest.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java
tika/trunk/tika-parsers/src/test/resources/test-documents/testTIFF.tif
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageMetadataExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageMetadataExtractor.java?rev=1033546&r1=1033545&r2=1033546&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageMetadataExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageMetadataExtractor.java Wed Nov 10 15:57:34 2010
@@ -41,303 +41,439 @@ import com.drew.lang.Rational;
import com.drew.metadata.Directory;
import com.drew.metadata.MetadataException;
import com.drew.metadata.Tag;
-
+import com.drew.metadata.exif.ExifDirectory;
+import com.drew.metadata.exif.GpsDirectory;
+import com.drew.metadata.iptc.IptcDirectory;
+import com.drew.metadata.jpeg.JpegCommentDirectory;
+import com.drew.metadata.jpeg.JpegDirectory;
+
+/**
+ * Uses the <a href="http://www.drewnoakes.com/code/exif/">Metadata Extractor</a> library
+ * to read EXIF and IPTC image metadata and map to Tika fields.
+ *
+ * As of 2.4.0 the library supports jpeg and tiff.
+ */
public class ImageMetadataExtractor {
private final Metadata metadata;
+ private DirectoryHandler[] handlers;
+ /**
+ * @param metadata to extract to, using default directory handlers
+ */
public ImageMetadataExtractor(Metadata metadata) {
+ this(metadata,
+ new CopyUnknownFieldsHandler(),
+ new JpegCommentHandler(),
+ new ExifHandler(),
+ new DimensionsHandler(),
+ new GeotagHandler(),
+ new IptcHandler()
+ );
+ }
+
+ /**
+ * @param metadata to extract to
+ * @param handlers handlers in order, note that handlers may override values from earlier handlers
+ */
+ public ImageMetadataExtractor(Metadata metadata, DirectoryHandler... handlers) {
this.metadata = metadata;
+ this.handlers = handlers;
}
- public void parseTiff(InputStream stream)
+ public void parseJpeg(InputStream stream)
throws IOException, SAXException, TikaException {
try {
- com.drew.metadata.Metadata tiffMetadata =
- TiffMetadataReader.readMetadata(stream);
- parse(tiffMetadata);
- } catch (TiffProcessingException e) {
- throw new TikaException("Can't read TIFF metadata", e);
+ com.drew.metadata.Metadata jpegMetadata =
+ JpegMetadataReader.readMetadata(stream);
+
+ handle(jpegMetadata);
+ } catch (JpegProcessingException e) {
+ throw new TikaException("Can't read JPEG metadata", e);
+ } catch (MetadataException e) {
+ throw new TikaException("Can't read JPEG metadata", e);
}
}
- public void parseJpeg(InputStream stream)
- throws IOException, SAXException, TikaException {
- try {
- com.drew.metadata.Metadata jpegMetadata =
- JpegMetadataReader.readMetadata(stream);
- parse(jpegMetadata);
- } catch (JpegProcessingException e) {
- throw new TikaException("Can't read JPEG metadata", e);
- }
- }
-
- protected void parse(com.drew.metadata.Metadata imageMetadata)
+ protected void parseTiff(InputStream stream)
throws IOException, SAXException, TikaException {
- try {
- Iterator<?> directories = imageMetadata.getDirectoryIterator();
- while (directories.hasNext()) {
- Directory directory = (Directory) directories.next();
- Iterator<?> tags = directory.getTagIterator();
+ try {
+ com.drew.metadata.Metadata tiffMetadata =
+ TiffMetadataReader.readMetadata(stream);
- while (tags.hasNext()) {
- Tag tag = (Tag)tags.next();
- metadata.set(tag.getTagName(), tag.getDescription());
- handleCommonImageTags(metadata, tag, directory);
- }
- handleGeoImageTags(metadata);
- }
- } catch (MetadataException e) {
- throw new TikaException("Can't read TIFF/JPEG metadata", e);
- }
+ handle(tiffMetadata);
+ } catch (TiffProcessingException e) {
+ throw new TikaException("Can't read TIFF metadata", e);
+ } catch (MetadataException e) {
+ throw new TikaException("Can't read TIFF metadata", e);
+ }
}
/**
- * Maps EXIF Geo Tags onto the Tika Geo metadata namespace.
- * Needs to be run at the end, because the GPS information
- * is spread across several EXIF tags.
+ * Copies extracted tags to tika metadata using registered handlers.
+ * @param metadataExtractor Tag directories from a Metadata Extractor "reader"
+ * @throws MetadataException This method does not handle exceptions from Metadata Extractor
*/
- public static void handleGeoImageTags(Metadata metadata) {
- String lat = metadata.get("GPS Latitude");
- String latNS = metadata.get("GPS Latitude Ref");
- if(lat != null) {
- Double latitude = parseHMS(lat);
- if(latitude != null) {
- if(latNS != null && latNS.equalsIgnoreCase("S") &&
- latitude > 0) {
- latitude *= -1;
- }
- metadata.set(Metadata.LATITUDE, LAT_LONG_FORMAT.format(latitude));
- }
- }
+ @SuppressWarnings("unchecked")
+ protected void handle(com.drew.metadata.Metadata metadataExtractor)
+ throws MetadataException {
+ handle(metadataExtractor.getDirectoryIterator());
+ }
- String lng = metadata.get("GPS Longitude");
- String lngEW = metadata.get("GPS Longitude Ref");
- if(lng != null) {
- Double longitude = parseHMS(lng);
- if(longitude != null) {
- if(lngEW != null && lngEW.equalsIgnoreCase("W") &&
- longitude > 0) {
- longitude *= -1;
+ /**
+ * Copies extracted tags to tika metadata using registered handlers.
+ * @param directories Metadata Extractor {@link com.drew.metadata.Directory} instances.
+ * @throws MetadataException This method does not handle exceptions from Metadata Extractor
+ */
+ protected void handle(Iterator<Directory> directories) throws MetadataException {
+ while (directories.hasNext()) {
+ Directory directory = directories.next();
+ for (int i = 0; i < handlers.length; i++) {
+ if (handlers[i].supports(directory.getClass())) {
+ handlers[i].handle(directory, metadata);
}
- metadata.set(Metadata.LONGITUDE, LAT_LONG_FORMAT.format(longitude));
}
}
}
- private static Double parseHMS(String hms) {
- Matcher m = HOURS_MINUTES_SECONDS.matcher(hms);
- if(m.matches()) {
- double value =
- Integer.parseInt(m.group(1)) +
- (Integer.parseInt(m.group(2))/60.0) +
- (Double.parseDouble(m.group(3))/60.0/60.0);
- return value;
- }
- return null;
- }
- private static final Pattern HOURS_MINUTES_SECONDS = Pattern.compile("(-?\\d+)\"(\\d+)'(\\d+\\.?\\d*)");
+
/**
- * The decimal format used for expressing latitudes and longitudes.
- * The basic geo vocabulary defined by W3C (@see {@link Geographic})
- * refers to the "float" type in XML Schema as the recommended format
- * for latitude and longitude values.
+ * Reads one or more types of Metadata Extractor fields.
*/
- private static final DecimalFormat LAT_LONG_FORMAT =
- new DecimalFormat("##0.0####", new DecimalFormatSymbols(Locale.US));
+ static interface DirectoryHandler {
+ /**
+ * @param directoryType A Metadata Extractor directory class
+ * @return true if the directory type is supported by this handler
+ */
+ boolean supports(Class<? extends Directory> directoryType);
+ /**
+ * @param directory extracted tags
+ * @param metadata current tika metadata
+ * @throws MetadataException typically field extraction error, aborts all further extraction
+ */
+ void handle(Directory directory, Metadata metadata)
+ throws MetadataException;
+ }
/**
- * We normally won't know what timezone our dates belong to
+ * Mimics the behavior from TIKA-314 of copying all extracted tags
+ * to tika metadata using field names from Metadata Extractor.
*/
- private static final SimpleDateFormat DATE_UNSPECIFIED_TZ =
- new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss");
- private static synchronized void handleDate(Property property, Metadata metadata, Tag tag, Directory directory) throws MetadataException {
- Date date = directory.getDate(tag.getTagType());
- String dateString = DATE_UNSPECIFIED_TZ.format(date);
- metadata.set(property, dateString);
- }
+ static class CopyAllFieldsHandler implements DirectoryHandler {
+ public boolean supports(Class<? extends Directory> directoryType) {
+ return true;
+ }
+ public void handle(Directory directory, Metadata metadata)
+ throws MetadataException {
+ Iterator<?> tags = directory.getTagIterator();
+ while (tags.hasNext()) {
+ Tag tag = (Tag) tags.next();
+ metadata.set(tag.getTagName(), tag.getDescription());
+ }
+ }
+ }
/**
- * Maps common TIFF and EXIF tags onto the Tika
- * TIFF image metadata namespace.
+ * Copies all fields regardless of directory, if the tag name
+ * is not identical to a known Metadata field name.
+ * This leads to more predictable behavior than {@link CopyAllFieldsHandler}.
*/
- public static void handleCommonImageTags(Metadata metadata, Tag tag, Directory directory) throws MetadataException {
- // Core tags
- if(tag.getTagName().equals("Date/Time") ||
- tag.getTagType() == 306) {
- handleDate(Metadata.DATE, metadata, tag, directory);
- metadata.set(Metadata.LAST_MODIFIED, metadata.get(Metadata.DATE));
- return;
- }
- if(tag.getTagName().equals("Date/Time Original") ||
- tag.getTagType() == 36867) {
- handleDate(Metadata.ORIGINAL_DATE, metadata, tag, directory);
- return;
- }
-
- if(tag.getTagName().equals("Exposure Time") ||
- tag.getTagType() == 33434) {
- Object exposure = directory.getObject(tag.getTagType());
- if(exposure instanceof Rational) {
- metadata.set(Metadata.EXPOSURE_TIME, ((Rational)exposure).doubleValue());
- } else {
- metadata.set(Metadata.EXPOSURE_TIME, tag.getDescription());
- }
- return;
- }
-
- if(tag.getTagName().equals("Flash") ||
- tag.getTagType() == 37385) {
- String flash = tag.getDescription();
- if(flash.indexOf("Flash fired") > -1) {
- metadata.set(Metadata.FLASH_FIRED, Boolean.TRUE.toString());
- }
- else if(flash.indexOf("Flash did not fire") > -1) {
- metadata.set(Metadata.FLASH_FIRED, Boolean.FALSE.toString());
- }
- else {
- metadata.set(Metadata.FLASH_FIRED, flash);
- }
- return;
- }
-
- if(tag.getTagName().equals("F-Number") ||
- tag.getTagType() == 33437) {
- Object fnumber = directory.getObject(tag.getTagType());
- if(fnumber instanceof Rational) {
- metadata.set(Metadata.F_NUMBER, ((Rational)fnumber).doubleValue());
- } else {
- metadata.set(Metadata.F_NUMBER, tag.getDescription());
- }
- return;
- }
-
- if(tag.getTagName().equals("Focal Length") ||
- tag.getTagType() == 37386) {
- Object length = directory.getObject(tag.getTagType());
- if(length instanceof Rational) {
- metadata.set(Metadata.FOCAL_LENGTH, ((Rational)length).doubleValue());
- } else {
- metadata.set(Metadata.FOCAL_LENGTH, tag.getDescription());
- }
- return;
+ static class CopyUnknownFieldsHandler implements DirectoryHandler {
+ public boolean supports(Class<? extends Directory> directoryType) {
+ return true;
+ }
+ public void handle(Directory directory, Metadata metadata)
+ throws MetadataException {
+ Iterator<?> tags = directory.getTagIterator();
+ while (tags.hasNext()) {
+ Tag tag = (Tag) tags.next();
+ String name = tag.getTagName();
+ if (!MetadataFields.isMetadataField(name)) {
+ metadata.set(name, tag.getDescription());
+ }
+ }
}
-
- if(tag.getTagName().equals("ISO Speed Ratings") ||
- tag.getTagType() == 34855) {
- metadata.set(Metadata.ISO_SPEED_RATINGS, tag.getDescription());
- return;
- }
-
- if(tag.getTagName().equals("Make") ||
- tag.getTagType() == 271) {
- metadata.set(Metadata.EQUIPMENT_MAKE, tag.getDescription());
- return;
- }
- if(tag.getTagName().equals("Model") ||
- tag.getTagType() == 272) {
- metadata.set(Metadata.EQUIPMENT_MODEL, tag.getDescription());
- return;
- }
-
- if(tag.getTagName().equals("Orientation") ||
- tag.getTagType() == 274) {
- Object length = directory.getObject(tag.getTagType());
- if(length instanceof Integer) {
- metadata.set(Metadata.ORIENTATION, Integer.toString( ((Integer)length).intValue() ));
- } else {
- metadata.set(Metadata.ORIENTATION, tag.getDescription());
- }
- return;
+ }
+
+ /**
+ * Basic image properties for TIFF and JPEG, at least.
+ */
+ static class DimensionsHandler implements DirectoryHandler {
+ private final Pattern LEADING_NUMBERS = Pattern.compile("(\\d+)\\s*.*");
+ public boolean supports(Class<? extends Directory> directoryType) {
+ return directoryType == JpegDirectory.class || directoryType == ExifDirectory.class;
+ }
+ public void handle(Directory directory, Metadata metadata) throws MetadataException {
+ // The test TIFF has width and height stored as follows according to exiv2
+ //Exif.Image.ImageWidth Short 1 100
+ //Exif.Image.ImageLength Short 1 75
+ // and the values are found in "Thumbnail Image Width" (and Height) from Metadata Extractor
+ set(directory, metadata, ExifDirectory.TAG_THUMBNAIL_IMAGE_WIDTH, Metadata.IMAGE_WIDTH);
+ set(directory, metadata, JpegDirectory.TAG_JPEG_IMAGE_WIDTH, Metadata.IMAGE_WIDTH);
+ set(directory, metadata, ExifDirectory.TAG_THUMBNAIL_IMAGE_HEIGHT, Metadata.IMAGE_LENGTH);
+ set(directory, metadata, JpegDirectory.TAG_JPEG_IMAGE_HEIGHT, Metadata.IMAGE_LENGTH);
+ // Bits per sample, two methods of extracting, exif overrides jpeg
+ set(directory, metadata, JpegDirectory.TAG_JPEG_DATA_PRECISION, Metadata.BITS_PER_SAMPLE);
+ set(directory, metadata, ExifDirectory.TAG_BITS_PER_SAMPLE, Metadata.BITS_PER_SAMPLE);
+ // Straightforward
+ set(directory, metadata, ExifDirectory.TAG_SAMPLES_PER_PIXEL, Metadata.SAMPLES_PER_PIXEL);
+ }
+ private void set(Directory directory, Metadata metadata, int extractTag, Property metadataField) {
+ if (directory.containsTag(extractTag)) {
+ Matcher m = LEADING_NUMBERS.matcher(directory.getString(extractTag));
+ if(m.matches()) {
+ metadata.set(metadataField, m.group(1));
+ }
+ }
}
-
- if(tag.getTagName().equals("Software") ||
- tag.getTagType() == 305) {
- metadata.set(Metadata.SOFTWARE, tag.getDescription());
- return;
- }
-
- if(tag.getTagName().equals("X Resolution") ||
- tag.getTagType() == 282) {
- Object resolution = directory.getObject(tag.getTagType());
- if(resolution instanceof Rational) {
- metadata.set(Metadata.RESOLUTION_HORIZONTAL, ((Rational)resolution).doubleValue());
- } else {
- metadata.set(Metadata.RESOLUTION_HORIZONTAL, tag.getDescription());
- }
- return;
+ }
+
+ static class JpegCommentHandler implements DirectoryHandler {
+ public boolean supports(Class<? extends Directory> directoryType) {
+ return directoryType == JpegCommentDirectory.class;
+ }
+ public void handle(Directory directory, Metadata metadata) throws MetadataException {
+ if (directory.containsTag(JpegCommentDirectory.TAG_JPEG_COMMENT)) {
+ metadata.add(Metadata.COMMENT, directory.getString(JpegCommentDirectory.TAG_JPEG_COMMENT));
+ }
}
- if(tag.getTagName().equals("Y Resolution") ||
- tag.getTagType() == 283) {
- Object resolution = directory.getObject(tag.getTagType());
- if(resolution instanceof Rational) {
- metadata.set(Metadata.RESOLUTION_VERTICAL, ((Rational)resolution).doubleValue());
- } else {
- metadata.set(Metadata.RESOLUTION_VERTICAL, tag.getDescription());
- }
- return;
+ }
+
+ static class ExifHandler implements DirectoryHandler {
+ private static final SimpleDateFormat DATE_UNSPECIFIED_TZ = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss");
+ public boolean supports(Class<? extends Directory> directoryType) {
+ return directoryType == ExifDirectory.class;
+ }
+ public void handle(Directory directory, Metadata metadata) {
+ try {
+ handleDateTags(directory, metadata);
+ handlePhotoTags(directory, metadata);
+ handleCommentTags(directory, metadata);
+ } catch (MetadataException e) {
+ // ignore date parse errors; note that the photo and comment tags of this directory are then skipped too
+ }
}
- if(tag.getTagName().equals("Resolution Unit") ||
- tag.getTagType() == 296) {
- metadata.set(Metadata.RESOLUTION_UNIT, tag.getDescription());
- return;
+ /**
+ * EXIF may contain image description, although with undefined encoding.
+ * Use IPTC for other annotation fields, and XMP for unicode support.
+ */
+ public void handleCommentTags(Directory directory, Metadata metadata) {
+ if (metadata.get(Metadata.DESCRIPTION) == null &&
+ directory.containsTag(ExifDirectory.TAG_IMAGE_DESCRIPTION)) {
+ metadata.set(Metadata.DESCRIPTION, directory.getString(ExifDirectory.TAG_IMAGE_DESCRIPTION));
+ }
}
+ /**
+ * Maps common TIFF and EXIF tags onto the Tika
+ * TIFF image metadata namespace.
+ */
+ public void handlePhotoTags(Directory directory, Metadata metadata) {
+ if(directory.containsTag(ExifDirectory.TAG_EXPOSURE_TIME)) {
+ Object exposure = directory.getObject(ExifDirectory.TAG_EXPOSURE_TIME);
+ if(exposure instanceof Rational) {
+ metadata.set(Metadata.EXPOSURE_TIME, ((Rational)exposure).doubleValue());
+ } else {
+ metadata.set(Metadata.EXPOSURE_TIME, directory.getString(ExifDirectory.TAG_EXPOSURE_TIME));
+ }
+ }
+
+ if(directory.containsTag(ExifDirectory.TAG_FLASH)) {
+ String flash = "";
+ try {
+ flash = directory.getDescription(ExifDirectory.TAG_FLASH);
+ } catch (MetadataException e) {
+ // ignore
+ }
+ if(flash.indexOf("Flash fired") > -1) {
+ metadata.set(Metadata.FLASH_FIRED, Boolean.TRUE.toString());
+ }
+ else if(flash.indexOf("Flash did not fire") > -1) {
+ metadata.set(Metadata.FLASH_FIRED, Boolean.FALSE.toString());
+ }
+ else {
+ metadata.set(Metadata.FLASH_FIRED, flash);
+ }
+ }
- if(tag.getTagName().equals("Keywords") ||
- tag.getTagType() == 537) {
- metadata.set(Metadata.KEYWORDS, tag.getDescription());
- return;
+ if(directory.containsTag(ExifDirectory.TAG_FNUMBER)) {
+ Object fnumber = directory.getObject(ExifDirectory.TAG_FNUMBER);
+ if(fnumber instanceof Rational) {
+ metadata.set(Metadata.F_NUMBER, ((Rational)fnumber).doubleValue());
+ } else {
+ metadata.set(Metadata.F_NUMBER, directory.getString(ExifDirectory.TAG_FNUMBER));
+ }
+ }
+
+ if(directory.containsTag(ExifDirectory.TAG_FOCAL_LENGTH)) {
+ Object length = directory.getObject(ExifDirectory.TAG_FOCAL_LENGTH);
+ if(length instanceof Rational) {
+ metadata.set(Metadata.FOCAL_LENGTH, ((Rational)length).doubleValue());
+ } else {
+ metadata.set(Metadata.FOCAL_LENGTH, directory.getString(ExifDirectory.TAG_FOCAL_LENGTH));
+ }
+ }
+
+ if(directory.containsTag(ExifDirectory.TAG_ISO_EQUIVALENT)) {
+ metadata.set(Metadata.ISO_SPEED_RATINGS, directory.getString(ExifDirectory.TAG_ISO_EQUIVALENT));
+ }
+
+ if(directory.containsTag(ExifDirectory.TAG_MAKE)) {
+ metadata.set(Metadata.EQUIPMENT_MAKE, directory.getString(ExifDirectory.TAG_MAKE));
+ }
+ if(directory.containsTag(ExifDirectory.TAG_MODEL)) {
+ metadata.set(Metadata.EQUIPMENT_MODEL, directory.getString(ExifDirectory.TAG_MODEL));
+ }
+
+ if(directory.containsTag(ExifDirectory.TAG_ORIENTATION)) {
+ Object length = directory.getObject(ExifDirectory.TAG_ORIENTATION);
+ if(length instanceof Integer) {
+ metadata.set(Metadata.ORIENTATION, Integer.toString( ((Integer)length).intValue() ));
+ } else {
+ metadata.set(Metadata.ORIENTATION, directory.getString(ExifDirectory.TAG_ORIENTATION));
+ }
+ }
+
+ if(directory.containsTag(ExifDirectory.TAG_SOFTWARE)) {
+ metadata.set(Metadata.SOFTWARE, directory.getString(ExifDirectory.TAG_SOFTWARE));
+ }
+
+ if(directory.containsTag(ExifDirectory.TAG_X_RESOLUTION)) {
+ Object resolution = directory.getObject(ExifDirectory.TAG_X_RESOLUTION);
+ if(resolution instanceof Rational) {
+ metadata.set(Metadata.RESOLUTION_HORIZONTAL, ((Rational)resolution).doubleValue());
+ } else {
+ metadata.set(Metadata.RESOLUTION_HORIZONTAL, directory.getString(ExifDirectory.TAG_X_RESOLUTION));
+ }
+ }
+ if(directory.containsTag(ExifDirectory.TAG_Y_RESOLUTION)) {
+ Object resolution = directory.getObject(ExifDirectory.TAG_Y_RESOLUTION);
+ if(resolution instanceof Rational) {
+ metadata.set(Metadata.RESOLUTION_VERTICAL, ((Rational)resolution).doubleValue());
+ } else {
+ metadata.set(Metadata.RESOLUTION_VERTICAL, directory.getString(ExifDirectory.TAG_Y_RESOLUTION));
+ }
+ }
+ if(directory.containsTag(ExifDirectory.TAG_RESOLUTION_UNIT)) {
+ try {
+ metadata.set(Metadata.RESOLUTION_UNIT, directory.getDescription(ExifDirectory.TAG_RESOLUTION_UNIT));
+ } catch (MetadataException e) {
+ // ignore
+ }
+ }
}
-
- if(tag.getTagName().equals("Jpeg Comment")) {
- metadata.set(Metadata.COMMENTS, tag.getDescription());
- return;
+ /**
+ * Maps exif dates to metadata fields.
+ */
+ public void handleDateTags(Directory directory, Metadata metadata)
+ throws MetadataException {
+ // Date/Time Original overrides value from ExifDirectory.TAG_DATETIME
+ Date original = null;
+ if (directory.containsTag(ExifDirectory.TAG_DATETIME_ORIGINAL)) {
+ original = directory.getDate(ExifDirectory.TAG_DATETIME_ORIGINAL);
+ // Unless we have GPS time we don't know the time zone so date must be set
+ // as ISO 8601 datetime without timezone suffix (no Z or +/-)
+ String datetimeNoTimeZone = DATE_UNSPECIFIED_TZ.format(original); // Same time zone as Metadata Extractor uses
+ metadata.set(Metadata.DATE, datetimeNoTimeZone);
+ metadata.set(Metadata.ORIGINAL_DATE, datetimeNoTimeZone);
+ }
+ if (directory.containsTag(ExifDirectory.TAG_DATETIME)) {
+ Date datetime = directory.getDate(ExifDirectory.TAG_DATETIME);
+ String datetimeNoTimeZone = DATE_UNSPECIFIED_TZ.format(datetime);
+ metadata.set(Metadata.LAST_MODIFIED, datetimeNoTimeZone);
+ // If Date/Time Original does not exist this might be creation date
+ if (original == null) {
+ metadata.set(Metadata.DATE, datetimeNoTimeZone);
+ }
+ }
}
-// System.err.println(directory.getObject(tag.getTagType()) + " " + directory.getObject(tag.getTagType()).getClass());
-
- // File info
- // Metadata Extractor does not read XMP so we need to use the values from Iptc or EXIF
- if("Iptc".equals(tag.getDirectoryName())) {
- if("Object Name".equals(tag.getTagName())) {
- metadata.set(Metadata.TITLE, tag.getDescription());
- return;
+ }
+
+ /**
+ * Reads image comments, originally TIKA-472.
+ * Metadata Extractor does not read XMP so we need to use the values from Iptc or EXIF
+ */
+ static class IptcHandler implements DirectoryHandler {
+ public boolean supports(Class<? extends Directory> directoryType) {
+ return directoryType == IptcDirectory.class;
+ }
+ public void handle(Directory directory, Metadata metadata)
+ throws MetadataException {
+ if (directory.containsTag(IptcDirectory.TAG_KEYWORDS)) {
+ String[] keywords = directory.getStringArray(IptcDirectory.TAG_KEYWORDS);
+ for (String k : keywords) {
+ metadata.add(Metadata.KEYWORDS, k);
+ }
+ }
+ if (directory.containsTag(IptcDirectory.TAG_HEADLINE)) {
+ metadata.set(Metadata.TITLE, directory.getString(IptcDirectory.TAG_HEADLINE));
+ } else if (directory.containsTag(IptcDirectory.TAG_OBJECT_NAME)) {
+ metadata.set(Metadata.TITLE, directory.getString(IptcDirectory.TAG_OBJECT_NAME));
}
- if("By-line".equals(tag.getTagName())) {
- metadata.set(Metadata.AUTHOR, tag.getDescription());
- return;
- }
- if("Caption/Abstract".equals(tag.getTagName())) {
- // Looks like metadata extractor returns IPTC newlines as a single carriage return,
- // but the exiv2 command does not so we change to line feed here because that is less surprising to users
- metadata.set(Metadata.DESCRIPTION, tag.getDescription().replaceAll("\r\n?", "\n"));
- return;
+ if (directory.containsTag(IptcDirectory.TAG_BY_LINE)) {
+ metadata.set(Metadata.AUTHOR, directory.getString(IptcDirectory.TAG_BY_LINE));
+ }
+ if (directory.containsTag(IptcDirectory.TAG_CAPTION)) {
+ metadata.set(Metadata.DESCRIPTION,
+ // Looks like metadata extractor returns IPTC newlines as a single carriage return,
+ // but the exiv2 command does not so we change to line feed here because that is less surprising to users
+ directory.getString(IptcDirectory.TAG_CAPTION).replaceAll("\r\n?", "\n"));
}
}
+ }
- // EXIF / TIFF Tags
- Property key = null;
- if(tag.getTagName().equals("Image Width") ||
- tag.getTagType() == 256) {
- key = Metadata.IMAGE_WIDTH;
- }
- if(tag.getTagName().equals("Image Height") ||
- tag.getTagType() == 257) {
- key = Metadata.IMAGE_LENGTH;
- }
- if(tag.getTagName().equals("Data Precision") ||
- tag.getTagName().equals("Bits Per Sample") ||
- tag.getTagType() == 258) {
- key = Metadata.BITS_PER_SAMPLE;
- }
- if(tag.getTagType() == 277) {
- key = Metadata.SAMPLES_PER_PIXEL;
- }
+ /**
+ * Maps EXIF Geo Tags onto the Tika Geo metadata namespace.
+ */
+ static class GeotagHandler implements DirectoryHandler {
+ public boolean supports(Class<? extends Directory> directoryType) {
+ return directoryType == GpsDirectory.class;
+ }
+ public void handle(Directory directory, Metadata metadata) throws MetadataException {
+ String lat = directory.getDescription(GpsDirectory.TAG_GPS_LATITUDE);
+ String latNS = directory.getDescription(GpsDirectory.TAG_GPS_LATITUDE_REF);
+ if(lat != null) {
+ Double latitude = parseHMS(lat);
+ if(latitude != null) {
+ if(latNS != null && latNS.equalsIgnoreCase("S") &&
+ latitude > 0) {
+ latitude *= -1;
+ }
+ metadata.set(Metadata.LATITUDE, LAT_LONG_FORMAT.format(latitude));
+ }
+ }
- if(key != null) {
- Matcher m = LEADING_NUMBERS.matcher(tag.getDescription());
- if(m.matches()) {
- metadata.set(key, m.group(1));
+ String lng = directory.getDescription(GpsDirectory.TAG_GPS_LONGITUDE);
+ String lngEW = directory.getDescription(GpsDirectory.TAG_GPS_LONGITUDE_REF);
+ if(lng != null) {
+ Double longitude = parseHMS(lng);
+ if(longitude != null) {
+ if(lngEW != null && lngEW.equalsIgnoreCase("W") &&
+ longitude > 0) {
+ longitude *= -1;
+ }
+ metadata.set(Metadata.LONGITUDE, LAT_LONG_FORMAT.format(longitude));
+ }
}
}
+ private Double parseHMS(String hms) {
+ Matcher m = HOURS_MINUTES_SECONDS.matcher(hms);
+ if(m.matches()) {
+ double value =
+ Integer.parseInt(m.group(1)) +
+ (Integer.parseInt(m.group(2))/60.0) +
+ (Double.parseDouble(m.group(3))/60.0/60.0);
+ return value;
+ }
+ return null;
+ }
+ private static final Pattern HOURS_MINUTES_SECONDS = Pattern.compile("(-?\\d+)\"(\\d+)'(\\d+\\.?\\d*)");
+ /**
+ * The decimal format used for expressing latitudes and longitudes.
+ * The basic geo vocabulary defined by W3C (@see {@link Geographic})
+ * refers to the "float" type in XML Schema as the recommended format
+ * for latitude and longitude values.
+ */
+ private static final DecimalFormat LAT_LONG_FORMAT =
+ new DecimalFormat("##0.0####", new DecimalFormatSymbols(Locale.US));
}
- private static final Pattern LEADING_NUMBERS = Pattern.compile("(\\d+)\\s*.*");
+
}
Added: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/MetadataFields.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/MetadataFields.java?rev=1033546&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/MetadataFields.java (added)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/MetadataFields.java Wed Nov 10 15:57:34 2010
@@ -0,0 +1,58 @@
+package org.apache.tika.parser.image;
+
+import java.lang.reflect.Field;
+import java.lang.reflect.Modifier;
+import java.util.HashSet;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+
+/**
+ * Knows about all declared {@link Metadata} fields.
+ * Didn't find this functionality anywhere so it was added for
+ * ImageMetadataExtractor, but it can be generalized.
+ */
+public abstract class MetadataFields {
+
+ private static HashSet<String> known;
+
+ static {
+ known = new HashSet<String>();
+ Field[] fields = Metadata.class.getFields();
+ for (Field f : fields) {
+ int mod = f.getModifiers();
+ if (Modifier.isPublic(mod) && Modifier.isStatic(mod) && Modifier.isFinal(mod)) {
+ Class<?> c = f.getType();
+ if (String.class.equals(c)) {
+ try {
+ String p = (String) f.get(null);
+ if (p != null) {
+ known.add(p);
+ }
+ } catch (IllegalArgumentException e) {
+ e.printStackTrace();
+ } catch (IllegalAccessException e) {
+ e.printStackTrace();
+ }
+ }
+ if (Property.class.isAssignableFrom(c)) {
+ try {
+ Property p = (Property) f.get(null);
+ if (p != null) {
+ known.add(p.getName());
+ }
+ } catch (IllegalArgumentException e) {
+ e.printStackTrace();
+ } catch (IllegalAccessException e) {
+ e.printStackTrace();
+ }
+ }
+ }
+ }
+ }
+
+ public static boolean isMetadataField(String name) {
+ return known.contains(name);
+ }
+
+}
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/TiffParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/TiffParser.java?rev=1033546&r1=1033545&r2=1033546&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/TiffParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/TiffParser.java Wed Nov 10 15:57:34 2010
@@ -16,6 +16,7 @@
*/
package org.apache.tika.parser.image;
+import java.io.FilterInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Collections;
@@ -26,6 +27,7 @@ import org.apache.tika.metadata.Metadata
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.image.xmp.JempboxExtractor;
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
@@ -52,13 +54,18 @@ public class TiffParser implements Parse
InputStream stream, ContentHandler handler,
Metadata metadata, ParseContext context)
throws IOException, SAXException, TikaException {
- new ImageMetadataExtractor(metadata).parseTiff(stream);
- for (String s : metadata.names()) {
- if (s.startsWith("Unknown tag")) {
- metadata.remove(s);
+ // read stream twice - exif and xmp extractors
+ stream.mark(Integer.MAX_VALUE);
+ FilterInputStream first = new FilterInputStream(stream) {
+ @Override
+ public void close() throws IOException {
}
- }
+ };
+ new ImageMetadataExtractor(metadata).parseTiff(first);
+ stream.reset();
+
+ new JempboxExtractor(metadata).parse(stream);
XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
xhtml.startDocument();
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/jpeg/JpegParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/jpeg/JpegParser.java?rev=1033546&r1=1033545&r2=1033546&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/jpeg/JpegParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/jpeg/JpegParser.java Wed Nov 10 15:57:34 2010
@@ -16,6 +16,7 @@
*/
package org.apache.tika.parser.jpeg;
+import java.io.FilterInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Collections;
@@ -27,6 +28,7 @@ import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.image.ImageMetadataExtractor;
+import org.apache.tika.parser.image.xmp.JempboxExtractor;
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
@@ -53,7 +55,18 @@ public class JpegParser implements Parse
InputStream stream, ContentHandler handler,
Metadata metadata, ParseContext context)
throws IOException, SAXException, TikaException {
- new ImageMetadataExtractor(metadata).parseJpeg(stream);
+
+ // read stream twice - exif and xmp extractors
+ stream.mark(Integer.MAX_VALUE);
+ FilterInputStream first = new FilterInputStream(stream) {
+ @Override
+ public void close() throws IOException {
+ }
+ };
+ new ImageMetadataExtractor(metadata).parseJpeg(first);
+ stream.reset();
+
+ new JempboxExtractor(metadata).parse(stream);
XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
xhtml.startDocument();
Added: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/ImageMetadataExtractorTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/ImageMetadataExtractorTest.java?rev=1033546&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/ImageMetadataExtractorTest.java (added)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/ImageMetadataExtractorTest.java Wed Nov 10 15:57:34 2010
@@ -0,0 +1,99 @@
+package org.apache.tika.parser.image;
+
+import java.util.Arrays;
+import java.util.GregorianCalendar;
+import java.util.Iterator;
+
+import org.apache.tika.metadata.DublinCore;
+import org.apache.tika.metadata.Metadata;
+
+import com.drew.metadata.Directory;
+import com.drew.metadata.MetadataException;
+import com.drew.metadata.Tag;
+import com.drew.metadata.exif.ExifDirectory;
+import com.drew.metadata.jpeg.JpegCommentDirectory;
+
+import junit.framework.TestCase;
+
+import static org.mockito.Mockito.*;
+
+public class ImageMetadataExtractorTest extends TestCase {
+ // Tests in this class cover ImageMetadataExtractor's dispatch of directories to handlers, plus ExifHandler and CopyUnknownFieldsHandler.
+ @SuppressWarnings({ "rawtypes", "unchecked" })
+ public void testHandleDirectories() throws MetadataException {
+ Metadata metadata = mock(Metadata.class);
+ ImageMetadataExtractor.DirectoryHandler handler1 = mock(ImageMetadataExtractor.DirectoryHandler.class);
+ ImageMetadataExtractor e = new ImageMetadataExtractor(metadata, handler1);
+ // The mocked iterator yields exactly one JpegCommentDirectory, which handler1 is stubbed to support.
+ Directory directory = new JpegCommentDirectory();
+ Iterator directories = mock(Iterator.class);
+ when(directories.hasNext()).thenReturn(true, false);
+ when(directories.next()).thenReturn(directory);
+ when(handler1.supports(JpegCommentDirectory.class)).thenReturn(true);
+ // The extractor must ask the handler whether it supports the directory type and then delegate the directory to it.
+ e.handle(directories);
+ verify(handler1).supports(JpegCommentDirectory.class);
+ verify(handler1).handle(directory, metadata);
+ }
+ /** ExifHandler must accept ExifDirectory, but neither the Directory base class nor JpegCommentDirectory. */
+ public void testExifHandlerSupports() {
+ assertTrue(new ImageMetadataExtractor.ExifHandler().supports(ExifDirectory.class));
+ assertFalse(new ImageMetadataExtractor.ExifHandler().supports(Directory.class));
+ assertFalse(new ImageMetadataExtractor.ExifHandler().supports(JpegCommentDirectory.class));
+ }
+ /** The TAG_DATETIME_ORIGINAL value is expected in DublinCore.DATE, formatted as an ISO date-time without a time zone. */
+ public void testExifHandlerParseDate() throws MetadataException {
+ ExifDirectory exif = mock(ExifDirectory.class);
+ when(exif.containsTag(ExifDirectory.TAG_DATETIME_ORIGINAL)).thenReturn(true);
+ when(exif.getDate(ExifDirectory.TAG_DATETIME_ORIGINAL)).thenReturn(
+ new GregorianCalendar(2000, 0, 1, 0, 0, 0).getTime()); // jvm default timezone as in Metadata Extractor
+ Metadata metadata = new Metadata();
+
+ new ImageMetadataExtractor.ExifHandler().handle(exif, metadata);
+ assertEquals("Should be ISO date without time zone", "2000-01-01T00:00:00", metadata.get(DublinCore.DATE));
+ }
+ /** When only TAG_DATETIME is stubbed as present, its value is expected in DublinCore.DATE instead. */
+ public void testExifHandlerParseDateFallback() throws MetadataException {
+ ExifDirectory exif = mock(ExifDirectory.class);
+ when(exif.containsTag(ExifDirectory.TAG_DATETIME)).thenReturn(true);
+ when(exif.getDate(ExifDirectory.TAG_DATETIME)).thenReturn(
+ new GregorianCalendar(1999, 0, 1, 0, 0, 0).getTime()); // jvm default timezone as in Metadata Extractor
+ Metadata metadata = new Metadata();
+
+ new ImageMetadataExtractor.ExifHandler().handle(exif, metadata);
+ assertEquals("Should try EXIF Date/Time if Original is not set", "1999-01-01T00:00:00", metadata.get(DublinCore.DATE));
+ }
+ /** A MetadataException thrown by getDate() must not abort handling; DublinCore.DATE is expected to stay unset. */
+ public void testExifHandlerParseDateError() throws MetadataException {
+ ExifDirectory exif = mock(ExifDirectory.class);
+ when(exif.containsTag(ExifDirectory.TAG_DATETIME_ORIGINAL)).thenReturn(true);
+ when(exif.getDate(ExifDirectory.TAG_DATETIME_ORIGINAL)).thenThrow(
+ new MetadataException("Tag 'X' cannot be cast to a java.util.Date."));
+ Metadata metadata = new Metadata();
+
+ new ImageMetadataExtractor.ExifHandler().handle(exif, metadata);
+ assertEquals("Parsing should proceed without date", null, metadata.get(DublinCore.DATE));
+ }
+ /** Tags named like Metadata.KEYWORDS or Metadata.DESCRIPTION must be skipped, while "Image Description" is copied under its own name. */
+ public void testCopyUnknownFieldsHandler() throws MetadataException {
+ Directory d = mock(Directory.class);
+ Tag t1 = mock(Tag.class);
+ when(t1.getTagName()).thenReturn("Image Description");
+ when(t1.getDescription()).thenReturn("t1");
+ Tag t2 = mock(Tag.class);
+ when(t2.getTagName()).thenReturn(Metadata.KEYWORDS);
+ when(t2.getDescription()).thenReturn("known");
+ Tag t3 = mock(Tag.class);
+ when(t3.getTagName()).thenReturn(Metadata.DESCRIPTION);
+ when(t3.getDescription()).thenReturn("known");
+ Iterator<Tag> tags = Arrays.asList(t1, t2, t3).iterator();
+ when(d.getTagIterator()).thenReturn(tags);
+ Metadata metadata = new Metadata();
+ new ImageMetadataExtractor.CopyUnknownFieldsHandler().handle(d, metadata);
+ assertEquals("t1", metadata.get("Image Description"));
+ assertNull("keywords should be excluded from bulk copy because it is a defined field",
+ metadata.get(Metadata.KEYWORDS));
+ assertNull(metadata.get(Metadata.DESCRIPTION));
+ }
+
+}
Added: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/MetadataFieldsTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/MetadataFieldsTest.java?rev=1033546&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/MetadataFieldsTest.java (added)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/MetadataFieldsTest.java Wed Nov 10 15:57:34 2010
@@ -0,0 +1,17 @@
+package org.apache.tika.parser.image;
+
+import org.apache.tika.metadata.DublinCore;
+import org.apache.tika.metadata.TIFF;
+
+import junit.framework.TestCase;
+
+public class MetadataFieldsTest extends TestCase {
+ /** isMetadataField() must reject arbitrary strings and accept DublinCore.SUBJECT as well as the name of TIFF.F_NUMBER. */
+ public void testIsMetadataField() {
+ assertFalse(MetadataFields.isMetadataField("random string that is not a field"));
+ assertFalse(MetadataFields.isMetadataField("xyz"));
+ assertTrue(MetadataFields.isMetadataField(DublinCore.SUBJECT));
+ assertTrue(MetadataFields.isMetadataField(TIFF.F_NUMBER.getName()));
+ }
+
+}
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/TiffParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/TiffParserTest.java?rev=1033546&r1=1033545&r2=1033546&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/TiffParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/TiffParserTest.java Wed Nov 10 15:57:34 2010
@@ -23,6 +23,8 @@ import org.apache.tika.metadata.Metadata
import org.xml.sax.helpers.DefaultHandler;
import java.io.InputStream;
+import java.util.Arrays;
+import java.util.List;
public class TiffParserTest extends TestCase {
private final Parser parser = new TiffParser();
@@ -37,15 +39,20 @@ public class TiffParserTest extends Test
assertEquals("Licensed to the Apache Software Foundation (ASF) under one or " +
"more contributor license agreements. See the NOTICE file " +
"distributed with this work for additional information regarding " +
- "copyright ownership.", metadata.get("Image Description"));
+ "copyright ownership.", metadata.get(Metadata.DESCRIPTION));
// All EXIF/TIFF tags
- assertEquals("Inch", metadata.get("Resolution Unit"));
+ assertEquals("Inch", metadata.get(Metadata.RESOLUTION_UNIT));
// Core EXIF/TIFF tags
assertEquals("100", metadata.get(Metadata.IMAGE_WIDTH));
assertEquals("75", metadata.get(Metadata.IMAGE_LENGTH));
assertEquals("8", metadata.get(Metadata.BITS_PER_SAMPLE));
assertEquals("3", metadata.get(Metadata.SAMPLES_PER_PIXEL));
+
+ // Embedded XMP
+ List<String> subject = Arrays.asList(metadata.getValues(Metadata.SUBJECT));
+ assertTrue("got " + subject, subject.contains("cat"));
+ assertTrue("got " + subject, subject.contains("garden"));
}
}
Added: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/xmp/JempboxExtractorTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/xmp/JempboxExtractorTest.java?rev=1033546&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/xmp/JempboxExtractorTest.java (added)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/xmp/JempboxExtractorTest.java Wed Nov 10 15:57:34 2010
@@ -0,0 +1,79 @@
+package org.apache.tika.parser.image.xmp;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.Collection;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.image.xmp.JempboxExtractor;
+
+import junit.framework.TestCase;
+
+public class JempboxExtractorTest extends TestCase {
+ /** testJPEG_commented.jpg: pre-set TITLE, DESCRIPTION and CREATOR are replaced, while multi-value SUBJECT keeps its earlier entry. */
+ public void testParseJpeg() throws IOException, TikaException {
+ Metadata metadata = new Metadata();
+ InputStream stream = getClass().getResourceAsStream("/test-documents/testJPEG_commented.jpg");
+ // set some values before extraction to see that they are overridden
+ metadata.set(Metadata.TITLE, "old title");
+ metadata.set(Metadata.DESCRIPTION, "old description");
+ metadata.set(Metadata.CREATOR, "previous author");
+ // ... or kept in case the field is multi-value
+ metadata.add(Metadata.SUBJECT, "oldkeyword");
+ // Run the extractor directly on the raw JPEG stream; results are written into the Metadata passed to the constructor.
+ JempboxExtractor extractor = new JempboxExtractor(metadata);
+ extractor.parse(stream);
+
+ // DublinCore fields
+ assertEquals("Tosteberga \u00C4ngar", metadata.get(Metadata.TITLE));
+ assertEquals("Bird site in north eastern Sk\u00E5ne, Sweden.\n(new line)", metadata.get(Metadata.DESCRIPTION));
+ assertEquals("Some Tourist", metadata.get(Metadata.CREATOR));
+ Collection<String> keywords = Arrays.asList(metadata.getValues(Metadata.SUBJECT));
+ assertTrue(keywords.contains("oldkeyword"));
+ assertTrue(keywords.contains("grazelands"));
+ assertTrue(keywords.contains("nature reserve"));
+ assertTrue(keywords.contains("bird watching"));
+ assertTrue(keywords.contains("coast"));
+ }
+ /** Title, description, creator and keywords from testJPEG_commented_pspcs2mac.jpg, starting from an empty Metadata. */
+ public void testParseJpegPhotoshop() throws IOException, TikaException {
+ Metadata metadata = new Metadata();
+ InputStream stream = getClass().getResourceAsStream("/test-documents/testJPEG_commented_pspcs2mac.jpg");
+
+ JempboxExtractor extractor = new JempboxExtractor(metadata);
+ extractor.parse(stream);
+
+ // DublinCore fields
+ assertEquals("Tosteberga \u00C4ngar", metadata.get(Metadata.TITLE));
+ assertEquals("Bird site in north eastern Sk\u00E5ne, Sweden.\n(new line)", metadata.get(Metadata.DESCRIPTION));
+ assertEquals("Some Tourist", metadata.get(Metadata.CREATOR));
+ Collection<String> keywords = Arrays.asList(metadata.getValues(Metadata.SUBJECT));
+ assertTrue(keywords.contains("bird watching"));
+ assertTrue(keywords.contains("coast"));
+ }
+ /** testJPEG_commented_xnviewmp026.jpg: only the description and subject keywords are asserted (no title or creator checks). */
+ public void testParseJpegXnviewmp() throws IOException, TikaException {
+ Metadata metadata = new Metadata();
+ InputStream stream = getClass().getResourceAsStream("/test-documents/testJPEG_commented_xnviewmp026.jpg");
+
+ JempboxExtractor extractor = new JempboxExtractor(metadata);
+ extractor.parse(stream);
+
+ // XnViewMp fields not understood by Jempbox
+ assertEquals("Bird site in north eastern Sk\u00E5ne, Sweden.\n(new line)", metadata.get(Metadata.DESCRIPTION));
+ Collection<String> keywords = Arrays.asList(metadata.getValues(Metadata.SUBJECT));
+ assertTrue(keywords.contains("coast"));
+ assertTrue(keywords.contains("nature reserve"));
+ }
+ /** joinCreators() must return a single name unchanged and join several names with ", " in list order. */
+ public void testJoinCreators() {
+ assertEquals("Mr B", new JempboxExtractor(null).joinCreators(
+ Arrays.asList("Mr B")));
+ // TODO use multi-value property instead?
+ assertEquals("Mr B, Mr A", new JempboxExtractor(null).joinCreators(
+ Arrays.asList("Mr B", "Mr A")));
+ }
+
+}
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java?rev=1033546&r1=1033545&r2=1033546&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java Wed Nov 10 15:57:34 2010
@@ -16,15 +16,18 @@
*/
package org.apache.tika.parser.jpeg;
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.List;
+
import junit.framework.TestCase;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TIFF;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
-import org.apache.tika.metadata.Metadata;
import org.xml.sax.helpers.DefaultHandler;
-import java.io.InputStream;
-
public class JpegParserTest extends TestCase {
private final Parser parser = new JpegParser();
@@ -35,9 +38,6 @@ public class JpegParserTest extends Test
getClass().getResourceAsStream("/test-documents/testJPEG_EXIF.jpg");
parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
- // All EXIF/TIFF tags
- assertEquals("Canon EOS 40D", metadata.get("Model"));
-
// Core EXIF/TIFF tags
assertEquals("100", metadata.get(Metadata.IMAGE_WIDTH));
assertEquals("68", metadata.get(Metadata.IMAGE_LENGTH));
@@ -57,12 +57,20 @@ public class JpegParserTest extends Test
assertEquals("240.0", metadata.get(Metadata.RESOLUTION_VERTICAL));
assertEquals("Inch", metadata.get(Metadata.RESOLUTION_UNIT));
+ // Check that EXIF/TIFF tags come through with their raw values too
+ // (This may be removed for Tika 1.0, as we support more of them
+ // with explicit Metadata entries)
+ assertEquals("Canon EOS 40D", metadata.get("Model"));
+
// Common tags
- assertEquals("Date/Time for when the photo was taken, unspecified time zone",
- "2009-10-02T23:02:49", metadata.get(Metadata.DATE));
+ //assertEquals("2009-10-02T23:02:49", metadata.get(Metadata.LAST_MODIFIED));
assertEquals("Date/Time Original for when the photo was taken, unspecified time zone",
- "2009-08-11T09:09:45", metadata.get(Metadata.ORIGINAL_DATE));
- assertEquals("canon-55-250 moscow-birds serbor", metadata.get(Metadata.KEYWORDS));
+ "2009-08-11T09:09:45", metadata.get(Metadata.DATE));
+ List<String> keywords = Arrays.asList(metadata.getValues(Metadata.SUBJECT));
+ assertTrue("'canon-55-250' expected in " + keywords, keywords.contains("canon-55-250"));
+ assertTrue("'moscow-birds' expected in " + keywords, keywords.contains("moscow-birds"));
+ assertTrue("'serbor' expected in " + keywords, keywords.contains("serbor"));
+ assertFalse(keywords.contains("canon-55-250 moscow-birds serbor"));
}
public void testJPEGGeo() throws Exception {
@@ -75,9 +83,6 @@ public class JpegParserTest extends Test
// Geo tags
assertEquals("12.54321", metadata.get(Metadata.LATITUDE));
assertEquals("-54.1234", metadata.get(Metadata.LONGITUDE));
-
- // All EXIF/TIFF tags
- assertEquals("Canon EOS 40D", metadata.get("Model"));
// Core EXIF/TIFF tags
assertEquals("100", metadata.get(Metadata.IMAGE_WIDTH));
@@ -100,10 +105,12 @@ public class JpegParserTest extends Test
// Common tags
assertEquals("Date/Time Original for when the photo was taken, unspecified time zone",
- "2009-08-11T09:09:45", metadata.get(Metadata.ORIGINAL_DATE));
+ "2009-08-11T09:09:45", metadata.get(Metadata.DATE));
assertEquals("This image has different Date/Time than Date/Time Original, so it is probably modification date",
- "2009-10-02T23:02:49", metadata.get(Metadata.DATE));
- assertEquals("canon-55-250 moscow-birds serbor", metadata.get(Metadata.KEYWORDS));
+ "2009-10-02T23:02:49", metadata.get(Metadata.LAST_MODIFIED));
+ assertEquals("Date/Time Original should be stored in EXIF field too",
+ "2009-08-11T09:09:45", metadata.get(TIFF.ORIGINAL_DATE));
+ assertEquals("canon-55-250", metadata.getValues(Metadata.KEYWORDS)[0]);
}
public void testJPEGTitleAndDescription() throws Exception {
@@ -114,13 +121,17 @@ public class JpegParserTest extends Test
parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
// embedded comments with non-ascii characters
- //assertEquals("Tosteberga \u00C4ngar", metadata.get(Metadata.TITLE));
- assertEquals("Tosteberga " + new String(new byte[]{-61, -124}) + "ngar", metadata.get(Metadata.TITLE));
- //assertEquals("Bird site in north eastern Sk\u00E5ne, Sweden.\n(new line)", metadata.get(Metadata.DESCRIPTION));
- assertEquals("Bird site in north eastern Sk" + new String(new byte[]{-61, -91}) +
- "ne, Sweden.\n(new line)", metadata.get(Metadata.DESCRIPTION));
+ assertEquals("Tosteberga \u00C4ngar", metadata.get(Metadata.TITLE));
+ assertEquals("Bird site in north eastern Sk\u00E5ne, Sweden.\n(new line)", metadata.get(Metadata.DESCRIPTION));
assertEquals("Some Tourist", metadata.get(Metadata.AUTHOR));
- assertEquals("grazelands nature reserve bird watching coast", metadata.get(Metadata.KEYWORDS));
+ assertEquals("Some Tourist", metadata.get(Metadata.CREATOR)); // Dublin Core
+ // xmp handles spaces in keywords, returns "bird watching, nature reserve, coast, grazelands"
+ // but we have to replace them with underscore
+
+ List<String> keywords = Arrays.asList(metadata.getValues(Metadata.KEYWORDS));
+ assertTrue(keywords.contains("coast"));
+ assertTrue(keywords.contains("bird watching"));
+ assertEquals(keywords, Arrays.asList(metadata.getValues(Metadata.SUBJECT)));
// Core EXIF/TIFF tags
assertEquals("103", metadata.get(Metadata.IMAGE_WIDTH));
@@ -138,6 +149,39 @@ public class JpegParserTest extends Test
assertEquals("1", metadata.get(Metadata.ORIENTATION)); // Not present
assertEquals("300.0", metadata.get(Metadata.RESOLUTION_HORIZONTAL));
assertEquals("300.0", metadata.get(Metadata.RESOLUTION_VERTICAL));
- assertEquals("Inch", metadata.get(Metadata.RESOLUTION_UNIT));
+ assertEquals("Inch", metadata.get(Metadata.RESOLUTION_UNIT));
+ }
+
+ public void testJPEGTitleAndDescriptionPhotoshop() throws Exception {
+ Metadata metadata = new Metadata();
+ metadata.set(Metadata.CONTENT_TYPE, "image/jpeg");
+ InputStream stream =
+ getClass().getResourceAsStream("/test-documents/testJPEG_commented_pspcs2mac.jpg");
+ parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
+ // Full parser run on the pspcs2mac sample; before parsing, only the content type was set on the metadata.
+ // embedded comments with non-ascii characters
+ assertEquals("Tosteberga \u00C4ngar", metadata.get(Metadata.TITLE));
+ assertEquals("Bird site in north eastern Sk\u00E5ne, Sweden.\n(new line)", metadata.get(Metadata.DESCRIPTION));
+ assertEquals("Some Tourist", metadata.get(Metadata.CREATOR));
+ List<String> subject = Arrays.asList(metadata.getValues(Metadata.SUBJECT));
+ assertTrue("got " + subject, subject.contains("bird watching"));
+ }
+
+ public void testJPEGTitleAndDescriptionXnviewmp() throws Exception {
+ Metadata metadata = new Metadata();
+ metadata.set(Metadata.CONTENT_TYPE, "image/jpeg");
+ InputStream stream =
+ getClass().getResourceAsStream("/test-documents/testJPEG_commented_xnviewmp026.jpg");
+ parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
+ // Full parser run on the xnviewmp026 sample; only the description and subject keywords are checked below.
+ // XnViewMp's default comment dialog has only comment, not headline.
+ // Comment is embedded only if "Write comments in XMP" is enabled in settings
+ assertEquals("Bird site in north eastern Sk\u00E5ne, Sweden.\n(new line)", metadata.get(Metadata.DESCRIPTION));
+ // xmp handles spaces in keywords, returns "bird watching, nature reserve, coast, grazelands"
+ // but we have to replace them with underscore -- NOTE(review): the assertions below expect the space in 'nature reserve' to be preserved; confirm whether this remark is stale
+ String[] subject = metadata.getValues(Metadata.SUBJECT);
+ List<String> keywords = Arrays.asList(subject);
+ assertTrue("'coast'" + " not in " + keywords, keywords.contains("coast"));
+ assertTrue("'nature reserve'" + " not in " + keywords, keywords.contains("nature reserve"));
}
}
Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testJPEG_commented_pspcs2mac.jpg
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testJPEG_commented_pspcs2mac.jpg?rev=1033546&view=auto
==============================================================================
Binary file - no diff available.
Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testJPEG_commented_pspcs2mac.jpg
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testJPEG_commented_xnviewmp026.jpg
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testJPEG_commented_xnviewmp026.jpg?rev=1033546&view=auto
==============================================================================
Binary file - no diff available.
Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testJPEG_commented_xnviewmp026.jpg
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Modified: tika/trunk/tika-parsers/src/test/resources/test-documents/testTIFF.tif
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testTIFF.tif?rev=1033546&r1=1033545&r2=1033546&view=diff
==============================================================================
Binary files - no diff available.