You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2010/06/28 15:59:09 UTC
svn commit: r958581 - in /tika/trunk:
tika-core/src/main/java/org/apache/tika/metadata/
tika-parsers/src/main/java/org/apache/tika/parser/image/
tika-parsers/src/main/java/org/apache/tika/parser/jpeg/
tika-parsers/src/test/java/org/apache/tika/parser/image/
tika-parsers/src/test/java/org/apache/tika/parser/jpeg/
Author: nick
Date: Mon Jun 28 13:59:08 2010
New Revision: 958581
URL: http://svn.apache.org/viewvc?rev=958581&view=rev
Log:
Use the new TIFF Metadata entries for image width/length/sampling from the TIFF, JPEG and general Image (ImageIO) parsers. This gives a small number of consistent image-related metadata entries across all formats. (TIKA-442)
Modified:
tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/Metadata.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageParser.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/TiffExtractor.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/jpeg/JpegExtractor.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/ImageParserTest.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/TiffParserTest.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java
Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/Metadata.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/Metadata.java?rev=958581&r1=958580&r2=958581&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/Metadata.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/Metadata.java Mon Jun 28 13:59:08 2010
@@ -25,7 +25,7 @@ import java.util.Properties;
* A multi-valued metadata container.
*/
public class Metadata implements CreativeCommons, DublinCore, HttpHeaders,
- Message, MSOffice, ClimateForcast, TikaMetadataKeys, TikaMimeKeys {
+ Message, MSOffice, ClimateForcast, TIFF, TikaMetadataKeys, TikaMimeKeys {
/**
* A map of all metadata attributes.
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageParser.java?rev=958581&r1=958580&r2=958581&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/ImageParser.java Mon Jun 28 13:59:08 2010
@@ -32,6 +32,7 @@ import javax.imageio.metadata.IIOMetadat
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.CloseShieldInputStream;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
@@ -70,6 +71,9 @@ public class ImageParser implements Pars
ImageReader reader = iterator.next();
reader.setInput(ImageIO.createImageInputStream(
new CloseShieldInputStream(stream)));
+
+ metadata.set(Metadata.IMAGE_WIDTH, Integer.toString(reader.getWidth(0)));
+ metadata.set(Metadata.IMAGE_LENGTH, Integer.toString(reader.getHeight(0)));
metadata.set("height", Integer.toString(reader.getHeight(0)));
metadata.set("width", Integer.toString(reader.getWidth(0)));
@@ -77,6 +81,12 @@ public class ImageParser implements Pars
reader.dispose();
}
+
+ // Translate certain Metadata tags from the ImageIO
+ // specific namespace into the general Tika one
+ setIfPresent(metadata, "CommentExtensions CommentExtension", Metadata.COMMENTS);
+ setIfPresent(metadata, "markerSequence com", Metadata.COMMENTS);
+ setIfPresent(metadata, "Data BitsPerSample", Metadata.BITS_PER_SAMPLE);
} catch (IIOException e) {
throw new TikaException(type + " parse error", e);
}
@@ -95,6 +105,21 @@ public class ImageParser implements Pars
throws IOException, SAXException, TikaException {
parse(stream, handler, metadata, new ParseContext());
}
+
+ private static void setIfPresent(Metadata metadata, String imageIOkey, String tikaKey) {
+ if(metadata.get(imageIOkey) != null) {
+ metadata.set(tikaKey, metadata.get(imageIOkey));
+ }
+ }
+ private static void setIfPresent(Metadata metadata, String imageIOkey, Property tikaProp) {
+ if(metadata.get(imageIOkey) != null) {
+ String v = metadata.get(imageIOkey);
+ if(v.endsWith(" ")) {
+ v = v.substring(0, v.lastIndexOf(' '));
+ }
+ metadata.set(tikaProp, v);
+ }
+ }
private static void loadMetadata(IIOMetadata imageMetadata, Metadata metadata) {
String[] names = imageMetadata.getMetadataFormatNames();
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/TiffExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/TiffExtractor.java?rev=958581&r1=958580&r2=958581&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/TiffExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/TiffExtractor.java Mon Jun 28 13:59:08 2010
@@ -19,9 +19,12 @@ package org.apache.tika.parser.image;
import java.io.IOException;
import java.io.InputStream;
import java.util.Iterator;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
import org.xml.sax.SAXException;
import com.drew.imaging.tiff.TiffMetadataReader;
@@ -30,15 +33,15 @@ import com.drew.metadata.Directory;
import com.drew.metadata.MetadataException;
import com.drew.metadata.Tag;
-class TiffExtractor {
+public class TiffExtractor {
private final Metadata metadata;
- public TiffExtractor(Metadata metadata) {
+ protected TiffExtractor(Metadata metadata) {
this.metadata = metadata;
}
- public void parse(InputStream stream)
+ protected void parse(InputStream stream)
throws IOException, SAXException, TikaException {
try {
com.drew.metadata.Metadata tiffMetadata =
@@ -52,6 +55,7 @@ class TiffExtractor {
while (tags.hasNext()) {
Tag tag = (Tag)tags.next();
metadata.set(tag.getTagName(), tag.getDescription());
+ handleCommonImageTags(metadata, tag);
}
}
} catch (TiffProcessingException e) {
@@ -61,4 +65,55 @@ class TiffExtractor {
}
}
+
+ /**
+ * Maps common TIFF and EXIF tags onto the Tika
+ * TIFF image metadata namespace.
+ */
+ public static void handleCommonImageTags(Metadata metadata, Tag tag) throws MetadataException {
+ // Core tags
+ if(tag.getTagName().equals("Date/Time") ||
+ tag.getTagType() == 306) {
+ // Ensure it's in the right format
+ String date = tag.getDescription();
+ int splitAt = date.indexOf(' ');
+ if(splitAt > -1) {
+ date = date.substring(0, splitAt).replace(':', '/') +
+ date.substring(splitAt);
+ }
+ metadata.set(Metadata.DATE, date);
+ return;
+ }
+ if(tag.getTagName().equals("Keywords") ||
+ tag.getTagType() == 537) {
+ metadata.set(Metadata.KEYWORDS, tag.getDescription());
+ }
+
+ // EXIF / TIFF Tags
+ Property key = null;
+ if(tag.getTagName().equals("Image Width") ||
+ tag.getTagType() == 256) {
+ key = Metadata.IMAGE_WIDTH;
+ }
+ if(tag.getTagName().equals("Image Height") ||
+ tag.getTagType() == 257) {
+ key = Metadata.IMAGE_LENGTH;
+ }
+ if(tag.getTagName().equals("Data Precision") ||
+ tag.getTagName().equals("Bits Per Sample") ||
+ tag.getTagType() == 258) {
+ key = Metadata.BITS_PER_SAMPLE;
+ }
+ if(tag.getTagType() == 277) {
+ key = Metadata.SAMPLES_PER_PIXEL;
+ }
+
+ if(key != null) {
+ Matcher m = LEADING_NUMBERS.matcher(tag.getDescription());
+ if(m.matches()) {
+ metadata.set(key, m.group(1));
+ }
+ }
+ }
+ private static final Pattern LEADING_NUMBERS = Pattern.compile("(\\d+)\\s*.*");
}
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/jpeg/JpegExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/jpeg/JpegExtractor.java?rev=958581&r1=958580&r2=958581&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/jpeg/JpegExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/jpeg/JpegExtractor.java Mon Jun 28 13:59:08 2010
@@ -22,6 +22,7 @@ import java.util.Iterator;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.image.TiffExtractor;
import org.xml.sax.SAXException;
import com.drew.imaging.jpeg.JpegMetadataReader;
@@ -52,6 +53,7 @@ class JpegExtractor {
while (tags.hasNext()) {
Tag tag = (Tag)tags.next();
metadata.set(tag.getTagName(), tag.getDescription());
+ TiffExtractor.handleCommonImageTags(metadata, tag);
}
}
} catch (JpegProcessingException e) {
@@ -60,5 +62,4 @@ class JpegExtractor {
throw new TikaException("Can't read JPEG metadata", e);
}
}
-
}
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/ImageParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/ImageParserTest.java?rev=958581&r1=958580&r2=958581&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/ImageParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/ImageParserTest.java Mon Jun 28 13:59:08 2010
@@ -43,6 +43,10 @@ public class ImageParserTest extends Tes
assertEquals("0", metadata.get("Dimension HorizontalPhysicalPixelSpacing"));
assertEquals("BI_RGB", metadata.get("Compression CompressionTypeName"));
assertEquals("image/bmp", metadata.get("Content-Type"));
+
+ assertEquals("100", metadata.get(Metadata.IMAGE_WIDTH));
+ assertEquals("75", metadata.get(Metadata.IMAGE_LENGTH));
+ assertEquals("8 8 8", metadata.get(Metadata.BITS_PER_SAMPLE));
}
public void testGIF() throws Exception {
@@ -69,6 +73,10 @@ public class ImageParserTest extends Tes
assertEquals("disposalMethod=none, userInputFlag=false, transparentColorFlag=false, delayTime=0, transparentColorIndex=0", metadata.get("GraphicControlExtension"));
assertEquals("0", metadata.get("Dimension VerticalPixelOffset"));
assertEquals("image/gif", metadata.get("Content-Type"));
+
+ assertEquals("100", metadata.get(Metadata.IMAGE_WIDTH));
+ assertEquals("75", metadata.get(Metadata.IMAGE_LENGTH));
+ assertEquals("Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the NOTICE file distributed with this work for additional information regarding copyright ownership.", metadata.get(Metadata.COMMENTS));
}
public void testJPEG() throws Exception {
@@ -100,6 +108,10 @@ public class ImageParserTest extends Tes
assertEquals("keyword=comment, value=Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the NOTICE file distributed with this work for additional information regarding copyright ownership.", metadata.get("Text TextEntry"));
assertEquals("image/jpeg", metadata.get("Content-Type"));
assertEquals("process=0, samplePrecision=8, numLines=75, samplesPerLine=100, numFrameComponents=3", metadata.get("markerSequence sof"));
+
+ assertEquals("100", metadata.get(Metadata.IMAGE_WIDTH));
+ assertEquals("75", metadata.get(Metadata.IMAGE_LENGTH));
+ assertEquals("Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the NOTICE file distributed with this work for additional information regarding copyright ownership.", metadata.get(Metadata.COMMENTS));
}
public void testPNG() throws Exception {
@@ -133,6 +145,10 @@ public class ImageParserTest extends Tes
assertEquals("true", metadata.get("Chroma BlackIsZero"));
assertEquals("year=2008, month=5, day=6, hour=6, minute=18, second=47", metadata.get("Document ImageModificationTime"));
assertEquals("image/png", metadata.get("Content-Type"));
+
+ assertEquals("100", metadata.get(Metadata.IMAGE_WIDTH));
+ assertEquals("75", metadata.get(Metadata.IMAGE_LENGTH));
+ assertEquals("8 8 8", metadata.get(Metadata.BITS_PER_SAMPLE));
}
// TODO: Add TIFF support
@@ -145,6 +161,11 @@ public class ImageParserTest extends Tes
//
// assertEquals("75", metadata.get("height"));
// assertEquals("100", metadata.get("width"));
+//
+// assertEquals("100", metadata.get(Metadata.IMAGE_WIDTH));
+// assertEquals("75", metadata.get(Metadata.IMAGE_LENGTH));
+// assertEquals("8 8 8", metadata.get(Metadata.BITS_PER_SAMPLE));
+// assertEquals("Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the NOTICE file distributed with this work for additional information regarding copyright ownership.", metadata.get(Metadata.COMMENTS));
// }
}
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/TiffParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/TiffParserTest.java?rev=958581&r1=958580&r2=958581&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/TiffParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/image/TiffParserTest.java Mon Jun 28 13:59:08 2010
@@ -38,5 +38,14 @@ public class TiffParserTest extends Test
"more contributor license agreements. See the NOTICE file " +
"distributed with this work for additional information regarding " +
"copyright ownership.", metadata.get("Image Description"));
+
+ // All EXIF/TIFF tags
+ assertEquals("Inch", metadata.get("Resolution Unit"));
+
+ // Core EXIF/TIFF tags
+ assertEquals("100", metadata.get(Metadata.IMAGE_WIDTH));
+ assertEquals("75", metadata.get(Metadata.IMAGE_LENGTH));
+ assertEquals("8", metadata.get(Metadata.BITS_PER_SAMPLE));
+ assertEquals("3", metadata.get(Metadata.SAMPLES_PER_PIXEL));
}
}
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java?rev=958581&r1=958580&r2=958581&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java Mon Jun 28 13:59:08 2010
@@ -33,7 +33,18 @@ public class JpegParserTest extends Test
getClass().getResourceAsStream("/test-documents/testJPEG_EXIF.jpg");
parser.parse(stream, new DefaultHandler(), metadata);
+ // All EXIF/TIFF tags
assertEquals("Canon EOS 40D", metadata.get("Model"));
+
+ // Core EXIF/TIFF tags
+ assertEquals("100", metadata.get(Metadata.IMAGE_WIDTH));
+ assertEquals("68", metadata.get(Metadata.IMAGE_LENGTH));
+ assertEquals("8", metadata.get(Metadata.BITS_PER_SAMPLE));
+ assertEquals(null, metadata.get(Metadata.SAMPLES_PER_PIXEL));
+
+ // Common tags
+ assertEquals("2009/10/02 23:02:49", metadata.get(Metadata.DATE));
+ assertEquals("canon-55-250 moscow-birds serbor", metadata.get(Metadata.KEYWORDS));
}
}