You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2010/06/29 14:06:19 UTC
svn commit: r958942 - in /tika/trunk/tika-parsers/src:
main/java/org/apache/tika/parser/html/
main/java/org/apache/tika/parser/image/
main/java/org/apache/tika/parser/jpeg/ test/java/org/apache/tika/parser/html/
test/java/org/apache/tika/parser/jpeg/ test/resources/test-documents/
Author: nick
Date: Tue Jun 29 12:06:19 2010
New Revision: 958942
URL: http://svn.apache.org/viewvc?rev=958942&view=rev
Log:
Enable extraction of longitude and latitude from JPEG/TIFF files (via the EXIF tags) and from HTML (via the ICBM meta tag) into the new geographic metadata namespace
Added:
tika/trunk/tika-parsers/src/test/resources/test-documents/testJPEG_GEO.jpg (with props)
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/TiffExtractor.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/jpeg/JpegExtractor.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java
tika/trunk/tika-parsers/src/test/resources/test-documents/testHTML.html
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java?rev=958942&r1=958941&r2=958942&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java Tue Jun 29 12:06:19 2010
@@ -18,6 +18,8 @@ package org.apache.tika.parser.html;
import java.net.MalformedURLException;
import java.net.URL;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.sax.TextContentHandler;
@@ -93,9 +95,21 @@ class HtmlHandler extends TextContentHan
xhtml.startElement(uri, local, "meta", atts);
}
if (atts.getValue("name") != null) {
+ // Record the meta tag in the metadata
metadata.set(
atts.getValue("name"),
atts.getValue("content"));
+ // Normalise if possible
+ if(atts.getValue("name").equalsIgnoreCase("ICBM")) {
+ Matcher m = Pattern.compile(
+ "\\s*(-?\\d+\\.\\d+)[,\\s]+(-?\\d+\\.\\d+)\\s*"
+ ).matcher(atts.getValue("content"));
+ if(m.matches()) {
+ metadata.set(Metadata.LATITUDE, m.group(1));
+ metadata.set(Metadata.LONGITUDE, m.group(2));
+ }
+ }
+ // Allow downstream processing
xhtml.startElement(uri, local, "meta", atts);
}
} else if ("BASE".equals(name) && atts.getValue("href") != null) {
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/TiffExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/TiffExtractor.java?rev=958942&r1=958941&r2=958942&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/TiffExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/TiffExtractor.java Tue Jun 29 12:06:19 2010
@@ -18,6 +18,7 @@ package org.apache.tika.parser.image;
import java.io.IOException;
import java.io.InputStream;
+import java.text.DecimalFormat;
import java.util.Iterator;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
@@ -57,6 +58,7 @@ public class TiffExtractor {
metadata.set(tag.getTagName(), tag.getDescription());
handleCommonImageTags(metadata, tag);
}
+ handleGeoImageTags(metadata);
}
} catch (TiffProcessingException e) {
throw new TikaException("Can't read TIFF metadata", e);
@@ -64,6 +66,52 @@ public class TiffExtractor {
throw new TikaException("Can't read TIFF metadata", e);
}
}
+
+ /**
+ * Maps EXIF Geo Tags onto the Tika Geo metadata namespace.
+ * Needs to be run at the end, because the GPS information
+ * is spread across several EXIF tags.
+ */
+ public static void handleGeoImageTags(Metadata metadata) {
+ String lat = metadata.get("GPS Latitude");
+ String latNS = metadata.get("GPS Latitude Ref");
+ if(lat != null) {
+ Double latitude = parseHMS(lat);
+ if(latitude != null) {
+ if(latNS != null && latNS.equalsIgnoreCase("S") &&
+ latitude > 0) {
+ latitude *= -1;
+ }
+ metadata.set(Metadata.LATITUDE, LAT_LONG_FORMAT.format(latitude));
+ }
+ }
+
+ String lng = metadata.get("GPS Longitude");
+ String lngEW = metadata.get("GPS Longitude Ref");
+ if(lng != null) {
+ Double longitude = parseHMS(lng);
+ if(longitude != null) {
+ if(lngEW != null && lngEW.equalsIgnoreCase("W") &&
+ longitude > 0) {
+ longitude *= -1;
+ }
+ metadata.set(Metadata.LONGITUDE, LAT_LONG_FORMAT.format(longitude));
+ }
+ }
+ }
+ private static Double parseHMS(String hms) {
+ Matcher m = HOURS_MINUTES_SECONDS.matcher(hms);
+ if(m.matches()) {
+ double value =
+ Integer.parseInt(m.group(1)) +
+ (Integer.parseInt(m.group(2))/60.0) +
+ (Double.parseDouble(m.group(3))/60.0/60.0);
+ return value;
+ }
+ return null;
+ }
+ private static final Pattern HOURS_MINUTES_SECONDS = Pattern.compile("(-?\\d+)\"(\\d+)'(\\d+\\.?\\d*)");
+ private static final DecimalFormat LAT_LONG_FORMAT = new DecimalFormat("##0.0####");
/**
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/jpeg/JpegExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/jpeg/JpegExtractor.java?rev=958942&r1=958941&r2=958942&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/jpeg/JpegExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/jpeg/JpegExtractor.java Tue Jun 29 12:06:19 2010
@@ -55,6 +55,7 @@ class JpegExtractor {
metadata.set(tag.getTagName(), tag.getDescription());
TiffExtractor.handleCommonImageTags(metadata, tag);
}
+ TiffExtractor.handleGeoImageTags(metadata);
}
} catch (JpegProcessingException e) {
throw new TikaException("Can't read JPEG metadata", e);
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java?rev=958942&r1=958941&r2=958942&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java Tue Jun 29 12:06:19 2010
@@ -71,6 +71,9 @@ public class HtmlParserTest extends Test
"Title : Test Indexation Html", metadata.get(Metadata.TITLE));
assertEquals("Tika Developers", metadata.get("Author"));
assertEquals("5", metadata.get("refresh"));
+
+ assertEquals("51.2312", metadata.get(Metadata.LATITUDE));
+ assertEquals("-5.1987", metadata.get(Metadata.LONGITUDE));
assertEquals("http://www.apache.org/", href.toString());
assertEquals("test-anchor", name.toString());
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java?rev=958942&r1=958941&r2=958942&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java Tue Jun 29 12:06:19 2010
@@ -17,6 +17,8 @@
package org.apache.tika.parser.jpeg;
import junit.framework.TestCase;
+
+import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.metadata.Metadata;
import org.xml.sax.helpers.DefaultHandler;
@@ -31,7 +33,7 @@ public class JpegParserTest extends Test
metadata.set(Metadata.CONTENT_TYPE, "image/jpeg");
InputStream stream =
getClass().getResourceAsStream("/test-documents/testJPEG_EXIF.jpg");
- parser.parse(stream, new DefaultHandler(), metadata);
+ parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
// All EXIF/TIFF tags
assertEquals("Canon EOS 40D", metadata.get("Model"));
@@ -47,4 +49,28 @@ public class JpegParserTest extends Test
assertEquals("canon-55-250 moscow-birds serbor", metadata.get(Metadata.KEYWORDS));
}
+ public void testJPEGGeo() throws Exception {
+ Metadata metadata = new Metadata();
+ metadata.set(Metadata.CONTENT_TYPE, "image/jpeg");
+ InputStream stream =
+ getClass().getResourceAsStream("/test-documents/testJPEG_GEO.jpg");
+ parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
+
+ // Geo tags
+ assertEquals("12.54321", metadata.get(Metadata.LATITUDE));
+ assertEquals("-54.1234", metadata.get(Metadata.LONGITUDE));
+
+ // All EXIF/TIFF tags
+ assertEquals("Canon EOS 40D", metadata.get("Model"));
+
+ // Core EXIF/TIFF tags
+ assertEquals("100", metadata.get(Metadata.IMAGE_WIDTH));
+ assertEquals("68", metadata.get(Metadata.IMAGE_LENGTH));
+ assertEquals("8", metadata.get(Metadata.BITS_PER_SAMPLE));
+ assertEquals(null, metadata.get(Metadata.SAMPLES_PER_PIXEL));
+
+ // Common tags
+ assertEquals("2009/10/02 23:02:49", metadata.get(Metadata.DATE));
+ assertEquals("canon-55-250 moscow-birds serbor", metadata.get(Metadata.KEYWORDS));
+ }
}
Modified: tika/trunk/tika-parsers/src/test/resources/test-documents/testHTML.html
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testHTML.html?rev=958942&r1=958941&r2=958942&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/resources/test-documents/testHTML.html (original)
+++ tika/trunk/tika-parsers/src/test/resources/test-documents/testHTML.html Tue Jun 29 12:06:19 2010
@@ -18,10 +18,11 @@
<head>
<title>Title : Test Indexation Html</title>
<meta name="Author" content="Tika Developers">
+ <meta name="ICBM" content="51.2312, -5.1987">
<meta http-equiv="refresh" content="5">
</head>
<body>
<h1><a name="test-anchor"></a>Test Indexation Html</h1>
<p><a href="http://www.apache.org/">Indexation</a> du fichier</p>
</body>
-</html>
\ No newline at end of file
+</html>
Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testJPEG_GEO.jpg
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testJPEG_GEO.jpg?rev=958942&view=auto
==============================================================================
Binary file - no diff available.
Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testJPEG_GEO.jpg
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream