You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2010/06/29 14:06:19 UTC

svn commit: r958942 - in /tika/trunk/tika-parsers/src: main/java/org/apache/tika/parser/html/ main/java/org/apache/tika/parser/image/ main/java/org/apache/tika/parser/jpeg/ test/java/org/apache/tika/parser/html/ test/java/org/apache/tika/parser/jpeg/ t...

Author: nick
Date: Tue Jun 29 12:06:19 2010
New Revision: 958942

URL: http://svn.apache.org/viewvc?rev=958942&view=rev
Log:
Enable extraction of longitude and latitude from JPEG/Tiff files (via the EXIF tags), and HTML (via the ICBM meta tag), to the new geographic metadata namespace

Added:
    tika/trunk/tika-parsers/src/test/resources/test-documents/testJPEG_GEO.jpg   (with props)
Modified:
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/TiffExtractor.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/jpeg/JpegExtractor.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java
    tika/trunk/tika-parsers/src/test/resources/test-documents/testHTML.html

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java?rev=958942&r1=958941&r2=958942&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlHandler.java Tue Jun 29 12:06:19 2010
@@ -18,6 +18,8 @@ package org.apache.tika.parser.html;
 
 import java.net.MalformedURLException;
 import java.net.URL;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
 
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.sax.TextContentHandler;
@@ -93,9 +95,21 @@ class HtmlHandler extends TextContentHan
                     xhtml.startElement(uri, local, "meta", atts);
                 }
                 if (atts.getValue("name") != null) {
+                    // Record the meta tag in the metadata
                     metadata.set(
                             atts.getValue("name"),
                             atts.getValue("content"));
+                    // Normalise if possible
+                    if(atts.getValue("name").equalsIgnoreCase("ICBM")) {
+                        Matcher m = Pattern.compile(
+                              "\\s*(-?\\d+\\.\\d+)[,\\s]+(-?\\d+\\.\\d+)\\s*"
+                        ).matcher(atts.getValue("content"));
+                        if(m.matches()) {
+                            metadata.set(Metadata.LATITUDE, m.group(1));
+                            metadata.set(Metadata.LONGITUDE, m.group(2));
+                        }
+                    }
+                    // Allow downstream processing
                     xhtml.startElement(uri, local, "meta", atts);
                 }
             } else if ("BASE".equals(name) && atts.getValue("href") != null) {

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/TiffExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/TiffExtractor.java?rev=958942&r1=958941&r2=958942&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/TiffExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/image/TiffExtractor.java Tue Jun 29 12:06:19 2010
@@ -18,6 +18,7 @@ package org.apache.tika.parser.image;
 
 import java.io.IOException;
 import java.io.InputStream;
+import java.text.DecimalFormat;
 import java.util.Iterator;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
@@ -57,6 +58,7 @@ public class TiffExtractor {
                     metadata.set(tag.getTagName(), tag.getDescription());
                     handleCommonImageTags(metadata, tag);
                 }
+                handleGeoImageTags(metadata);
             }
         } catch (TiffProcessingException e) {
             throw new TikaException("Can't read TIFF metadata", e);
@@ -64,6 +66,52 @@ public class TiffExtractor {
             throw new TikaException("Can't read TIFF metadata", e);
         }
     }
+    
+    /**
+     * Maps EXIF Geo Tags onto the Tika Geo metadata namespace, setting Metadata.LATITUDE and Metadata.LONGITUDE as formatted decimal text.
+     * Needs to be run at the end, because the GPS information is spread across several EXIF tags (a coordinate entry plus a separate Ref entry).
+     * A coordinate whose raw text cannot be parsed is skipped; a positive value whose Ref is S (latitude) or W (longitude) is negated.
+     */
+    public static void handleGeoImageTags(Metadata metadata) {
+	String lat = metadata.get("GPS Latitude"); // raw coordinate text as stored earlier under this key
+	String latNS = metadata.get("GPS Latitude Ref"); // hemisphere marker; only S (any case) changes the result
+	if(lat != null) {
+	    Double latitude = parseHMS(lat); // null when the text does not match the expected pattern
+	    if(latitude != null) {
+		if(latNS != null && latNS.equalsIgnoreCase("S") && // southern reference ...
+			latitude > 0) { // ... flips only a positive value, so an already negative one is not negated twice
+		    latitude *= -1;
+		}
+		metadata.set(Metadata.LATITUDE, LAT_LONG_FORMAT.format(latitude)); // stored as formatted decimal text
+	    }
+	}
+	
+	String lng = metadata.get("GPS Longitude"); // same handling as latitude, with W as the negating reference
+	String lngEW = metadata.get("GPS Longitude Ref");
+	if(lng != null) {
+	    Double longitude = parseHMS(lng); // null when the text does not match the expected pattern
+	    if(longitude != null) {
+		if(lngEW != null && lngEW.equalsIgnoreCase("W") && // western reference ...
+			longitude > 0) { // ... flips only a positive value
+		    longitude *= -1;
+		}
+		metadata.set(Metadata.LONGITUDE, LAT_LONG_FORMAT.format(longitude)); // stored as formatted decimal text
+	    }
+	}
+    }
+    private static Double parseHMS(String hms) { // converts text matching HOURS_MINUTES_SECONDS into a single decimal value; returns null otherwise
+       Matcher m = HOURS_MINUTES_SECONDS.matcher(hms);
+       if(m.matches()) { // the whole string must match, not just a prefix
+          double value = 
+            Integer.parseInt(m.group(1)) + // whole units; the pattern allows a leading minus sign here
+            (Integer.parseInt(m.group(2))/60.0) + // second group contributes sixtieths
+            (Double.parseDouble(m.group(3))/60.0/60.0); // third group contributes 1/3600ths and may carry a fraction
+          return value; // NOTE(review): with a negative first group the two fractions are still added, moving the result toward zero - confirm intended
+       }
+       return null; // signals unparseable input to the caller
+    }
+    private static final Pattern HOURS_MINUTES_SECONDS = Pattern.compile("(-?\\d+)\"(\\d+)'(\\d+\\.?\\d*)"); // three numeric groups: optionally signed integer, double quote, integer, single quote, number with optional fraction
+    private static final DecimalFormat LAT_LONG_FORMAT = new DecimalFormat("##0.0####"); // at least one integer and one fraction digit, at most five fraction digits; NOTE(review): built with the default locale and shared statically - confirm separator and thread-safety expectations
 
 
     /**

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/jpeg/JpegExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/jpeg/JpegExtractor.java?rev=958942&r1=958941&r2=958942&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/jpeg/JpegExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/jpeg/JpegExtractor.java Tue Jun 29 12:06:19 2010
@@ -55,6 +55,7 @@ class JpegExtractor {
                     metadata.set(tag.getTagName(), tag.getDescription());
                     TiffExtractor.handleCommonImageTags(metadata, tag);
                 }
+                TiffExtractor.handleGeoImageTags(metadata);
             }
         } catch (JpegProcessingException e) {
             throw new TikaException("Can't read JPEG metadata", e);

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java?rev=958942&r1=958941&r2=958942&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java Tue Jun 29 12:06:19 2010
@@ -71,6 +71,9 @@ public class HtmlParserTest extends Test
                 "Title : Test Indexation Html", metadata.get(Metadata.TITLE));
         assertEquals("Tika Developers", metadata.get("Author"));
         assertEquals("5", metadata.get("refresh"));
+        
+        assertEquals("51.2312", metadata.get(Metadata.LATITUDE));
+        assertEquals("-5.1987", metadata.get(Metadata.LONGITUDE));
 
         assertEquals("http://www.apache.org/", href.toString());
         assertEquals("test-anchor", name.toString());

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java?rev=958942&r1=958941&r2=958942&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/jpeg/JpegParserTest.java Tue Jun 29 12:06:19 2010
@@ -17,6 +17,8 @@
 package org.apache.tika.parser.jpeg;
 
 import junit.framework.TestCase;
+
+import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.metadata.Metadata;
 import org.xml.sax.helpers.DefaultHandler;
@@ -31,7 +33,7 @@ public class JpegParserTest extends Test
         metadata.set(Metadata.CONTENT_TYPE, "image/jpeg");
         InputStream stream =
             getClass().getResourceAsStream("/test-documents/testJPEG_EXIF.jpg");
-        parser.parse(stream, new DefaultHandler(), metadata);
+        parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
 
         // All EXIF/TIFF tags
         assertEquals("Canon EOS 40D", metadata.get("Model"));
@@ -47,4 +49,28 @@ public class JpegParserTest extends Test
         assertEquals("canon-55-250 moscow-birds serbor", metadata.get(Metadata.KEYWORDS));
     }
 
+    public void testJPEGGeo() throws Exception { // parses testJPEG_GEO.jpg and checks the geo fields alongside the usual EXIF-derived fields
+        Metadata metadata = new Metadata();
+        metadata.set(Metadata.CONTENT_TYPE, "image/jpeg");
+        InputStream stream =
+            getClass().getResourceAsStream("/test-documents/testJPEG_GEO.jpg");
+        parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
+        
+        // Geo tags: expected decimal latitude and longitude text for this test image
+        assertEquals("12.54321", metadata.get(Metadata.LATITUDE));
+        assertEquals("-54.1234", metadata.get(Metadata.LONGITUDE));
+
+        // All EXIF/TIFF tags: looked up by their plain tag name
+        assertEquals("Canon EOS 40D", metadata.get("Model"));
+        
+        // Core EXIF/TIFF tags: looked up through the Metadata constants
+        assertEquals("100", metadata.get(Metadata.IMAGE_WIDTH));
+        assertEquals("68", metadata.get(Metadata.IMAGE_LENGTH));
+        assertEquals("8", metadata.get(Metadata.BITS_PER_SAMPLE));
+        assertEquals(null, metadata.get(Metadata.SAMPLES_PER_PIXEL)); // expected to be absent for this image
+        
+        // Common tags: date and keywords
+        assertEquals("2009/10/02 23:02:49", metadata.get(Metadata.DATE));
+        assertEquals("canon-55-250 moscow-birds serbor", metadata.get(Metadata.KEYWORDS));
+    }
 }

Modified: tika/trunk/tika-parsers/src/test/resources/test-documents/testHTML.html
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testHTML.html?rev=958942&r1=958941&r2=958942&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/resources/test-documents/testHTML.html (original)
+++ tika/trunk/tika-parsers/src/test/resources/test-documents/testHTML.html Tue Jun 29 12:06:19 2010
@@ -18,10 +18,11 @@
 	<head>
         <title>Title : Test Indexation Html</title>
         <meta name="Author" content="Tika Developers">
+        <meta name="ICBM" content="51.2312, -5.1987">
         <meta http-equiv="refresh" content="5">
     </head>
 	<body>
 		<h1><a name="test-anchor"></a>Test Indexation Html</h1>
 		<p><a href="http://www.apache.org/">Indexation</a> du fichier</p>
 	</body>
-</html>
\ No newline at end of file
+</html>

Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testJPEG_GEO.jpg
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testJPEG_GEO.jpg?rev=958942&view=auto
==============================================================================
Binary file - no diff available.

Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testJPEG_GEO.jpg
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream