You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2011/05/06 07:14:39 UTC

svn commit: r1100061 - in /tika/trunk/tika-parsers/src: main/java/org/apache/tika/parser/mbox/MboxParser.java main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java

Author: nick
Date: Fri May  6 05:14:39 2011
New Revision: 1100061

URL: http://svn.apache.org/viewvc?rev=1100061&view=rev
Log:
TIKA-656 Update the Outlook parser to handle dates the same way as the other mail parsers

Modified:
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mbox/MboxParser.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mbox/MboxParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mbox/MboxParser.java?rev=1100061&r1=1100060&r2=1100061&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mbox/MboxParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mbox/MboxParser.java Fri May  6 05:14:39 2011
@@ -243,7 +243,7 @@ public class MboxParser extends Abstract
         }
     }
     
-    private Date parseDate(String headerContent) throws ParseException {
+    public static Date parseDate(String headerContent) throws ParseException {
         SimpleDateFormat dateFormat = new SimpleDateFormat("EEE, d MMM yyyy HH:mm:ss Z", Locale.US);
         return dateFormat.parse(headerContent);
     }

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java?rev=1100061&r1=1100060&r2=1100061&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java Fri May  6 05:14:39 2011
@@ -18,6 +18,8 @@ package org.apache.tika.parser.microsoft
 
 import java.io.ByteArrayInputStream;
 import java.io.IOException;
+import java.text.ParseException;
+import java.util.Date;
 
 import org.apache.poi.hmef.attribute.MAPIRtfAttribute;
 import org.apache.poi.hsmf.MAPIMessage;
@@ -34,6 +36,7 @@ import org.apache.tika.io.TikaInputStrea
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.html.HtmlParser;
+import org.apache.tika.parser.mbox.MboxParser;
 import org.apache.tika.parser.rtf.RTFParser;
 import org.apache.tika.parser.txt.CharsetDetector;
 import org.apache.tika.sax.XHTMLContentHandler;
@@ -109,8 +112,9 @@ public class OutlookExtractor extends Ab
            // Date - try two ways to find it
            // First try via the proper chunk
            if(msg.getMessageDate() != null) {
-              metadata.set(Metadata.EDIT_TIME, msg.getMessageDate().getTime().toString());
-              metadata.set(Metadata.LAST_SAVED, msg.getMessageDate().getTime().toString());
+              metadata.set(Metadata.DATE, msg.getMessageDate().getTime());
+              metadata.set(Metadata.CREATION_DATE, msg.getMessageDate().getTime());
+              metadata.set(Metadata.LAST_SAVED, msg.getMessageDate().getTime());
            } else {
               try {
                  // Failing that try via the raw headers 
@@ -118,9 +122,20 @@ public class OutlookExtractor extends Ab
                  if(headers != null && headers.length > 0) {
                      for(String header: headers) {
                         if(header.toLowerCase().startsWith("date:")) {
-                            String date = header.substring(header.indexOf(':')+1);
-                            metadata.set(Metadata.EDIT_TIME, date);
-                            metadata.set(Metadata.LAST_SAVED, date);
+                            String date = header.substring(header.indexOf(':')+1).trim();
+                            
+                            // See if we can parse it as a normal mail date
+                            try {
+                               Date d = MboxParser.parseDate(date);
+                               metadata.set(Metadata.DATE, d);
+                               metadata.set(Metadata.CREATION_DATE, d);
+                               metadata.set(Metadata.LAST_SAVED, d);
+                            } catch(ParseException e) {
+                               // Store it as-is, and hope for the best...
+                               metadata.set(Metadata.DATE, date);
+                               metadata.set(Metadata.CREATION_DATE, date);
+                               metadata.set(Metadata.LAST_SAVED, date);
+                            }
                             break;
                         }
                      }

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java?rev=1100061&r1=1100060&r2=1100061&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java Fri May  6 05:14:39 2011
@@ -59,6 +59,11 @@ public class OutlookParserTest extends T
         assertEquals(
                 "L'\u00C9quipe Microsoft Outlook Express",
                 metadata.get(Metadata.AUTHOR));
+        
+        // Stored as Thu, 5 Apr 2007 09:26:06 -0700
+        assertEquals(
+                "2007-04-05T16:26:06Z",
+                metadata.get(Metadata.DATE));
 
         String content = handler.toString();
         assertTrue(content.contains(""));