You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2011/05/06 07:14:39 UTC
svn commit: r1100061 - in /tika/trunk/tika-parsers/src:
main/java/org/apache/tika/parser/mbox/MboxParser.java
main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
Author: nick
Date: Fri May 6 05:14:39 2011
New Revision: 1100061
URL: http://svn.apache.org/viewvc?rev=1100061&view=rev
Log:
TIKA-656: Update the Outlook parser to handle dates the same way as the other mail parsers.
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mbox/MboxParser.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mbox/MboxParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mbox/MboxParser.java?rev=1100061&r1=1100060&r2=1100061&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mbox/MboxParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mbox/MboxParser.java Fri May 6 05:14:39 2011
@@ -243,7 +243,7 @@ public class MboxParser extends Abstract
}
}
- private Date parseDate(String headerContent) throws ParseException {
+ public static Date parseDate(String headerContent) throws ParseException {
SimpleDateFormat dateFormat = new SimpleDateFormat("EEE, d MMM yyyy HH:mm:ss Z", Locale.US);
return dateFormat.parse(headerContent);
}
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java?rev=1100061&r1=1100060&r2=1100061&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java Fri May 6 05:14:39 2011
@@ -18,6 +18,8 @@ package org.apache.tika.parser.microsoft
import java.io.ByteArrayInputStream;
import java.io.IOException;
+import java.text.ParseException;
+import java.util.Date;
import org.apache.poi.hmef.attribute.MAPIRtfAttribute;
import org.apache.poi.hsmf.MAPIMessage;
@@ -34,6 +36,7 @@ import org.apache.tika.io.TikaInputStrea
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.html.HtmlParser;
+import org.apache.tika.parser.mbox.MboxParser;
import org.apache.tika.parser.rtf.RTFParser;
import org.apache.tika.parser.txt.CharsetDetector;
import org.apache.tika.sax.XHTMLContentHandler;
@@ -109,8 +112,9 @@ public class OutlookExtractor extends Ab
// Date - try two ways to find it
// First try via the proper chunk
if(msg.getMessageDate() != null) {
- metadata.set(Metadata.EDIT_TIME, msg.getMessageDate().getTime().toString());
- metadata.set(Metadata.LAST_SAVED, msg.getMessageDate().getTime().toString());
+ metadata.set(Metadata.DATE, msg.getMessageDate().getTime());
+ metadata.set(Metadata.CREATION_DATE, msg.getMessageDate().getTime());
+ metadata.set(Metadata.LAST_SAVED, msg.getMessageDate().getTime());
} else {
try {
// Failing that try via the raw headers
@@ -118,9 +122,20 @@ public class OutlookExtractor extends Ab
if(headers != null && headers.length > 0) {
for(String header: headers) {
if(header.toLowerCase().startsWith("date:")) {
- String date = header.substring(header.indexOf(':')+1);
- metadata.set(Metadata.EDIT_TIME, date);
- metadata.set(Metadata.LAST_SAVED, date);
+ String date = header.substring(header.indexOf(':')+1).trim();
+
+ // See if we can parse it as a normal mail date
+ try {
+ Date d = MboxParser.parseDate(date);
+ metadata.set(Metadata.DATE, d);
+ metadata.set(Metadata.CREATION_DATE, d);
+ metadata.set(Metadata.LAST_SAVED, d);
+ } catch(ParseException e) {
+ // Store it as-is, and hope for the best...
+ metadata.set(Metadata.DATE, date);
+ metadata.set(Metadata.CREATION_DATE, date);
+ metadata.set(Metadata.LAST_SAVED, date);
+ }
break;
}
}
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java?rev=1100061&r1=1100060&r2=1100061&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java Fri May 6 05:14:39 2011
@@ -59,6 +59,11 @@ public class OutlookParserTest extends T
assertEquals(
"L'\u00C9quipe Microsoft Outlook Express",
metadata.get(Metadata.AUTHOR));
+
+ // Stored as Thu, 5 Apr 2007 09:26:06 -0700
+ assertEquals(
+ "2007-04-05T16:26:06Z",
+ metadata.get(Metadata.DATE));
String content = handler.toString();
assertTrue(content.contains(""));