You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by dm...@apache.org on 2010/04/08 15:30:02 UTC
svn commit: r931931 -
/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
Author: dmeikle
Date: Thu Apr 8 13:30:02 2010
New Revision: 931931
URL: http://svn.apache.org/viewvc?rev=931931&view=rev
Log:
TIKA-396: Parse attachments included within Outlook messages.
Modified:
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
Modified: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java?rev=931931&r1=931930&r2=931931&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java (original)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java Thu Apr 8 13:30:02 2010
@@ -17,6 +17,8 @@
package org.apache.tika.parser.microsoft;
import java.io.IOException;
+import java.io.InputStream;
+import java.util.Map;
import org.apache.poi.hsmf.datatypes.Chunks;
import org.apache.poi.hsmf.datatypes.StringChunk;
@@ -25,6 +27,9 @@ import org.apache.poi.hsmf.parsers.POIFS
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.EmbeddedContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.SAXException;
@@ -37,9 +42,12 @@ class OutlookExtractor {
private final POIFSChunkParser parser;
+ private final AutoDetectParser attachmentParser;
+
public OutlookExtractor(POIFSFileSystem filesystem) throws TikaException {
try {
this.parser = new POIFSChunkParser(filesystem);
+ this.attachmentParser = new AutoDetectParser();
this.chunks = parser.identifyChunks();
} catch (IOException e) {
throw new TikaException("Failed to parse Outlook chunks", e);
@@ -65,6 +73,28 @@ class OutlookExtractor {
xhtml.endElement("dl");
xhtml.element("p", getChunk(chunks.textBodyChunk));
+
+ Map<String, InputStream> attachments = parser.getAttachmentList();
+ for (String key : attachments.keySet())
+ {
+ xhtml.startElement("div", "class", "attachment-entry");
+ Metadata entrydata = new Metadata();
+ if (key != null && key.length() > 0) {
+ entrydata.set(Metadata.RESOURCE_NAME_KEY, key);
+ xhtml.element("h1", key);
+ }
+ try {
+ // Use the delegate parser to parse this entry
+ attachmentParser.parse(
+ attachments.get(key),
+ new EmbeddedContentHandler(
+ new BodyContentHandler(xhtml)),
+ entrydata);
+ } catch (Exception e) {
+ // Could not parse the entry, just skip the content
+ }
+ xhtml.endElement("div");
+ }
}
private void header(XHTMLContentHandler xhtml, String key, String value)