You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by dm...@apache.org on 2010/04/08 15:30:02 UTC

svn commit: r931931 - /lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java

Author: dmeikle
Date: Thu Apr  8 13:30:02 2010
New Revision: 931931

URL: http://svn.apache.org/viewvc?rev=931931&view=rev
Log:
TIKA-396: Parse attachments included within an Outlook message.

Modified:
    lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java

Modified: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java?rev=931931&r1=931930&r2=931931&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java (original)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java Thu Apr  8 13:30:02 2010
@@ -17,6 +17,8 @@
 package org.apache.tika.parser.microsoft;
 
 import java.io.IOException;
+import java.io.InputStream;
+import java.util.Map;
 
 import org.apache.poi.hsmf.datatypes.Chunks;
 import org.apache.poi.hsmf.datatypes.StringChunk;
@@ -25,6 +27,9 @@ import org.apache.poi.hsmf.parsers.POIFS
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.EmbeddedContentHandler;
 import org.apache.tika.sax.XHTMLContentHandler;
 import org.xml.sax.SAXException;
 
@@ -37,9 +42,12 @@ class OutlookExtractor {
 
     private final POIFSChunkParser parser;
 
+    private final AutoDetectParser attachmentParser;
+
     public OutlookExtractor(POIFSFileSystem filesystem) throws TikaException {
         try {
             this.parser = new POIFSChunkParser(filesystem);
+            this.attachmentParser = new AutoDetectParser();
             this.chunks = parser.identifyChunks();
         } catch (IOException e) {
             throw new TikaException("Failed to parse Outlook chunks", e);
@@ -65,6 +73,28 @@ class OutlookExtractor {
         xhtml.endElement("dl");
 
         xhtml.element("p", getChunk(chunks.textBodyChunk));
+
+        Map<String, InputStream> attachments =  parser.getAttachmentList();
+        for (String key : attachments.keySet())
+        {
+            xhtml.startElement("div", "class", "attachment-entry");
+            Metadata entrydata = new Metadata();
+            if (key != null && key.length() > 0) {
+                entrydata.set(Metadata.RESOURCE_NAME_KEY, key);
+                xhtml.element("h1", key);
+            }
+            try {
+                // Use the delegate parser to parse this entry
+                attachmentParser.parse(
+                        attachments.get(key),
+                        new EmbeddedContentHandler(
+                                new BodyContentHandler(xhtml)),
+                                entrydata);
+            } catch (Exception e) {
+                // Could not parse the entry, just skip the content
+            }
+            xhtml.endElement("div");
+        }
     }
 
     private void header(XHTMLContentHandler xhtml, String key, String value)