You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2010/04/14 12:06:33 UTC

svn commit: r933903 - in /lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft: OfficeParser.java OutlookExtractor.java

Author: jukka
Date: Wed Apr 14 10:06:33 2010
New Revision: 933903

URL: http://svn.apache.org/viewvc?rev=933903&view=rev
Log:
TIKA-396: Parse Attachments from Outlook Messages

Use the parser supplied through the ParseContext for attachments, instead of a hard-coded AutoDetectParser

Modified:
    lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
    lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java

Modified: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java?rev=933903&r1=933902&r2=933903&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java (original)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java Wed Apr 14 10:06:33 2010
@@ -133,7 +133,9 @@ public class OfficeParser implements Par
                    // TODO: Cleaner mechanism for detecting Outlook
                    outlookExtracted = true;
                    setType(metadata, "application/vnd.ms-outlook");
-                   new OutlookExtractor(filesystem).parse(xhtml, metadata);
+                   OutlookExtractor extractor =
+                       new OutlookExtractor(filesystem, context);
+                   extractor.parse(xhtml, metadata);
                }
             }
         }

Modified: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java?rev=933903&r1=933902&r2=933903&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java (original)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java Wed Apr 14 10:06:33 2010
@@ -27,7 +27,9 @@ import org.apache.poi.hsmf.parsers.POIFS
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
-import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.EmptyParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
 import org.apache.tika.sax.BodyContentHandler;
 import org.apache.tika.sax.EmbeddedContentHandler;
 import org.apache.tika.sax.XHTMLContentHandler;
@@ -42,18 +44,21 @@ class OutlookExtractor {
 
     private final POIFSChunkParser parser;
 
-    private final AutoDetectParser attachmentParser;
+    private final ParseContext context;
 
-    public OutlookExtractor(POIFSFileSystem filesystem) throws TikaException {
+    public OutlookExtractor(
+            POIFSFileSystem filesystem, ParseContext context)
+            throws TikaException {
         try {
+            this.context = context;
             this.parser = new POIFSChunkParser(filesystem);
-            this.attachmentParser = new AutoDetectParser();
             this.chunks = parser.identifyChunks();
         } catch (IOException e) {
             throw new TikaException("Failed to parse Outlook chunks", e);
         }
     }
 
+    @SuppressWarnings("unchecked")
     public void parse(XHTMLContentHandler xhtml, Metadata metadata)
             throws TikaException, SAXException {
         String subject = getChunk(chunks.subjectChunk);
@@ -85,11 +90,11 @@ class OutlookExtractor {
             }
             try {
                 // Use the delegate parser to parse this entry
-                attachmentParser.parse(
+                context.get(Parser.class, EmptyParser.INSTANCE).parse(
                         attachments.get(key),
-                        new EmbeddedContentHandler(
-                                new BodyContentHandler(xhtml)),
-                                entrydata);
+                        new EmbeddedContentHandler(new BodyContentHandler(xhtml)),
+                        entrydata,
+                        context);
             } catch (Exception e) {
                 // Could not parse the entry, just skip the content
             }