You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2010/04/14 12:06:33 UTC
svn commit: r933903 - in
/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft:
OfficeParser.java OutlookExtractor.java
Author: jukka
Date: Wed Apr 14 10:06:33 2010
New Revision: 933903
URL: http://svn.apache.org/viewvc?rev=933903&view=rev
Log:
TIKA-396: Parse Attachments from Outlook Messages
Use the context parser for attachments
Modified:
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
Modified: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java?rev=933903&r1=933902&r2=933903&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java (original)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java Wed Apr 14 10:06:33 2010
@@ -133,7 +133,9 @@ public class OfficeParser implements Par
// TODO: Cleaner mechanism for detecting Outlook
outlookExtracted = true;
setType(metadata, "application/vnd.ms-outlook");
- new OutlookExtractor(filesystem).parse(xhtml, metadata);
+ OutlookExtractor extractor =
+ new OutlookExtractor(filesystem, context);
+ extractor.parse(xhtml, metadata);
}
}
}
Modified: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java?rev=933903&r1=933902&r2=933903&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java (original)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java Wed Apr 14 10:06:33 2010
@@ -27,7 +27,9 @@ import org.apache.poi.hsmf.parsers.POIFS
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
-import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.EmptyParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.EmbeddedContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
@@ -42,18 +44,21 @@ class OutlookExtractor {
private final POIFSChunkParser parser;
- private final AutoDetectParser attachmentParser;
+ private final ParseContext context;
- public OutlookExtractor(POIFSFileSystem filesystem) throws TikaException {
+ public OutlookExtractor(
+ POIFSFileSystem filesystem, ParseContext context)
+ throws TikaException {
try {
+ this.context = context;
this.parser = new POIFSChunkParser(filesystem);
- this.attachmentParser = new AutoDetectParser();
this.chunks = parser.identifyChunks();
} catch (IOException e) {
throw new TikaException("Failed to parse Outlook chunks", e);
}
}
+ @SuppressWarnings("unchecked")
public void parse(XHTMLContentHandler xhtml, Metadata metadata)
throws TikaException, SAXException {
String subject = getChunk(chunks.subjectChunk);
@@ -85,11 +90,11 @@ class OutlookExtractor {
}
try {
// Use the delegate parser to parse this entry
- attachmentParser.parse(
+ context.get(Parser.class, EmptyParser.INSTANCE).parse(
attachments.get(key),
- new EmbeddedContentHandler(
- new BodyContentHandler(xhtml)),
- entrydata);
+ new EmbeddedContentHandler(new BodyContentHandler(xhtml)),
+ entrydata,
+ context);
} catch (Exception e) {
// Could not parse the entry, just skip the content
}