You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2015/01/16 01:26:25 UTC
svn commit: r1652319 - in /tika/trunk/tika-parsers/src:
main/java/org/apache/tika/parser/mail/MailContentHandler.java
test/java/org/apache/tika/parser/mail/RFC822ParserTest.java
Author: nick
Date: Fri Jan 16 00:26:24 2015
New Revision: 1652319
URL: http://svn.apache.org/r1652319
Log:
TIKA-1222 For RFC822 mails, start to prefer an EmbeddedDocumentExtractor over a Parser for handling embedded resources, but keep using the Parser when no extractor is supplied, for backwards compatibility
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java?rev=1652319&r1=1652318&r2=1652319&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java Fri Jan 16 00:26:24 2015
@@ -38,6 +38,7 @@ import org.apache.james.mime4j.stream.Fi
import org.apache.tika.config.TikaConfig;
import org.apache.tika.exception.EncryptedDocumentException;
import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.AutoDetectParser;
@@ -73,21 +74,29 @@ class MailContentHandler implements Cont
public void body(BodyDescriptor body, InputStream is) throws MimeException,
IOException {
- // Work out the best underlying parser for the part
- // Check first for a specified AutoDetectParser (which may have a
- // specific Config), then a recursing parser, and finally the default
- Parser parser = context.get(AutoDetectParser.class);
- if (parser == null) {
- parser = context.get(Parser.class);
- }
- if (parser == null) {
- if (tikaConfig == null) {
- tikaConfig = context.get(TikaConfig.class);
- if (tikaConfig == null) {
- tikaConfig = TikaConfig.getDefaultConfig();
- }
- }
- parser = tikaConfig.getParser();
+ // Was an EmbeddedDocumentExtractor given to explicitly handle/process
+ // the attachments in the file?
+ EmbeddedDocumentExtractor ex = context.get(EmbeddedDocumentExtractor.class);
+
+ // If there's no EmbeddedDocumentExtractor, then try using a normal parser
+ // This will ensure that the contents are made available to the user, so
+ // they see the text, but without fine-grained control/extraction
+ Parser parser = null;
+ if (ex == null) {
+ // If the user gave a parser, use that, if not the default
+ parser = context.get(AutoDetectParser.class);
+ if (parser == null) {
+ parser = context.get(Parser.class);
+ }
+ if (parser == null) {
+ if (tikaConfig == null) {
+ tikaConfig = context.get(TikaConfig.class);
+ if (tikaConfig == null) {
+ tikaConfig = TikaConfig.getDefaultConfig();
+ }
+ }
+ parser = tikaConfig.getParser();
+ }
}
// use a different metadata object
@@ -100,7 +109,12 @@ class MailContentHandler implements Cont
try {
BodyContentHandler bch = new BodyContentHandler(handler);
- parser.parse(is, new EmbeddedContentHandler(bch), submd, context);
+ if (ex != null) {
+ if (ex.shouldParseEmbedded(submd))
+ ex.parseEmbedded(is, bch, submd, false);
+ } else {
+ parser.parse(is, new EmbeddedContentHandler(bch), submd, context);
+ }
} catch (EncryptedDocumentException ede) {
// Skip this encrypted attachment and continue
} catch (SAXException e) {
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java?rev=1652319&r1=1652318&r2=1652319&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java Fri Jan 16 00:26:24 2015
@@ -283,6 +283,15 @@ public class RFC822ParserTest {
// Check that we also get the zip's contents as well
// TODO
}
+
+ /**
+ * TIKA-1222 When requested, ensure that the various attachments of
+ * the mail come through properly as embedded resources
+ */
+ @Test
+ public void testGetAttachmentsAsEmbeddedResources() throws Exception {
+ // TODO
+ }
private static InputStream getStream(String name) {
InputStream stream = Thread.currentThread().getContextClassLoader()