You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2015/01/16 01:26:25 UTC

svn commit: r1652319 - in /tika/trunk/tika-parsers/src: main/java/org/apache/tika/parser/mail/MailContentHandler.java test/java/org/apache/tika/parser/mail/RFC822ParserTest.java

Author: nick
Date: Fri Jan 16 00:26:24 2015
New Revision: 1652319

URL: http://svn.apache.org/r1652319
Log:
TIKA-1222 For RFC822 mails, start to prefer an EmbeddedDocumentExtractor over a Parser for handling embedded resources, but fall back to using the Parser when no extractor is supplied, for backwards compatibility

Modified:
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java?rev=1652319&r1=1652318&r2=1652319&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java Fri Jan 16 00:26:24 2015
@@ -38,6 +38,7 @@ import org.apache.james.mime4j.stream.Fi
 import org.apache.tika.config.TikaConfig;
 import org.apache.tika.exception.EncryptedDocumentException;
 import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.parser.AutoDetectParser;
@@ -73,21 +74,29 @@ class MailContentHandler implements Cont
 
     public void body(BodyDescriptor body, InputStream is) throws MimeException,
             IOException {
-        // Work out the best underlying parser for the part
-        // Check first for a specified AutoDetectParser (which may have a
-        //  specific Config), then a recursing parser, and finally the default
-        Parser parser = context.get(AutoDetectParser.class);
-        if (parser == null) {
-           parser = context.get(Parser.class);
-        }
-        if (parser == null) {
-           if (tikaConfig == null) {
-              tikaConfig = context.get(TikaConfig.class);
-              if (tikaConfig == null) {
-                 tikaConfig = TikaConfig.getDefaultConfig();
-              }
-           }
-           parser = tikaConfig.getParser();
+        // Was an EmbeddedDocumentExtractor given to explicitly handle/process
+        //  the attachments in the file?
+        EmbeddedDocumentExtractor ex = context.get(EmbeddedDocumentExtractor.class);
+        
+        // If there's no EmbeddedDocumentExtractor, then try using a normal parser
+        // This will ensure that the contents are made available to the user, so
+        //  they see the text, but without fine-grained control/extraction
+        Parser parser = null;
+        if (ex == null) {
+            // If the user gave a parser, use that; if not, use the default
+            parser = context.get(AutoDetectParser.class);
+            if (parser == null) {
+               parser = context.get(Parser.class);
+            }
+            if (parser == null) {
+               if (tikaConfig == null) {
+                  tikaConfig = context.get(TikaConfig.class);
+                  if (tikaConfig == null) {
+                     tikaConfig = TikaConfig.getDefaultConfig();
+                  }
+               }
+               parser = tikaConfig.getParser();
+            }
         }
 
         // use a different metadata object
@@ -100,7 +109,12 @@ class MailContentHandler implements Cont
 
         try {
             BodyContentHandler bch = new BodyContentHandler(handler);
-            parser.parse(is, new EmbeddedContentHandler(bch), submd, context);
+            if (ex != null) {
+                if (ex.shouldParseEmbedded(submd))
+                    ex.parseEmbedded(is, bch, submd, false);
+            } else {
+                parser.parse(is, new EmbeddedContentHandler(bch), submd, context);
+            }
         } catch (EncryptedDocumentException ede) {
             // Skip this encrypted attachment and continue
         } catch (SAXException e) {

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java?rev=1652319&r1=1652318&r2=1652319&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java Fri Jan 16 00:26:24 2015
@@ -283,6 +283,15 @@ public class RFC822ParserTest {
         // Check that we also get the zip's contents as well
         // TODO
     }
+    
+    /**
+     * TIKA-1222 When requested, ensure that the various attachments of
+     *  the mail come through properly as embedded resources
+     */
+    @Test
+    public void testGetAttachmentsAsEmbeddedResources() throws Exception {
+        // TODO
+    }
 
     private static InputStream getStream(String name) {
         InputStream stream = Thread.currentThread().getContextClassLoader()