You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2016/07/20 17:15:31 UTC

tika git commit: TIKA-2037 RFC822Parser should wrap the James InputStream of embedded resources to avoid problems with downstream detection or extraction

Repository: tika
Updated Branches:
  refs/heads/master 3ecdc0cb0 -> 952fb54ed


TIKA-2037 RFC822Parser should wrap the James InputStream of embedded resources to avoid problems with downstream detection or extraction


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/952fb54e
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/952fb54e
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/952fb54e

Branch: refs/heads/master
Commit: 952fb54ed78a2fba07db4653cc674f5641211031
Parents: 3ecdc0c
Author: Nick Burch <ni...@gagravarr.org>
Authored: Wed Jul 20 18:15:25 2016 +0100
Committer: Nick Burch <ni...@gagravarr.org>
Committed: Wed Jul 20 18:15:25 2016 +0100

----------------------------------------------------------------------
 CHANGES.txt                                     |  3 +
 .../tika/parser/mail/MailContentHandler.java    | 13 ++--
 .../tika/parser/mail/RFC822ParserTest.java      | 68 +++++++++++++++++++-
 .../apache/tika/parser/mbox/MboxParserTest.java |  1 -
 4 files changed, 77 insertions(+), 8 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/952fb54e/CHANGES.txt
----------------------------------------------------------------------
diff --git a/CHANGES.txt b/CHANGES.txt
index f6191b4..6ba831f 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,8 @@
 Release 1.14 - ???
 
+  * Avoid mark/reset issues when extracting or detecting embedded resources
+    in RFC822 emails (TIKA-2037).
+
   * Improving accuracy of Tesseract for better extraction of numeric 
     and alphanumeric text from images (TIKA-2021).
 

http://git-wip-us.apache.org/repos/asf/tika/blob/952fb54e/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java b/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
index 8d16961..6a9bc1b 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
@@ -16,6 +16,9 @@
  */
 package org.apache.tika.parser.mail;
 
+import static org.apache.tika.utils.DateUtils.MIDDAY;
+import static org.apache.tika.utils.DateUtils.UTC;
+
 import java.io.IOException;
 import java.io.InputStream;
 import java.text.DateFormat;
@@ -44,9 +47,7 @@ import org.apache.james.mime4j.field.LenientFieldParser;
 import org.apache.james.mime4j.parser.ContentHandler;
 import org.apache.james.mime4j.stream.BodyDescriptor;
 import org.apache.james.mime4j.stream.Field;
-import org.apache.james.mime4j.util.ByteSequence;
 import org.apache.tika.config.TikaConfig;
-import org.apache.tika.detect.AutoDetectReader;
 import org.apache.tika.extractor.EmbeddedDocumentExtractor;
 import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
 import org.apache.tika.io.TikaInputStream;
@@ -58,9 +59,6 @@ import org.apache.tika.parser.Parser;
 import org.apache.tika.sax.XHTMLContentHandler;
 import org.xml.sax.SAXException;
 
-import static org.apache.tika.utils.DateUtils.MIDDAY;
-import static org.apache.tika.utils.DateUtils.UTC;
-
 /**
  * Bridge between mime4j's content handler and the generic Sax content handler
  * used by Tika. See
@@ -179,7 +177,10 @@ class MailContentHandler implements ContentHandler {
 
         try {
             if (extractor.shouldParseEmbedded(submd)) {
-                extractor.parseEmbedded(is, handler, submd, false);
+                // Wrap the InputStream before passing it on, as the James-provided
+                //  one lacks features we might want, e.g. mark/reset
+                TikaInputStream tis = TikaInputStream.get(is);
+                extractor.parseEmbedded(tis, handler, submd, false);
             }
         } catch (SAXException e) {
             throw new MimeException(e);

http://git-wip-us.apache.org/repos/asf/tika/blob/952fb54e/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java
index 0d3a2c5..c7fcbfb 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java
@@ -31,24 +31,30 @@ import static org.mockito.Mockito.times;
 import static org.mockito.Mockito.verify;
 
 import java.io.ByteArrayInputStream;
+import java.io.IOException;
 import java.io.InputStream;
 import java.nio.charset.StandardCharsets;
 import java.text.DateFormat;
 import java.text.DateFormatSymbols;
 import java.text.SimpleDateFormat;
+import java.util.ArrayList;
 import java.util.Date;
+import java.util.List;
 import java.util.Locale;
-import java.util.TimeZone;
 
 import org.apache.james.mime4j.stream.MimeConfig;
 import org.apache.tika.TikaTest;
+import org.apache.tika.detect.DefaultDetector;
+import org.apache.tika.detect.Detector;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.extractor.ContainerExtractor;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
 import org.apache.tika.extractor.ParserContainerExtractor;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AutoDetectParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.parser.PasswordProvider;
@@ -58,6 +64,7 @@ import org.apache.tika.sax.XHTMLContentHandler;
 import org.junit.Test;
 import org.xml.sax.Attributes;
 import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
 import org.xml.sax.helpers.DefaultHandler;
 
 public class RFC822ParserTest extends TikaTest {
@@ -496,4 +503,63 @@ public class RFC822ParserTest extends TikaTest {
         assertEquals("I Urge You to Require Notice of Mercury", m.get(TikaCoreProperties.TITLE));
     }
 
+
+    @Test
+    public void testExtractAttachments() throws Exception {
+        ContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+        Parser p = new RFC822Parser();
+        ParseContext context = new ParseContext();
+
+        try (InputStream stream = getStream("test-documents/testEmailWithPNGAtt.eml")) {
+            p.parse(stream, handler, metadata, context);
+        }
+
+        // Check we got the metadata
+        assertEquals("Tika Test <XX...@apache.org>", metadata.get(Metadata.MESSAGE_FROM));
+        assertEquals("Test Attachment Email", metadata.get(TikaCoreProperties.TITLE));
+        
+        // Try again with attachment detecting and fetching
+        final Detector detector = new DefaultDetector();
+        final Parser extParser = new AutoDetectParser();
+        final List<MediaType> seenTypes = new ArrayList<MediaType>();
+        final List<String> seenText = new ArrayList<String>();
+        EmbeddedDocumentExtractor ext = new EmbeddedDocumentExtractor() {
+            @Override
+            public boolean shouldParseEmbedded(Metadata metadata) {
+                return true;
+            }
+            
+            @Override
+            public void parseEmbedded(InputStream stream, ContentHandler handler,
+                    Metadata metadata, boolean outputHtml) throws SAXException,
+                    IOException {
+                seenTypes.add( detector.detect(stream, metadata) );
+                
+                ContentHandler h = new BodyContentHandler();
+                try {
+                    extParser.parse(stream, h, metadata, new ParseContext());
+                } catch (TikaException e) {
+                    throw new RuntimeException(e);
+                }
+                seenText.add(h.toString());
+            }
+        };
+        context.set(EmbeddedDocumentExtractor.class, ext);
+
+        try (InputStream stream = getStream("test-documents/testEmailWithPNGAtt.eml")) {
+            p.parse(stream, handler, metadata, context);
+        }
+        
+        // Check we got the metadata
+        assertEquals("Tika Test <XX...@apache.org>", metadata.get(Metadata.MESSAGE_FROM));
+        assertEquals("Test Attachment Email", metadata.get(TikaCoreProperties.TITLE));
+        
+        // Check attachments
+        assertEquals(2, seenTypes.size());
+        assertEquals(2, seenText.size());
+        assertEquals("text/plain", seenTypes.get(0).toString());
+        assertEquals("image/png", seenTypes.get(1).toString());
+        assertEquals("This email has a PNG attachment included in it\n\n", seenText.get(0));
+    }
 }

http://git-wip-us.apache.org/repos/asf/tika/blob/952fb54e/tika-parsers/src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java
index 6ef803d..94c4e70 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java
@@ -152,5 +152,4 @@ public class MboxParserTest {
 
         assertContains("When a Mapper completes", handler.toString());
     }
-
 }