You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2016/07/26 11:18:10 UTC

[2/5] tika git commit: TIKA-2037 RFC822Parser should wrap the James InputStream of embedded resources to avoid problems with downstream detection or extraction

TIKA-2037 RFC822Parser should wrap the James InputStream of embedded resources to avoid problems with downstream detection or extraction


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/31374a39
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/31374a39
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/31374a39

Branch: refs/heads/2.x
Commit: 31374a39bae03bfc260f73662c133467637193f1
Parents: d6ce10b
Author: Nick Burch <ni...@gagravarr.org>
Authored: Wed Jul 20 18:15:25 2016 +0100
Committer: Nick Burch <ni...@gagravarr.org>
Committed: Tue Jul 26 12:05:47 2016 +0100

----------------------------------------------------------------------
 CHANGES.txt                                     |  3 +
 .../apache/tika/parser/mbox/MboxParserTest.java |  1 -
 .../tika/parser/mail/MailContentHandler.java    | 11 ++--
 .../tika/parser/mail/RFC822ParserTest.java      | 68 +++++++++++++++++++-
 4 files changed, 77 insertions(+), 6 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/31374a39/CHANGES.txt
----------------------------------------------------------------------
diff --git a/CHANGES.txt b/CHANGES.txt
index da9353d..a4fca50 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -22,6 +22,9 @@ Release 1.14 - ???
   * Maintain more significant digits in cells of "General" format
     in XLS and XLSX (TIKA-2025).
 
+  * Avoid mark/reset issues when extracting or detecting embedded resources
+    in RFC822 emails (TIKA-2037).
+
   * Improve extraction of embedded documents for PPT, PPTX and XLSX
     (TIKA-2026).
 

http://git-wip-us.apache.org/repos/asf/tika/blob/31374a39/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java
index 6ef803d..94c4e70 100644
--- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java
+++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java
@@ -152,5 +152,4 @@ public class MboxParserTest {
 
         assertContains("When a Mapper completes", handler.toString());
     }
-
 }

http://git-wip-us.apache.org/repos/asf/tika/blob/31374a39/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java b/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
index 2c8942e..9c16c8c 100644
--- a/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
+++ b/tika-parser-modules/tika-parser-web-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
@@ -16,6 +16,9 @@
  */
 package org.apache.tika.parser.mail;
 
+import static org.apache.tika.utils.DateUtils.MIDDAY;
+import static org.apache.tika.utils.DateUtils.UTC;
+
 import java.io.IOException;
 import java.io.InputStream;
 import java.text.DateFormat;
@@ -55,9 +58,6 @@ import org.apache.tika.parser.Parser;
 import org.apache.tika.sax.XHTMLContentHandler;
 import org.xml.sax.SAXException;
 
-import static org.apache.tika.utils.DateUtils.MIDDAY;
-import static org.apache.tika.utils.DateUtils.UTC;
-
 /**
  * Bridge between mime4j's content handler and the generic Sax content handler
  * used by Tika. See
@@ -176,7 +176,10 @@ class MailContentHandler implements ContentHandler {
 
         try {
             if (extractor.shouldParseEmbedded(submd)) {
-                extractor.parseEmbedded(is, handler, submd, false);
+                // Wrap the InputStream before passing it on, as the James-provided
+                //  one lacks many features we might want, e.g. mark/reset
+                TikaInputStream tis = TikaInputStream.get(is);
+                extractor.parseEmbedded(tis, handler, submd, false);
             }
         } catch (SAXException e) {
             throw new MimeException(e);

http://git-wip-us.apache.org/repos/asf/tika/blob/31374a39/tika-parser-modules/tika-parser-web-module/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-web-module/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java b/tika-parser-modules/tika-parser-web-module/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java
index ee9a98b..3be1edd 100644
--- a/tika-parser-modules/tika-parser-web-module/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java
+++ b/tika-parser-modules/tika-parser-web-module/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java
@@ -27,24 +27,30 @@ import static org.mockito.Mockito.times;
 import static org.mockito.Mockito.verify;
 
 import java.io.ByteArrayInputStream;
-import java.io.File;
+import java.io.IOException;
 import java.io.InputStream;
 import java.nio.charset.StandardCharsets;
 import java.text.DateFormat;
 import java.text.DateFormatSymbols;
 import java.text.SimpleDateFormat;
+import java.util.ArrayList;
 import java.util.Date;
+import java.util.List;
 import java.util.Locale;
 
 import org.apache.james.mime4j.stream.MimeConfig;
 import org.apache.tika.TikaTest;
+import org.apache.tika.detect.DefaultDetector;
+import org.apache.tika.detect.Detector;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.extractor.ContainerExtractor;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
 import org.apache.tika.extractor.ParserContainerExtractor;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AutoDetectParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.parser.PasswordProvider;
@@ -53,6 +59,7 @@ import org.apache.tika.sax.XHTMLContentHandler;
 import org.junit.Test;
 import org.xml.sax.Attributes;
 import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
 import org.xml.sax.helpers.DefaultHandler;
 
 public class RFC822ParserTest extends TikaTest {
@@ -482,4 +489,63 @@ public class RFC822ParserTest extends TikaTest {
         p.parse(TikaInputStream.get(s.getBytes(StandardCharsets.UTF_8)), new DefaultHandler(), m, new ParseContext());
         assertEquals("I Urge You to Require Notice of Mercury", m.get(TikaCoreProperties.TITLE));
     }
+
+    @Test
+    public void testExtractAttachments() throws Exception {
+        ContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+        Parser p = new RFC822Parser();
+        ParseContext context = new ParseContext();
+
+        try (InputStream stream = getStream("test-documents/testEmailWithPNGAtt.eml")) {
+            p.parse(stream, handler, metadata, context);
+        }
+
+        // Check we got the metadata
+        assertEquals("Tika Test <XX...@apache.org>", metadata.get(Metadata.MESSAGE_FROM));
+        assertEquals("Test Attachment Email", metadata.get(TikaCoreProperties.TITLE));
+        
+        // Try again with attachment detecting and fetching
+        final Detector detector = new DefaultDetector();
+        final Parser extParser = new AutoDetectParser();
+        final List<MediaType> seenTypes = new ArrayList<MediaType>();
+        final List<String> seenText = new ArrayList<String>();
+        EmbeddedDocumentExtractor ext = new EmbeddedDocumentExtractor() {
+            @Override
+            public boolean shouldParseEmbedded(Metadata metadata) {
+                return true;
+            }
+            
+            @Override
+            public void parseEmbedded(InputStream stream, ContentHandler handler,
+                    Metadata metadata, boolean outputHtml) throws SAXException,
+                    IOException {
+                seenTypes.add( detector.detect(stream, metadata) );
+                
+                ContentHandler h = new BodyContentHandler();
+                try {
+                    extParser.parse(stream, h, metadata, new ParseContext());
+                } catch (TikaException e) {
+                    throw new RuntimeException(e);
+                }
+                seenText.add(h.toString());
+            }
+        };
+        context.set(EmbeddedDocumentExtractor.class, ext);
+
+        try (InputStream stream = getStream("test-documents/testEmailWithPNGAtt.eml")) {
+            p.parse(stream, handler, metadata, context);
+        }
+        
+        // Check we got the metadata
+        assertEquals("Tika Test <XX...@apache.org>", metadata.get(Metadata.MESSAGE_FROM));
+        assertEquals("Test Attachment Email", metadata.get(TikaCoreProperties.TITLE));
+        
+        // Check attachments
+        assertEquals(2, seenTypes.size());
+        assertEquals(2, seenText.size());
+        assertEquals("text/plain", seenTypes.get(0).toString());
+        assertEquals("image/png", seenTypes.get(1).toString());
+        assertEquals("This email has a PNG attachment included in it\n\n", seenText.get(0));
+    }
 }