You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2016/07/20 17:15:31 UTC
tika git commit: TIKA-2037 RFC822Parser should wrap the James
InputStream of embedded resources to avoid problems with downstream detection
or extraction
Repository: tika
Updated Branches:
refs/heads/master 3ecdc0cb0 -> 952fb54ed
TIKA-2037 RFC822Parser should wrap the James InputStream of embedded resources to avoid problems with downstream detection or extraction
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/952fb54e
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/952fb54e
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/952fb54e
Branch: refs/heads/master
Commit: 952fb54ed78a2fba07db4653cc674f5641211031
Parents: 3ecdc0c
Author: Nick Burch <ni...@gagravarr.org>
Authored: Wed Jul 20 18:15:25 2016 +0100
Committer: Nick Burch <ni...@gagravarr.org>
Committed: Wed Jul 20 18:15:25 2016 +0100
----------------------------------------------------------------------
CHANGES.txt | 3 +
.../tika/parser/mail/MailContentHandler.java | 13 ++--
.../tika/parser/mail/RFC822ParserTest.java | 68 +++++++++++++++++++-
.../apache/tika/parser/mbox/MboxParserTest.java | 1 -
4 files changed, 77 insertions(+), 8 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/952fb54e/CHANGES.txt
----------------------------------------------------------------------
diff --git a/CHANGES.txt b/CHANGES.txt
index f6191b4..6ba831f 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,8 @@
Release 1.14 - ???
+ * Avoid mark/reset issues when extracting or detecting embedded resources
+ in RFC822 emails (TIKA-2037).
+
* Improving accuracy of Tesseract for better extraction of numeric
and alphanumeric text from images (TIKA-2021).
http://git-wip-us.apache.org/repos/asf/tika/blob/952fb54e/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java b/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
index 8d16961..6a9bc1b 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
@@ -16,6 +16,9 @@
*/
package org.apache.tika.parser.mail;
+import static org.apache.tika.utils.DateUtils.MIDDAY;
+import static org.apache.tika.utils.DateUtils.UTC;
+
import java.io.IOException;
import java.io.InputStream;
import java.text.DateFormat;
@@ -44,9 +47,7 @@ import org.apache.james.mime4j.field.LenientFieldParser;
import org.apache.james.mime4j.parser.ContentHandler;
import org.apache.james.mime4j.stream.BodyDescriptor;
import org.apache.james.mime4j.stream.Field;
-import org.apache.james.mime4j.util.ByteSequence;
import org.apache.tika.config.TikaConfig;
-import org.apache.tika.detect.AutoDetectReader;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
import org.apache.tika.io.TikaInputStream;
@@ -58,9 +59,6 @@ import org.apache.tika.parser.Parser;
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.SAXException;
-import static org.apache.tika.utils.DateUtils.MIDDAY;
-import static org.apache.tika.utils.DateUtils.UTC;
-
/**
* Bridge between mime4j's content handler and the generic Sax content handler
* used by Tika. See
@@ -179,7 +177,10 @@ class MailContentHandler implements ContentHandler {
try {
if (extractor.shouldParseEmbedded(submd)) {
- extractor.parseEmbedded(is, handler, submd, false);
+ // Wrap the InputStream before passing it on, as the James-provided
+ // one lacks many features we might want, e.g. mark/reset
+ TikaInputStream tis = TikaInputStream.get(is);
+ extractor.parseEmbedded(tis, handler, submd, false);
}
} catch (SAXException e) {
throw new MimeException(e);
http://git-wip-us.apache.org/repos/asf/tika/blob/952fb54e/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java
index 0d3a2c5..c7fcbfb 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java
@@ -31,24 +31,30 @@ import static org.mockito.Mockito.times;
import static org.mockito.Mockito.verify;
import java.io.ByteArrayInputStream;
+import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.text.DateFormat;
import java.text.DateFormatSymbols;
import java.text.SimpleDateFormat;
+import java.util.ArrayList;
import java.util.Date;
+import java.util.List;
import java.util.Locale;
-import java.util.TimeZone;
import org.apache.james.mime4j.stream.MimeConfig;
import org.apache.tika.TikaTest;
+import org.apache.tika.detect.DefaultDetector;
+import org.apache.tika.detect.Detector;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.ContainerExtractor;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.extractor.ParserContainerExtractor;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.PasswordProvider;
@@ -58,6 +64,7 @@ import org.apache.tika.sax.XHTMLContentHandler;
import org.junit.Test;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
public class RFC822ParserTest extends TikaTest {
@@ -496,4 +503,63 @@ public class RFC822ParserTest extends TikaTest {
assertEquals("I Urge You to Require Notice of Mercury", m.get(TikaCoreProperties.TITLE));
}
+
+ @Test
+ public void testExtractAttachments() throws Exception {
+ ContentHandler handler = new BodyContentHandler();
+ Metadata metadata = new Metadata();
+ Parser p = new RFC822Parser();
+ ParseContext context = new ParseContext();
+
+ try (InputStream stream = getStream("test-documents/testEmailWithPNGAtt.eml")) {
+ p.parse(stream, handler, metadata, context);
+ }
+
+ // Check we got the metadata
+ assertEquals("Tika Test <XX...@apache.org>", metadata.get(Metadata.MESSAGE_FROM));
+ assertEquals("Test Attachment Email", metadata.get(TikaCoreProperties.TITLE));
+
+ // Try again with attachment detecting and fetching
+ final Detector detector = new DefaultDetector();
+ final Parser extParser = new AutoDetectParser();
+ final List<MediaType> seenTypes = new ArrayList<MediaType>();
+ final List<String> seenText = new ArrayList<String>();
+ EmbeddedDocumentExtractor ext = new EmbeddedDocumentExtractor() {
+ @Override
+ public boolean shouldParseEmbedded(Metadata metadata) {
+ return true;
+ }
+
+ @Override
+ public void parseEmbedded(InputStream stream, ContentHandler handler,
+ Metadata metadata, boolean outputHtml) throws SAXException,
+ IOException {
+ seenTypes.add( detector.detect(stream, metadata) );
+
+ ContentHandler h = new BodyContentHandler();
+ try {
+ extParser.parse(stream, h, metadata, new ParseContext());
+ } catch (TikaException e) {
+ throw new RuntimeException(e);
+ }
+ seenText.add(h.toString());
+ }
+ };
+ context.set(EmbeddedDocumentExtractor.class, ext);
+
+ try (InputStream stream = getStream("test-documents/testEmailWithPNGAtt.eml")) {
+ p.parse(stream, handler, metadata, context);
+ }
+
+ // Check we got the metadata
+ assertEquals("Tika Test <XX...@apache.org>", metadata.get(Metadata.MESSAGE_FROM));
+ assertEquals("Test Attachment Email", metadata.get(TikaCoreProperties.TITLE));
+
+ // Check attachments
+ assertEquals(2, seenTypes.size());
+ assertEquals(2, seenText.size());
+ assertEquals("text/plain", seenTypes.get(0).toString());
+ assertEquals("image/png", seenTypes.get(1).toString());
+ assertEquals("This email has a PNG attachment included in it\n\n", seenText.get(0));
+ }
}
http://git-wip-us.apache.org/repos/asf/tika/blob/952fb54e/tika-parsers/src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java
index 6ef803d..94c4e70 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/mbox/MboxParserTest.java
@@ -152,5 +152,4 @@ public class MboxParserTest {
assertContains("When a Mapper completes", handler.toString());
}
-
}