You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2015/01/18 23:44:17 UTC
svn commit: r1652866 - in /tika/trunk/tika-parsers/src:
main/java/org/apache/tika/parser/mail/MailContentHandler.java
test/java/org/apache/tika/parser/mail/RFC822ParserTest.java
test/resources/test-documents/testRFC822_normal_zip
Author: nick
Date: Sun Jan 18 22:44:17 2015
New Revision: 1652866
URL: http://svn.apache.org/r1652866
Log:
TIKA-1028 Refactor the RFC822 parser to set up recursion once per file, not once per attachment, and get it so that a non-encrypted zip attachment is correctly extracted. (Commons Compress currently lacks password protected zip support.)
Added:
tika/trunk/tika-parsers/src/test/resources/test-documents/testRFC822_normal_zip
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java?rev=1652866&r1=1652865&r2=1652866&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java Sun Jan 18 22:44:17 2015
@@ -56,49 +56,47 @@ class MailContentHandler implements Cont
private boolean strictParsing = false;
private XHTMLContentHandler handler;
- private ParseContext context;
private Metadata metadata;
- private TikaConfig tikaConfig = null;
+ private EmbeddedDocumentExtractor extractor;
private boolean inPart = false;
MailContentHandler(XHTMLContentHandler xhtml, Metadata metadata, ParseContext context, boolean strictParsing) {
this.handler = xhtml;
- this.context = context;
this.metadata = metadata;
this.strictParsing = strictParsing;
- }
-
- public void body(BodyDescriptor body, InputStream is) throws MimeException,
- IOException {
- // Was an EmbeddedDocumentExtractor given to explicitly handle/process
- // the attachments in the file?
- EmbeddedDocumentExtractor ex = context.get(EmbeddedDocumentExtractor.class);
+
+ // Fetch / Build an EmbeddedDocumentExtractor with which
+ // to handle/process the parts/attachments
+
+ // Was an EmbeddedDocumentExtractor explicitly supplied?
+ this.extractor = context.get(EmbeddedDocumentExtractor.class);
// If there's no EmbeddedDocumentExtractor, then try using a normal parser
// This will ensure that the contents are made available to the user, so
// they see the text, but without fine-grained control/extraction
// (This also maintains backward compatibility with older versions!)
- if (ex == null) {
+ if (this.extractor == null) {
// If the user gave a parser, use that, if not the default
Parser parser = context.get(AutoDetectParser.class);
if (parser == null) {
parser = context.get(Parser.class);
}
if (parser == null) {
- if (tikaConfig == null) {
- tikaConfig = context.get(TikaConfig.class);
- if (tikaConfig == null) {
- tikaConfig = TikaConfig.getDefaultConfig();
- }
- }
- parser = tikaConfig.getParser();
+ TikaConfig tikaConfig = context.get(TikaConfig.class);
+ if (tikaConfig == null) {
+ tikaConfig = TikaConfig.getDefaultConfig();
+ }
+ parser = new AutoDetectParser(tikaConfig.getParser());
}
ParseContext ctx = new ParseContext();
ctx.set(Parser.class, parser);
- ex = new ParsingEmbeddedDocumentExtractor(ctx);
+ extractor = new ParsingEmbeddedDocumentExtractor(ctx);
}
+ }
+ public void body(BodyDescriptor body, InputStream is) throws MimeException,
+ IOException {
// use a different metadata object
// in order to specify the mime type of the
// sub part without damaging the main metadata
@@ -108,8 +106,8 @@ class MailContentHandler implements Cont
submd.set(Metadata.CONTENT_ENCODING, body.getCharset());
try {
- if (ex.shouldParseEmbedded(submd)) {
- ex.parseEmbedded(is, handler, submd, false);
+ if (extractor.shouldParseEmbedded(submd)) {
+ extractor.parseEmbedded(is, handler, submd, false);
}
} catch (SAXException e) {
throw new MimeException(e);
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java?rev=1652866&r1=1652865&r2=1652866&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java Sun Jan 18 22:44:17 2015
@@ -21,6 +21,7 @@ import static org.junit.Assert.assertFal
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;
+import static org.junit.Assume.assumeTrue;
import static org.mockito.Matchers.any;
import static org.mockito.Matchers.eq;
import static org.mockito.Mockito.mock;
@@ -285,9 +286,8 @@ public class RFC822ParserTest extends Ti
assertContains("This is the Plain Text part", handler.toString());
assertContains("This is the HTML part", handler.toString());
- // But not the contents of the zip file
- // TODO Should the filename of the encrypted file in the zip show up or not?
- //assertNotContained("text.txt", handler.toString());
+ // We won't get the contents of the zip file, but we will get the name
+ assertContains("text.txt", handler.toString());
assertNotContained("ENCRYPTED ZIP FILES", handler.toString());
// Try again, this time with the password supplied
@@ -309,13 +309,42 @@ public class RFC822ParserTest extends Ti
// We do get the name of the file in the encrypted zip file
assertContains("text.txt", handler.toString());
- // But because the RFC822 parser only recurses once, we don't
- // get the contents of the text file inside the zip file
- // TODO Is this correct? Should we see the contents of the encrypted
- // zip when a password is given, or not?
- assertNotContained("TEST DATA FOR TIKA.", handler.toString());
- assertNotContained("ENCRYPTED ZIP FILES", handler.toString());
- assertNotContained("TIKA-1028", handler.toString());
+ // TODO Upgrade to a version of Commons Compress with Encryption
+ // support, then verify we get the contents of the text file
+ // held within the encrypted zip
+ assumeTrue(false); // No Zip Encryption support yet
+ assertContains("TEST DATA FOR TIKA.", handler.toString());
+ assertContains("ENCRYPTED ZIP FILES", handler.toString());
+ assertContains("TIKA-1028", handler.toString());
+ }
+
+ /**
+ * Test TIKA-1028 - Ensure we can get the contents of an
+ * un-encrypted zip file
+ */
+ @Test
+ public void testNormalZipAttachment() throws Exception {
+ Parser parser = new RFC822Parser();
+ Metadata metadata = new Metadata();
+ ParseContext context = new ParseContext();
+ InputStream stream = getStream("test-documents/testRFC822_normal_zip");
+ ContentHandler handler = new BodyContentHandler();
+ parser.parse(stream, handler, metadata, context);
+
+ // Check we got the metadata
+ assertEquals("Juha Haaga <ju...@gmail.com>", metadata.get(Metadata.MESSAGE_FROM));
+ assertEquals("Test mail for Tika", metadata.get(TikaCoreProperties.TITLE));
+
+ // Check we got the message text, for both Plain Text and HTML
+ assertContains("Includes a normal, unencrypted zip file", handler.toString());
+ assertContains("This is the Plain Text part", handler.toString());
+ assertContains("This is the HTML part", handler.toString());
+
+ // We get both name and contents of the zip file's contents
+ assertContains("text.txt", handler.toString());
+ assertContains("TEST DATA FOR TIKA.", handler.toString());
+ assertContains("This is text inside an unencrypted zip file", handler.toString());
+ assertContains("TIKA-1028", handler.toString());
}
/**
Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testRFC822_normal_zip
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testRFC822_normal_zip?rev=1652866&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/test/resources/test-documents/testRFC822_normal_zip (added)
+++ tika/trunk/tika-parsers/src/test/resources/test-documents/testRFC822_normal_zip Sun Jan 18 22:44:17 2015
@@ -0,0 +1,61 @@
+Return-Path: <ju...@gmail.com>
+X-Spam-Checker-Version: SpamAssassin 3.3.1 (2010-03-16) on
+ virt0003.codenomicon.com
+X-Spam-Level:
+X-Spam-Status: No, score=-2.7 required=5.0 tests=BAYES_00,DKIM_SIGNED,
+ DKIM_VALID,DKIM_VALID_AU,FREEMAIL_FROM,HTML_MESSAGE,RCVD_IN_DNSWL_LOW,
+ SPF_PASS autolearn=ham version=3.3.1
+Received: from mail-wg0-f48.google.com (mail-wg0-f48.google.com [74.125.82.48])
+ by codenomicon.com (8.14.4/8.14.4) with ESMTP id t0G7ZmGs002981
+ (version=TLSv1/SSLv3 cipher=DHE-RSA-AES256-SHA bits=256 verify=OK)
+ for <ju...@codenomicon.com>; Fri, 16 Jan 2015 07:35:54 GMT
+Received: by mail-wg0-f48.google.com with SMTP id l2so19028230wgh.7
+ for <ju...@codenomicon.com>; Thu, 15 Jan 2015 23:35:48 -0800 (PST)
+DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed;
+ d=gmail.com; s=20120113;
+ h=mime-version:from:date:message-id:subject:to:content-type;
+ bh=wjcer9ESeBUN8rAEahqeDiOHf3wqBHgaeAboZtkw8qM=;
+ b=S3ezJU84qISP1OzMoH+wLMxn7y1JkcJwlUs4Uvfy+QlaSFcDcG66oxqsxniQ4kWmCC
+ QdQ94iztFvcvTAHuJys+jAH1UeVQKgs6T5lINj73nww3CKGh1B78LpnCRkFS93o19Zvt
+ QDSDtB23y9FlLF/dH6okvTIq7jQXNPuaDDqY8yJtp+DcYfW+QiNIGI83QievgQlWMRiV
+ fHuCbeEofTRP/82vHxUDVoZo/hwx8OAjWqPitrCmxU7Mly8lG5No1CHsKWmWd2Q+yxN3
+ tC3Ptbrig720BdBZKYwWSI6xBS4AY+46+utaloq9Hr0qpaDf5e9eXqq0ef0efDgd2kJT
+ gmyw==
+X-Received: by 10.180.39.204 with SMTP id r12mr3350467wik.11.1421393748083;
+ Thu, 15 Jan 2015 23:35:48 -0800 (PST)
+MIME-Version: 1.0
+From: Juha Haaga <ju...@gmail.com>
+Date: Fri, 16 Jan 2015 07:35:46 +0000
+Message-ID: <CA...@mail.gmail.com>
+Subject: Test mail for Tika
+To: Juha Haaga <ju...@codenomicon.com>
+Content-Type: multipart/mixed; boundary=001a11c3649c712d2a050cc0050f
+
+--001a11c3649c712d2a050cc0050f
+Content-Type: multipart/alternative; boundary=001a11c3649c712d27050cc0050d
+
+--001a11c3649c712d27050cc0050d
+Content-Type: text/plain; charset=UTF-8
+
+Includes a normal, unencrypted zip file as attachment.
+This is the Plain Text part
+
+--001a11c3649c712d27050cc0050d
+Content-Type: text/html; charset=UTF-8
+
+Includes "normal", unencrypted zip file as attachment.<br />
+This is the HTML part
+
+--001a11c3649c712d27050cc0050d--
+--001a11c3649c712d2a050cc0050f
+Content-Type: application/zip; name="test.zip"
+Content-Disposition: attachment; filename="test.zip"
+Content-Transfer-Encoding: base64
+X-Attachment-Id: 14af1ab4e8dbb946bc5
+
+UEsDBBQAAgAIAKt+MEYEXs11bwAAAHgAAAAIABwAdGV4dC50eHRVVAkAA/IzuVSNvrhUdXgLAAEE
+6AMAAAToAwAADcxBCsJADEbh/Zzi9wCKduV2QIXiQrC5QNuJNFqmQ5MB6+kNvOXHo2tHuESKuD2e
+oPYeDyHQJArP+GuQrJIYfUbNnMd1K8YJPyl4ycyuW4NOS50TBkZZl5FVXQwbSD493lXNaeadj/2/
+Px2bc/gDUEsBAh4DFAACAAgAq34wRgRezXVvAAAAeAAAAAgAGAAAAAAAAQAAAKSBAAAAAHRleHQu
+dHh0VVQFAAPyM7lUdXgLAAEE6AMAAAToAwAAUEsFBgAAAAABAAEATgAAALEAAAAAAA==
+--001a11c3649c712d2a050cc0050f--