You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2015/01/18 23:44:17 UTC

svn commit: r1652866 - in /tika/trunk/tika-parsers/src: main/java/org/apache/tika/parser/mail/MailContentHandler.java test/java/org/apache/tika/parser/mail/RFC822ParserTest.java test/resources/test-documents/testRFC822_normal_zip

Author: nick
Date: Sun Jan 18 22:44:17 2015
New Revision: 1652866

URL: http://svn.apache.org/r1652866
Log:
TIKA-1028 Refactor the RFC822 parser to set up recursion once per file, not once per attachment, and get it so that a non-encrypted zip attachment is correctly extracted. (Commons Compress currently lacks password protected zip support)

Added:
    tika/trunk/tika-parsers/src/test/resources/test-documents/testRFC822_normal_zip
Modified:
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java?rev=1652866&r1=1652865&r2=1652866&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java Sun Jan 18 22:44:17 2015
@@ -56,49 +56,47 @@ class MailContentHandler implements Cont
     private boolean strictParsing = false;
 
     private XHTMLContentHandler handler;
-    private ParseContext context;
     private Metadata metadata;
-    private TikaConfig tikaConfig = null;
+    private EmbeddedDocumentExtractor extractor;
 
     private boolean inPart = false;
     
     MailContentHandler(XHTMLContentHandler xhtml, Metadata metadata, ParseContext context, boolean strictParsing) {
         this.handler = xhtml;
-        this.context = context;
         this.metadata = metadata;
         this.strictParsing = strictParsing;
-    }
-
-    public void body(BodyDescriptor body, InputStream is) throws MimeException,
-            IOException {
-        // Was an EmbeddedDocumentExtractor given to explicitly handle/process
-        //  the attachments in the file?
-        EmbeddedDocumentExtractor ex = context.get(EmbeddedDocumentExtractor.class);
+        
+        // Fetch / Build an EmbeddedDocumentExtractor with which
+        //  to handle/process the parts/attachments
+        
+        // Was an EmbeddedDocumentExtractor explicitly supplied?
+        this.extractor = context.get(EmbeddedDocumentExtractor.class);
         
         // If there's no EmbeddedDocumentExtractor, then try using a normal parser
         // This will ensure that the contents are made available to the user, so
         //  the see the text, but without fine-grained control/extraction
         // (This also maintains backward compatibility with older versions!)
-        if (ex == null) {
+        if (this.extractor == null) {
             // If the user gave a parser, use that, if not the default
             Parser parser = context.get(AutoDetectParser.class);
             if (parser == null) {
                parser = context.get(Parser.class);
             }
             if (parser == null) {
-               if (tikaConfig == null) {
-                  tikaConfig = context.get(TikaConfig.class);
-                  if (tikaConfig == null) {
-                     tikaConfig = TikaConfig.getDefaultConfig();
-                  }
-               }
-               parser = tikaConfig.getParser();
+                TikaConfig tikaConfig = context.get(TikaConfig.class);
+                if (tikaConfig == null) {
+                    tikaConfig = TikaConfig.getDefaultConfig();
+                }
+                parser = new AutoDetectParser(tikaConfig.getParser());
             }
             ParseContext ctx = new ParseContext();
             ctx.set(Parser.class, parser);
-            ex = new ParsingEmbeddedDocumentExtractor(ctx);
+            extractor = new ParsingEmbeddedDocumentExtractor(ctx);
         }
+    }
 
+    public void body(BodyDescriptor body, InputStream is) throws MimeException,
+            IOException {
         // use a different metadata object
         // in order to specify the mime type of the
         // sub part without damaging the main metadata
@@ -108,8 +106,8 @@ class MailContentHandler implements Cont
         submd.set(Metadata.CONTENT_ENCODING, body.getCharset());
 
         try {
-            if (ex.shouldParseEmbedded(submd)) {
-                ex.parseEmbedded(is, handler, submd, false);
+            if (extractor.shouldParseEmbedded(submd)) {
+                extractor.parseEmbedded(is, handler, submd, false);
             }
         } catch (SAXException e) {
             throw new MimeException(e);

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java?rev=1652866&r1=1652865&r2=1652866&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java Sun Jan 18 22:44:17 2015
@@ -21,6 +21,7 @@ import static org.junit.Assert.assertFal
 import static org.junit.Assert.assertNotNull;
 import static org.junit.Assert.assertTrue;
 import static org.junit.Assert.fail;
+import static org.junit.Assume.assumeTrue;
 import static org.mockito.Matchers.any;
 import static org.mockito.Matchers.eq;
 import static org.mockito.Mockito.mock;
@@ -285,9 +286,8 @@ public class RFC822ParserTest extends Ti
         assertContains("This is the Plain Text part", handler.toString());
         assertContains("This is the HTML part", handler.toString());
         
-        // But not the contents of the zip file
-        // TODO Should the filename of the encrypted file in the zip show up or not?
-        //assertNotContained("text.txt", handler.toString());
+        // We won't get the contents of the zip file, but we will get the name
+        assertContains("text.txt", handler.toString());
         assertNotContained("ENCRYPTED ZIP FILES", handler.toString());
         
         // Try again, this time with the password supplied
@@ -309,13 +309,42 @@ public class RFC822ParserTest extends Ti
         // We do get the name of the file in the encrypted zip file
         assertContains("text.txt", handler.toString());
         
-        // But because the RFC822 parser only recurses once, we don't
-        //  get the contents of the text file inside the zip file
-        // TODO Is this correct? Should we see the contents of the encrypted
-        //  zip when a password is given, or not?
-        assertNotContained("TEST DATA FOR TIKA.", handler.toString());
-        assertNotContained("ENCRYPTED ZIP FILES", handler.toString());
-        assertNotContained("TIKA-1028", handler.toString());
+        // TODO Upgrade to a version of Commons Compress with Encryption
+        //  support, then verify we get the contents of the text file
+        //  held within the encrypted zip
+        assumeTrue(false); // No Zip Encryption support yet
+        assertContains("TEST DATA FOR TIKA.", handler.toString());
+        assertContains("ENCRYPTED ZIP FILES", handler.toString());
+        assertContains("TIKA-1028", handler.toString());
+    }
+    
+    /**
+     * Test TIKA-1028 - Ensure we can get the contents of an
+     *  un-encrypted zip file
+     */
+    @Test
+    public void testNormalZipAttachment() throws Exception {
+        Parser parser = new RFC822Parser();
+        Metadata metadata = new Metadata();
+        ParseContext context = new ParseContext();
+        InputStream stream = getStream("test-documents/testRFC822_normal_zip");
+        ContentHandler handler = new BodyContentHandler();
+        parser.parse(stream, handler, metadata, context);
+        
+        // Check we got the metadata
+        assertEquals("Juha Haaga <ju...@gmail.com>", metadata.get(Metadata.MESSAGE_FROM));
+        assertEquals("Test mail for Tika", metadata.get(TikaCoreProperties.TITLE));
+        
+        // Check we got the message text, for both Plain Text and HTML
+        assertContains("Includes a normal, unencrypted zip file", handler.toString());
+        assertContains("This is the Plain Text part", handler.toString());
+        assertContains("This is the HTML part", handler.toString());
+        
+        // We get both the name and the contents of the file inside the zip
+        assertContains("text.txt", handler.toString());
+        assertContains("TEST DATA FOR TIKA.", handler.toString());
+        assertContains("This is text inside an unencrypted zip file", handler.toString());
+        assertContains("TIKA-1028", handler.toString());
     }
     
     /**

Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testRFC822_normal_zip
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testRFC822_normal_zip?rev=1652866&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/test/resources/test-documents/testRFC822_normal_zip (added)
+++ tika/trunk/tika-parsers/src/test/resources/test-documents/testRFC822_normal_zip Sun Jan 18 22:44:17 2015
@@ -0,0 +1,61 @@
+Return-Path: <ju...@gmail.com>
+X-Spam-Checker-Version: SpamAssassin 3.3.1 (2010-03-16) on
+	virt0003.codenomicon.com
+X-Spam-Level: 
+X-Spam-Status: No, score=-2.7 required=5.0 tests=BAYES_00,DKIM_SIGNED,
+	DKIM_VALID,DKIM_VALID_AU,FREEMAIL_FROM,HTML_MESSAGE,RCVD_IN_DNSWL_LOW,
+	SPF_PASS autolearn=ham version=3.3.1
+Received: from mail-wg0-f48.google.com (mail-wg0-f48.google.com [74.125.82.48])
+	by codenomicon.com (8.14.4/8.14.4) with ESMTP id t0G7ZmGs002981
+	(version=TLSv1/SSLv3 cipher=DHE-RSA-AES256-SHA bits=256 verify=OK)
+	for <ju...@codenomicon.com>; Fri, 16 Jan 2015 07:35:54 GMT
+Received: by mail-wg0-f48.google.com with SMTP id l2so19028230wgh.7
+        for <ju...@codenomicon.com>; Thu, 15 Jan 2015 23:35:48 -0800 (PST)
+DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed;
+        d=gmail.com; s=20120113;
+        h=mime-version:from:date:message-id:subject:to:content-type;
+        bh=wjcer9ESeBUN8rAEahqeDiOHf3wqBHgaeAboZtkw8qM=;
+        b=S3ezJU84qISP1OzMoH+wLMxn7y1JkcJwlUs4Uvfy+QlaSFcDcG66oxqsxniQ4kWmCC
+         QdQ94iztFvcvTAHuJys+jAH1UeVQKgs6T5lINj73nww3CKGh1B78LpnCRkFS93o19Zvt
+         QDSDtB23y9FlLF/dH6okvTIq7jQXNPuaDDqY8yJtp+DcYfW+QiNIGI83QievgQlWMRiV
+         fHuCbeEofTRP/82vHxUDVoZo/hwx8OAjWqPitrCmxU7Mly8lG5No1CHsKWmWd2Q+yxN3
+         tC3Ptbrig720BdBZKYwWSI6xBS4AY+46+utaloq9Hr0qpaDf5e9eXqq0ef0efDgd2kJT
+         gmyw==
+X-Received: by 10.180.39.204 with SMTP id r12mr3350467wik.11.1421393748083;
+ Thu, 15 Jan 2015 23:35:48 -0800 (PST)
+MIME-Version: 1.0
+From: Juha Haaga <ju...@gmail.com>
+Date: Fri, 16 Jan 2015 07:35:46 +0000
+Message-ID: <CA...@mail.gmail.com>
+Subject: Test mail for Tika
+To: Juha Haaga <ju...@codenomicon.com>
+Content-Type: multipart/mixed; boundary=001a11c3649c712d2a050cc0050f
+
+--001a11c3649c712d2a050cc0050f
+Content-Type: multipart/alternative; boundary=001a11c3649c712d27050cc0050d
+
+--001a11c3649c712d27050cc0050d
+Content-Type: text/plain; charset=UTF-8
+
+Includes a normal, unencrypted zip file as attachment.
+This is the Plain Text part
+
+--001a11c3649c712d27050cc0050d
+Content-Type: text/html; charset=UTF-8
+
+Includes &quot;normal&quot;, unencrypted zip file as attachment.<br />
+This is the HTML part
+
+--001a11c3649c712d27050cc0050d--
+--001a11c3649c712d2a050cc0050f
+Content-Type: application/zip; name="test.zip"
+Content-Disposition: attachment; filename="test.zip"
+Content-Transfer-Encoding: base64
+X-Attachment-Id: 14af1ab4e8dbb946bc5
+
+UEsDBBQAAgAIAKt+MEYEXs11bwAAAHgAAAAIABwAdGV4dC50eHRVVAkAA/IzuVSNvrhUdXgLAAEE
+6AMAAAToAwAADcxBCsJADEbh/Zzi9wCKduV2QIXiQrC5QNuJNFqmQ5MB6+kNvOXHo2tHuESKuD2e
+oPYeDyHQJArP+GuQrJIYfUbNnMd1K8YJPyl4ycyuW4NOS50TBkZZl5FVXQwbSD493lXNaeadj/2/
+Px2bc/gDUEsBAh4DFAACAAgAq34wRgRezXVvAAAAeAAAAAgAGAAAAAAAAQAAAKSBAAAAAHRleHQu
+dHh0VVQFAAPyM7lUdXgLAAEE6AMAAAToAwAAUEsFBgAAAAABAAEATgAAALEAAAAAAA==
+--001a11c3649c712d2a050cc0050f--