You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2010/12/03 14:39:50 UTC

svn commit: r1041811 - in /tika/trunk/tika-parsers/src: main/java/org/apache/tika/parser/mail/ test/java/org/apache/tika/parser/mail/ test/resources/test-documents/

Author: jukka
Date: Fri Dec  3 13:39:49 2010
New Revision: 1041811

URL: http://svn.apache.org/viewvc?rev=1041811&view=rev
Log:
TIKA-461: RFC822 messages not parsed

Patch by Benjamin Douglas.

Modified:
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java
    tika/trunk/tika-parsers/src/test/resources/test-documents/testRFC822   (contents, props changed)
    tika/trunk/tika-parsers/src/test/resources/test-documents/testRFC822-multipart   (contents, props changed)
    tika/trunk/tika-parsers/src/test/resources/test-documents/testRFC822_base64   (contents, props changed)
    tika/trunk/tika-parsers/src/test/resources/test-documents/testRFC822_i18nheaders   (contents, props changed)
    tika/trunk/tika-parsers/src/test/resources/test-documents/testRFC822_quoted   (contents, props changed)

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java?rev=1041811&r1=1041810&r2=1041811&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java Fri Dec  3 13:39:49 2010
@@ -21,6 +21,10 @@ import java.io.InputStream;
 
 import org.apache.james.mime4j.MimeException;
 import org.apache.james.mime4j.descriptor.BodyDescriptor;
+import org.apache.james.mime4j.field.AbstractField;
+import org.apache.james.mime4j.field.MailboxListField;
+import org.apache.james.mime4j.field.UnstructuredField;
+import org.apache.james.mime4j.field.address.MailboxList;
 import org.apache.james.mime4j.parser.ContentHandler;
 import org.apache.james.mime4j.parser.Field;
 import org.apache.tika.exception.TikaException;
@@ -120,12 +124,15 @@ class MailContentHandler implements Cont
             return;
         // TODO add metadata to the parts later
         String fieldname = field.getName();
-        // TODO value could be parsed and/or encoded
-        String value = field.getBody();
         if (fieldname.equalsIgnoreCase("From")) {
-            metadata.add(Metadata.AUTHOR, value);
+        	MailboxListField fromField = (MailboxListField) AbstractField.parse(field.getRaw());
+        	MailboxList mailboxList = fromField.getMailboxList();
+        	for (int i = 0; i < mailboxList.size(); ++i) {
+                metadata.add(Metadata.AUTHOR, mailboxList.get(i).getDisplayString());        		
+        	}
         } else if (fieldname.equalsIgnoreCase("Subject")) {
-            metadata.add(Metadata.SUBJECT, value);
+        	UnstructuredField subjectField = (UnstructuredField) AbstractField.parse(field.getRaw());
+            metadata.add(Metadata.SUBJECT, subjectField.getValue());
         }
     }
 

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java?rev=1041811&r1=1041810&r2=1041811&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java Fri Dec  3 13:39:49 2010
@@ -55,6 +55,7 @@ public class RFC822Parser implements Par
 
         MailContentHandler mch = new MailContentHandler(xhtml, metadata);
         parser.setContentHandler(mch);
+        parser.setContentDecoding(true);
         try {
             parser.parse(stream);
         } catch (MimeException e) {

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java?rev=1041811&r1=1041810&r2=1041811&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java Fri Dec  3 13:39:49 2010
@@ -20,6 +20,7 @@ import static org.mockito.Matchers.any;
 import static org.mockito.Matchers.eq;
 import static org.mockito.Mockito.mock;
 import static org.mockito.Mockito.never;
+import static org.mockito.Mockito.times;
 import static org.mockito.Mockito.verify;
 
 import java.io.InputStream;
@@ -29,6 +30,7 @@ import junit.framework.TestCase;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
 import org.apache.tika.sax.XHTMLContentHandler;
 import org.xml.sax.Attributes;
 import org.xml.sax.ContentHandler;
@@ -52,53 +54,86 @@ public class RFC822ParserTest extends Te
             verify(handler, never()).startElement(eq(XHTMLContentHandler.XHTML), eq("div"), eq("div"), any(Attributes.class));
             verify(handler, never()).endElement(XHTMLContentHandler.XHTML, "div", "div");
             verify(handler).endDocument();
-            //note no leading spaces
-            // TODO: Enable when the parser trims header values
-            // assertEquals("\"Julien Nioche (JIRA)\" <ji...@apache.org>", metadata.get(Metadata.AUTHOR));
-            // assertEquals("[jira] Commented: (TIKA-461) RFC822 messages not parsed", metadata.get(Metadata.SUBJECT));
+            //note no leading spaces, and no quotes
+            assertEquals("Julien Nioche (JIRA) <ji...@apache.org>", metadata.get(Metadata.AUTHOR));
+            assertEquals("[jira] Commented: (TIKA-461) RFC822 messages not parsed", metadata.get(Metadata.SUBJECT));
         } catch (Exception e) {
             fail("Exception thrown: " + e.getMessage());
         }
     }
 
-    // TODO: enable this test when the parser supports quoted printable
-    public void disabledTestQuotedPrintable() {
+    public void testMultipart() {
+        Parser parser = new RFC822Parser();
+        Metadata metadata = new Metadata();
+        InputStream stream = getStream("test-documents/testRFC822-multipart");
+        ContentHandler handler = mock(XHTMLContentHandler.class);
+
+        try {
+            parser.parse(stream, handler, metadata, new ParseContext());
+            verify(handler).startDocument();
+            //4 body-part divs -- two outer bodies and two inner bodies
+            verify(handler, times(4)).startElement(eq(XHTMLContentHandler.XHTML), eq("div"), eq("div"), any(Attributes.class));
+            verify(handler, times(4)).endElement(XHTMLContentHandler.XHTML, "div", "div");
+            //5 paragraph elements, 4 for body-parts and 1 for encompassing message
+            verify(handler, times(5)).startElement(eq(XHTMLContentHandler.XHTML), eq("p"), eq("p"), any(Attributes.class));
+            verify(handler, times(5)).endElement(XHTMLContentHandler.XHTML, "p", "p");
+            verify(handler).endDocument();
+        } catch (Exception e) {
+            fail("Exception thrown: " + e.getMessage());
+        }
+        
+        //repeat, this time looking at content
+        parser = new RFC822Parser();
+        metadata = new Metadata();
+        stream = getStream("test-documents/testRFC822-multipart");
+        handler = new BodyContentHandler();
+        try {
+            parser.parse(stream, handler, metadata, new ParseContext());
+            //tests that text from both alternative bodies is extracted and that the base64-encoded gif attachment is not emitted as text
+            String bodyText = handler.toString();
+            assertTrue(bodyText.contains("body 1"));
+            assertTrue(bodyText.contains("body 2"));
+            assertFalse(bodyText.contains("R0lGODlhNgE8AMQAA")); //part of encoded gif
+        } catch (Exception e) {
+            fail("Exception thrown: " + e.getMessage());
+        }
+    }
+
+    public void testQuotedPrintable() {
         Parser parser = new RFC822Parser();
         Metadata metadata = new Metadata();
         InputStream stream = getStream("test-documents/testRFC822_quoted");
-        ContentHandler handler = mock(DefaultHandler.class);
+        ContentHandler handler = new BodyContentHandler();
 
         try {
             parser.parse(stream, handler, metadata, new ParseContext());
             //tests correct decoding of quoted printable text, including UTF-8 bytes into Unicode
-            verify(handler).characters(new String("D\u00FCsseldorf has non-ascii. "
-            	+ "Lines can be split like this. Spaces at the end of a line \r\n"
-            	+ "must be encoded.\r\n").toCharArray(), 0, 104);
+            String bodyText = handler.toString();
+            assertTrue(bodyText.contains("D\u00FCsseldorf has non-ascii."));
+            assertTrue(bodyText.contains("Lines can be split like this."));
+            assertTrue(bodyText.contains("Spaces at the end of a line \r\nmust be encoded.\r\n"));
+            assertFalse(bodyText.contains("=")); //there should be no escape sequences
         } catch (Exception e) {
             fail("Exception thrown: " + e.getMessage());
         }
     }
 
-    // TODO: enable this test when the parser supports base64
-    public void disabledTestBase64() {
+    public void testBase64() {
         Parser parser = new RFC822Parser();
         Metadata metadata = new Metadata();
         InputStream stream = getStream("test-documents/testRFC822_base64");
-        ContentHandler handler = mock(DefaultHandler.class);
+        ContentHandler handler = new BodyContentHandler();
 
         try {
             parser.parse(stream, handler, metadata, new ParseContext());
             //tests correct decoding of base64 text, including ISO-8859-1 bytes into Unicode
-            verify(handler).characters(new String(
-            	"Here is some text, with international characters, voil\u00E0!\r\n"
-            	).toCharArray(), 0, 58);
+            assertTrue(handler.toString().contains("Here is some text, with international characters, voil\u00E0!"));
         } catch (Exception e) {
             fail("Exception thrown: " + e.getMessage());
         }
     }
 
-    // TODO: enable this test when the parser supports i18n headers
-    public void disabledTestI18NHeaders() {
+    public void testI18NHeaders() {
         Parser parser = new RFC822Parser();
         Metadata metadata = new Metadata();
         InputStream stream = getStream("test-documents/testRFC822_i18nheaders");

Modified: tika/trunk/tika-parsers/src/test/resources/test-documents/testRFC822
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testRFC822?rev=1041811&r1=1041810&r2=1041811&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/resources/test-documents/testRFC822 (original)
+++ tika/trunk/tika-parsers/src/test/resources/test-documents/testRFC822 Fri Dec  3 13:39:49 2010
@@ -1,41 +1,41 @@
-From: "Julien Nioche (JIRA)" <ji...@apache.org>
-To: dev@tika.apache.org
-Subject: [jira] Commented: (TIKA-461) RFC822 messages not parsed
-Reply-To: dev@tika.apache.org
-Delivered-To: mailing list dev@tika.apache.org
-Date: Mon, 6 Sep 2010 05:25:34 -0400 (EDT)
-In-Reply-To: <60...@thor>
-MIME-Version: 1.0
-Content-Type: text/plain; charset=utf-8
-Content-Transfer-Encoding: 7bit
-X-JIRA-FingerPrint: 30527f35849b9dde25b450d4833f0394
-X-Virus-Checked: Checked by ClamAV on apache.org
-
-
-    [ https://issues.apache.org/jira/browse/TIKA-461?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=12906468#action_12906468 ] 
-
-Julien Nioche commented on TIKA-461:
-------------------------------------
-
-I'll have a look at mime4j and try to use it in Tika
-
-> RFC822 messages not parsed
-> --------------------------
->
->                 Key: TIKA-461
->                 URL: https://issues.apache.org/jira/browse/TIKA-461
->             Project: Tika
->          Issue Type: Bug
->          Components: parser
->    Affects Versions: 0.7
->            Reporter: Joshua Turner
->            Assignee: Julien Nioche
->
-> Presented with an RFC822 message exported from Thunderbird, AutodetectParser produces an empty body, and a Metadata containing only one key-value pair: "Content-Type=message/rfc822". Directly calling MboxParser likewise gives an empty body, but with two metadata pairs: "Content-Encoding=us-ascii Content-Type=application/mbox".
-> A quick peek at the source of MboxParser shows that the implementation is pretty naive. If the wiring can be sorted out, something like Apache James' mime4j might be a better bet.
-
--- 
-This message is automatically generated by JIRA.
--
-You can reply to this email to add a comment to the issue online.
-
+From: "Julien Nioche (JIRA)" <ji...@apache.org>
+To: dev@tika.apache.org
+Subject: [jira] Commented: (TIKA-461) RFC822 messages not parsed
+Reply-To: dev@tika.apache.org
+Delivered-To: mailing list dev@tika.apache.org
+Date: Mon, 6 Sep 2010 05:25:34 -0400 (EDT)
+In-Reply-To: <60...@thor>
+MIME-Version: 1.0
+Content-Type: text/plain; charset=utf-8
+Content-Transfer-Encoding: 7bit
+X-JIRA-FingerPrint: 30527f35849b9dde25b450d4833f0394
+X-Virus-Checked: Checked by ClamAV on apache.org
+
+
+    [ https://issues.apache.org/jira/browse/TIKA-461?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=12906468#action_12906468 ] 
+
+Julien Nioche commented on TIKA-461:
+------------------------------------
+
+I'll have a look at mime4j and try to use it in Tika
+
+> RFC822 messages not parsed
+> --------------------------
+>
+>                 Key: TIKA-461
+>                 URL: https://issues.apache.org/jira/browse/TIKA-461
+>             Project: Tika
+>          Issue Type: Bug
+>          Components: parser
+>    Affects Versions: 0.7
+>            Reporter: Joshua Turner
+>            Assignee: Julien Nioche
+>
+> Presented with an RFC822 message exported from Thunderbird, AutodetectParser produces an empty body, and a Metadata containing only one key-value pair: "Content-Type=message/rfc822". Directly calling MboxParser likewise gives an empty body, but with two metadata pairs: "Content-Encoding=us-ascii Content-Type=application/mbox".
+> A quick peek at the source of MboxParser shows that the implementation is pretty naive. If the wiring can be sorted out, something like Apache James' mime4j might be a better bet.
+
+-- 
+This message is automatically generated by JIRA.
+-
+You can reply to this email to add a comment to the issue online.
+

Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testRFC822
------------------------------------------------------------------------------
    svn:eol-style = CRLF

Modified: tika/trunk/tika-parsers/src/test/resources/test-documents/testRFC822-multipart
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testRFC822-multipart?rev=1041811&r1=1041810&r2=1041811&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/resources/test-documents/testRFC822-multipart (original)
+++ tika/trunk/tika-parsers/src/test/resources/test-documents/testRFC822-multipart Fri Dec  3 13:39:49 2010
@@ -1,110 +1,111 @@
-                                                                                                                                                                                                                                                               
-MIME-Version: 1.0
-Sender: digitalpebble@googlemail.com
-Received: by 10.231.31.200 with HTTP; Mon, 27 Sep 2010 06:29:16 -0700 (PDT)
-Date: Mon, 27 Sep 2010 14:29:16 +0100
-Delivered-To: digitalpebble@gmail.com
-X-Google-Sender-Auth: it4o8JYLKcQ5bHJbTpqDhZv46vk
-Message-ID: <AA...@mail.gmail.com>
-Subject: Test Multi Part Message
-From: DigitalPebble <ju...@digitalpebble.com>
-To: lists.digitalpebble@gmail.com
-Content-Type: multipart/mixed; boundary=0016e64606800312ee04913db790
-
---0016e64606800312ee04913db790
-Content-Type: multipart/alternative; boundary=0016e64606800312ea04913db78e
-
---0016e64606800312ea04913db78e
-Content-Type: text/plain; charset=UTF-8
-
-This is a test for parsing multi-part mails. With some funky HTML code an a
-picture attached.
-
--- 
-**
-*
-Open Source Solutions for Text Engineering
-
-http://digitalpebble.blogspot.com
-http://www.digitalpebble.com*
-
---0016e64606800312ea04913db78e
-Content-Type: text/html; charset=UTF-8
-Content-Transfer-Encoding: quoted-printable
-
-This is a test for parsing multi-part mails. With<span style=3D"color: rgb(=
-204, 0, 0);"> some funky HTML code</span> an a picture attached.<br clear=
-=3D"all"><br>-- <br><font face=3D"arial, helvetica, sans-serif"><b><span st=
-yle=3D"font-family: arial; font-weight: normal;"><b style=3D"color: rgb(0, =
-0, 0); font-family: arial,helvetica,sans-serif;"><img src=3D"http://digital=
-pebble.com/img/logo.gif" height=3D"38" width=3D"200"></b></span></b></font>=
-<div>
-<font face=3D"arial, helvetica, sans-serif"><b><span style=3D"font-family: =
-arial; font-weight: normal;"><b style=3D"color: rgb(0, 0, 0); font-family: =
-arial,helvetica,sans-serif;"><span style=3D"font-size: x-small;">=C2=A0</sp=
-an><br style=3D"font-family: arial,helvetica,sans-serif;">
-</b><span style=3D"color: rgb(102, 102, 102); font-family: arial,helvetica,=
-sans-serif;"><span style=3D"color: rgb(51, 51, 51);">Open Source Solutions =
-for Text Engineering</span><br>
-<span style=3D"font-size: x-small;">=C2=A0</span><br>
-</span></span><span style=3D"color: rgb(102, 102, 102);"><span style=3D"fon=
-t-weight: normal;"><a href=3D"http://digitalpebble.blogspot.com" target=3D"=
-_blank">http://digitalpebble.blogspot.com</a></span></span><span style=3D"f=
-ont-weight: normal;"><br style=3D"color: rgb(102, 102, 102);">
-</span>
-<span style=3D"color: rgb(102, 102, 102);"><span style=3D"font-weight: norm=
-al;"><a href=3D"http://www.digitalpebble.com" target=3D"_blank">http://www.=
-digitalpebble.com</a></span></span></b></font></div><br>
-
---0016e64606800312ea04913db78e--
---0016e64606800312ee04913db790
-Content-Type: image/gif; name="logo.gif"
-Content-Disposition: attachment; filename="logo.gif"
-Content-Transfer-Encoding: base64
-X-Attachment-Id: f_geldjvqq0
-
-R0lGODlhNgE8AMQAALxlVPv19JmZmaysrNnZ2cR4acXFxaWlpd2yqeXl5dWelObFv82Lfu7Y1Ozs
-7L+/v////8BuXtGVibKysszMzPLi39/f3/fs6tmonsiBdOrPyeK7tP4BAgAAAAAAAAAAACH5BAUU
-ABwALAAAAAA2ATwAAAX/ICSOZGmeaKqubOu+cCzPdG3feK7vfO//wKBwSCwaj8ikcslsOp/QqHRK
-rVqv2KwW2mj0KprfZbEtm7eBBoIBACByjY0kAmDsKgtFoY0lTP4TDiYOgBMERISAh1IGhScLCJES
-DJSVEpEaFUxtnG44bJ12OQidfFcUAqkCCSYJqgIURK6qsSoGrEUTryegpb6cEgsBSL5vN71tojik
-pVioqrgks6m1QtOwKQSp0UO6qry/4ZwRGMNFxZ+lyjfMnc6v3CLX1UHzJ4TQRt6p4OL+EZqIoDum
-Tkc7Tu/ylbA3hGGJZ9v07TKBzF84L0MG2qi4zsZBU1YgrjIRgIJJCvF+/5Q8mRKCyJY/9gnoZ1Fc
-hAsZSxnbWHCUr4QRy7yU+I2iry5dNCCYI05Bzk47a3A0+PMUPDNDc00sUREAigV7fpkDolFqz2VV
-Q14VupaITJoIUQSQ8ItMkLI0pvpsZlXhlqxut5LoqiJA2E4YhOCdoRct3yUWKAAyQGAsYGkJMo9V
-QaDRH8qbM2s+ITpB6MwPXqEUXTjyg0IGULZ4a/QxCgy+OpoYg6ESAwTCWqC7sEFBJQwbArKYGmBB
-b0oKEGB08dHriwrFLSEIE0PkK1jDLstTXTj1d1rDrqV0COG8exQEBrj/TmGzCdpc06JokDtFg66d
-SDAdCr4ooIA4BSBgX/9tnGSAAB2/RKAATixU5wJYNk3YggXzvTJASW2NwF4JHHaYygAOqNcKeSOY
-WBQJDhzg4isHCIICfoPpdwJ/Z5FwAYC/SLDgCDVFyB0KQFoUFQoWrtDAYUquoM2MAhxgQIjj0QIf
-ldRgCQF7VJbgAJffDQkBjiMQpgKPoZhQAYRFAgAQgXH6khiSdapjpghNprBAngBkYOY1qUywGgHm
-ubceiwud98BqFCR63qJajnCSTI+eREIA5w1AgQWiRerojYKlqaMJ1S3ZAJx1zmkCoKXcySCsGajQ
-5wkbwBrokDKqckBKAcjkl4iMktBrKr+2cqyXI4og3giSjnSPfK8Miab/CGqmAGUbdonwZjgZULJt
-gwsiSEkG/hyZY4SV+CMrqqeO8Gc4lbDaiVMP0bhntNJKU6yzHu5JLbP/AjzsCJx6WcJ3LV0LQba4
-/kKhCOiWEoGCPlbnSQm/aEiCBnT5EsGCXTEwIAQV4PbLySPc6qO9ABTQrbchl3LysjamMHBQ/lZK
-wnc5n5AwwT6T8KxLr0zAwpVFj+AwxJtqXGvLIitXwredjMxxrCloADO+6zaYQgUVd1IAk/E+7Iug
-X/U3QompPMACoZRSQ+IrBrDgXd3YmHC0SBawAHffJTzty8Qoa4ABzNySALPVJszbyQZbQ6WC5OPY
-19gJWHfCMgQue+tL/wF7QnBgKcoxnUrQKSzLNz0QSFp6i0Tb7bfCIgSQAOwoNHtmqdjqysk6mLcB
-trY9QrBYfqXMrHabthZ4Quim++L8bhrt7ILq/RLb9LEHuCDT6yccLYPvhgt/Ewk1cwL59Dou/7H0
-YSezQgCjw2/bCPZG0EJjSXPB4Mj3M1XIrQXP8p35XOAARNHnBOmDVQEgt62zsUADvrCa/EjgNlNB
-TwVQ49OpLkC/FahsHLl7xQFZMKaDfalYvitf7QhnNNz1jgCRElbBfvei+tVJSK8S3lE4qBMWjIt5
-w2PB6TwHr/1BgE1CjEsMTzBDekyxUS5UoA1H4EAy8Y6H/JhVkRTwOf8oRhEAA9qgB93hQ900kYkl
-CJ3GhJcl27Wgij2z4wpUhEU9WmqL3uHSFyPYIHX8Rl0mMGMUNVjEFUBtc2iz2RvjQoI5wsp/L2ya
-CvDoPT+mgI95pOEfXTiCnc3HUCnaISFBEgNFCrFynFgSXFj5vCSuoDosk+MZbXnFhWURhjuU4S81
-iTRSQsCUhXqUaUI5SODVkpKt3GUbpkakRqqgbG3A5BrtZ8IMTpKWoJMmAGSlQhcMrXt1pOE5V7iC
-BO7wb995AEzS2cwebhOcLiChNW2gRhF0MHgfTEEIw+nE4qHxBh5yASg7KcqEtkBSBMwXKb+zCM6o
-0pkDdcE/a9DPCtj/CYncVIG9qFnJUylSljGQyewMxjOGwk6ld+SkRFuaTnamQCT1DCNI8ekCbMpp
-pSzoZ3WcB8lElrCkTsRf8mQgkoqqYGcRZakAnIqCwaEzk54sJk21KoDAsWBnOZ2JGK0zgxPGEgdq
-NIwvNLfUErSvDYgUoRMh4NOD1uCc4dsjw1akSX2tQFhRHeVWFziCFhJzlWSVgUct9j6SqECNS+TE
-8QBqy7aVwoLfTOwIclUKtt0vBZLK21P32kdRxq6cKeDeVrXoQpx+9YH3wWjaXNCVCMS1kpg9gS8k
-IJfIcoJlFSEdCji7T6R24n0BgFkGGps7CUwWRhQF7aT4mtV4Dkm1/6t9Z8GukVfpxhOCsp3rC1xZ
-hwUo5wINWFzj6NRZDZjjAopj3HOfmc0FUCgNCBhXzFJQHcpl1ngamBgYloi4Eqj2AfZJADKv6rsp
-+YoCNgoAAZYlUxJYNSXWJQ2FCwVee1IWmmXVVW5hKcEhJalIzCWo2VBQ1zGO9juGooABdgZV6ppW
-BPzqUI1L+0XDnsccgTQAqBLQmWXtTGmx9TB9NTsD38bpev5UH3NPbBEojwCDR0VYi2tS4BIsuFMD
-tPEXcUwlHFbYWPOJhg47hGDtJVmnPmRyk2nFXkAtV6C6su396LWgAGzZXe100QR0d2YSbdhXrBAJ
-67B6YwhYtWg5Pv/PIgD3ZrHuVM4z0IB+6dXlKGfzreEoBwjvVSQGdNoETgbAfPnEuFCvNAA5PsAi
-FkpPwT1gYAd4gFe5SppgfmnDNo3PfDy1mQAWLrxs1MEC/hwzDKRYeZygXANA3YYCYODUYePtBRTA
-OAl9Ti4+fZfQ8uuPDGwA2707CQEWjQSHxcABJ0HJkHQX72WeYVMNgEQkgNMAoO7bv7lTw74bgO4S
-6BtjI4jDvjNBA6UAB6giQO8G9r2db99bBPvo7sU3zvGO14ACA3hUZWLKYY+b/OQoF2Yqdp2CwY05
-5TCP+RYGZ9MT7IzdMs+5zqkQXe+WfOdAD3oUovWARU8YaEJPutI9l3DKPxyaqkuPutR94OMyT/3q
-WO9BpGGM86x7/evmnLF7BmCAroP97GhXQYoyY/a0u/3tcI+73OdOd46HAAA7
---0016e64606800312ee04913db790--
+MIME-Version: 1.0
+Sender: digitalpebble@googlemail.com
+Received: by 10.231.31.200 with HTTP; Mon, 27 Sep 2010 06:29:16 -0700 (PDT)
+Date: Mon, 27 Sep 2010 14:29:16 +0100
+Delivered-To: digitalpebble@gmail.com
+X-Google-Sender-Auth: it4o8JYLKcQ5bHJbTpqDhZv46vk
+Message-ID: <AA...@mail.gmail.com>
+Subject: Test Multi Part Message
+From: DigitalPebble <ju...@digitalpebble.com>
+To: lists.digitalpebble@gmail.com
+Content-Type: multipart/mixed; boundary=0016e64606800312ee04913db790
+
+--0016e64606800312ee04913db790
+Content-Type: multipart/alternative; boundary=0016e64606800312ea04913db78e
+
+--0016e64606800312ea04913db78e
+Content-Type: text/plain; charset=UTF-8
+
+This is a test for parsing multi-part mails. With some funky HTML code an a
+picture attached.
+
+Text specific to body 1.
+
+-- 
+**
+*
+Open Source Solutions for Text Engineering
+
+http://digitalpebble.blogspot.com
+http://www.digitalpebble.com*
+
+--0016e64606800312ea04913db78e
+Content-Type: text/html; charset=UTF-8
+Content-Transfer-Encoding: quoted-printable
+
+This is a test for parsing multi-part mails. With<span style=3D"color: rgb(=
+204, 0, 0);"> some funky HTML code</span> an a picture attached.<br clear=
+=3D"all"><br>-- <br><font face=3D"arial, helvetica, sans-serif"><b><span st=
+yle=3D"font-family: arial; font-weight: normal;"><b style=3D"color: rgb(0, =
+0, 0); font-family: arial,helvetica,sans-serif;"><img src=3D"http://digital=
+pebble.com/img/logo.gif" height=3D"38" width=3D"200"></b></span></b></font>=
+<div>
+<font face=3D"arial, helvetica, sans-serif"><b><span style=3D"font-family: =
+arial; font-weight: normal;"><b style=3D"color: rgb(0, 0, 0); font-family: =
+arial,helvetica,sans-serif;"><span style=3D"font-size: x-small;">=C2=A0</sp=
+an><br style=3D"font-family: arial,helvetica,sans-serif;">
+</b><span style=3D"color: rgb(102, 102, 102); font-family: arial,helvetica,=
+sans-serif;"><span style=3D"color: rgb(51, 51, 51);">Open Source Solutions =
+for Text Engineering</span><br>
+<span style=3D"font-size: x-small;">=C2=A0</span><br>
+</span></span><span style=3D"color: rgb(102, 102, 102);"><span style=3D"fon=
+t-weight: normal;"><a href=3D"http://digitalpebble.blogspot.com" target=3D"=
+_blank">http://digitalpebble.blogspot.com</a></span></span><span style=3D"f=
+ont-weight: normal;"><br style=3D"color: rgb(102, 102, 102);">
+</span>Text specific to body 2.
+<span style=3D"color: rgb(102, 102, 102);"><span style=3D"font-weight: norm=
+al;"><a href=3D"http://www.digitalpebble.com" target=3D"_blank">http://www.=
+digitalpebble.com</a></span></span></b></font></div><br>
+
+--0016e64606800312ea04913db78e--
+--0016e64606800312ee04913db790
+Content-Type: image/gif; name="logo.gif"
+Content-Disposition: attachment; filename="logo.gif"
+Content-Transfer-Encoding: base64
+X-Attachment-Id: f_geldjvqq0
+
+R0lGODlhNgE8AMQAALxlVPv19JmZmaysrNnZ2cR4acXFxaWlpd2yqeXl5dWelObFv82Lfu7Y1Ozs
+7L+/v////8BuXtGVibKysszMzPLi39/f3/fs6tmonsiBdOrPyeK7tP4BAgAAAAAAAAAAACH5BAUU
+ABwALAAAAAA2ATwAAAX/ICSOZGmeaKqubOu+cCzPdG3feK7vfO//wKBwSCwaj8ikcslsOp/QqHRK
+rVqv2KwW2mj0KprfZbEtm7eBBoIBACByjY0kAmDsKgtFoY0lTP4TDiYOgBMERISAh1IGhScLCJES
+DJSVEpEaFUxtnG44bJ12OQidfFcUAqkCCSYJqgIURK6qsSoGrEUTryegpb6cEgsBSL5vN71tojik
+pVioqrgks6m1QtOwKQSp0UO6qry/4ZwRGMNFxZ+lyjfMnc6v3CLX1UHzJ4TQRt6p4OL+EZqIoDum
+Tkc7Tu/ylbA3hGGJZ9v07TKBzF84L0MG2qi4zsZBU1YgrjIRgIJJCvF+/5Q8mRKCyJY/9gnoZ1Fc
+hAsZSxnbWHCUr4QRy7yU+I2iry5dNCCYI05Bzk47a3A0+PMUPDNDc00sUREAigV7fpkDolFqz2VV
+Q14VupaITJoIUQSQ8ItMkLI0pvpsZlXhlqxut5LoqiJA2E4YhOCdoRct3yUWKAAyQGAsYGkJMo9V
+QaDRH8qbM2s+ITpB6MwPXqEUXTjyg0IGULZ4a/QxCgy+OpoYg6ESAwTCWqC7sEFBJQwbArKYGmBB
+b0oKEGB08dHriwrFLSEIE0PkK1jDLstTXTj1d1rDrqV0COG8exQEBrj/TmGzCdpc06JokDtFg66d
+SDAdCr4ooIA4BSBgX/9tnGSAAB2/RKAATixU5wJYNk3YggXzvTJASW2NwF4JHHaYygAOqNcKeSOY
+WBQJDhzg4isHCIICfoPpdwJ/Z5FwAYC/SLDgCDVFyB0KQFoUFQoWrtDAYUquoM2MAhxgQIjj0QIf
+ldRgCQF7VJbgAJffDQkBjiMQpgKPoZhQAYRFAgAQgXH6khiSdapjpghNprBAngBkYOY1qUywGgHm
+ubceiwud98BqFCR63qJajnCSTI+eREIA5w1AgQWiRerojYKlqaMJ1S3ZAJx1zmkCoKXcySCsGajQ
+5wkbwBrokDKqckBKAcjkl4iMktBrKr+2cqyXI4og3giSjnSPfK8Miab/CGqmAGUbdonwZjgZULJt
+gwsiSEkG/hyZY4SV+CMrqqeO8Gc4lbDaiVMP0bhntNJKU6yzHu5JLbP/AjzsCJx6WcJ3LV0LQba4
+/kKhCOiWEoGCPlbnSQm/aEiCBnT5EsGCXTEwIAQV4PbLySPc6qO9ABTQrbchl3LysjamMHBQ/lZK
+wnc5n5AwwT6T8KxLr0zAwpVFj+AwxJtqXGvLIitXwredjMxxrCloADO+6zaYQgUVd1IAk/E+7Iug
+X/U3QompPMACoZRSQ+IrBrDgXd3YmHC0SBawAHffJTzty8Qoa4ABzNySALPVJszbyQZbQ6WC5OPY
+19gJWHfCMgQue+tL/wF7QnBgKcoxnUrQKSzLNz0QSFp6i0Tb7bfCIgSQAOwoNHtmqdjqysk6mLcB
+trY9QrBYfqXMrHabthZ4Quim++L8bhrt7ILq/RLb9LEHuCDT6yccLYPvhgt/Ewk1cwL59Dou/7H0
+YSezQgCjw2/bCPZG0EJjSXPB4Mj3M1XIrQXP8p35XOAARNHnBOmDVQEgt62zsUADvrCa/EjgNlNB
+TwVQ49OpLkC/FahsHLl7xQFZMKaDfalYvitf7QhnNNz1jgCRElbBfvei+tVJSK8S3lE4qBMWjIt5
+w2PB6TwHr/1BgE1CjEsMTzBDekyxUS5UoA1H4EAy8Y6H/JhVkRTwOf8oRhEAA9qgB93hQ900kYkl
+CJ3GhJcl27Wgij2z4wpUhEU9WmqL3uHSFyPYIHX8Rl0mMGMUNVjEFUBtc2iz2RvjQoI5wsp/L2ya
+CvDoPT+mgI95pOEfXTiCnc3HUCnaISFBEgNFCrFynFgSXFj5vCSuoDosk+MZbXnFhWURhjuU4S81
+iTRSQsCUhXqUaUI5SODVkpKt3GUbpkakRqqgbG3A5BrtZ8IMTpKWoJMmAGSlQhcMrXt1pOE5V7iC
+BO7wb995AEzS2cwebhOcLiChNW2gRhF0MHgfTEEIw+nE4qHxBh5yASg7KcqEtkBSBMwXKb+zCM6o
+0pkDdcE/a9DPCtj/CYncVIG9qFnJUylSljGQyewMxjOGwk6ld+SkRFuaTnamQCT1DCNI8ekCbMpp
+pSzoZ3WcB8lElrCkTsRf8mQgkoqqYGcRZakAnIqCwaEzk54sJk21KoDAsWBnOZ2JGK0zgxPGEgdq
+NIwvNLfUErSvDYgUoRMh4NOD1uCc4dsjw1akSX2tQFhRHeVWFziCFhJzlWSVgUct9j6SqECNS+TE
+8QBqy7aVwoLfTOwIclUKtt0vBZLK21P32kdRxq6cKeDeVrXoQpx+9YH3wWjaXNCVCMS1kpg9gS8k
+IJfIcoJlFSEdCji7T6R24n0BgFkGGps7CUwWRhQF7aT4mtV4Dkm1/6t9Z8GukVfpxhOCsp3rC1xZ
+hwUo5wINWFzj6NRZDZjjAopj3HOfmc0FUCgNCBhXzFJQHcpl1ngamBgYloi4Eqj2AfZJADKv6rsp
++YoCNgoAAZYlUxJYNSXWJQ2FCwVee1IWmmXVVW5hKcEhJalIzCWo2VBQ1zGO9juGooABdgZV6ppW
+BPzqUI1L+0XDnsccgTQAqBLQmWXtTGmx9TB9NTsD38bpev5UH3NPbBEojwCDR0VYi2tS4BIsuFMD
+tPEXcUwlHFbYWPOJhg47hGDtJVmnPmRyk2nFXkAtV6C6su396LWgAGzZXe100QR0d2YSbdhXrBAJ
+67B6YwhYtWg5Pv/PIgD3ZrHuVM4z0IB+6dXlKGfzreEoBwjvVSQGdNoETgbAfPnEuFCvNAA5PsAi
+FkpPwT1gYAd4gFe5SppgfmnDNo3PfDy1mQAWLrxs1MEC/hwzDKRYeZygXANA3YYCYODUYePtBRTA
+OAl9Ti4+fZfQ8uuPDGwA2707CQEWjQSHxcABJ0HJkHQX72WeYVMNgEQkgNMAoO7bv7lTw74bgO4S
+6BtjI4jDvjNBA6UAB6giQO8G9r2db99bBPvo7sU3zvGO14ACA3hUZWLKYY+b/OQoF2Yqdp2CwY05
+5TCP+RYGZ9MT7IzdMs+5zqkQXe+WfOdAD3oUovWARU8YaEJPutI9l3DKPxyaqkuPutR94OMyT/3q
+WO9BpGGM86x7/evmnLF7BmCAroP97GhXQYoyY/a0u/3tcI+73OdOd46HAAA7
+--0016e64606800312ee04913db790--

Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testRFC822-multipart
------------------------------------------------------------------------------
    svn:eol-style = CRLF

Modified: tika/trunk/tika-parsers/src/test/resources/test-documents/testRFC822_base64
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testRFC822_base64?rev=1041811&r1=1041810&r2=1041811&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/resources/test-documents/testRFC822_base64 (original)
+++ tika/trunk/tika-parsers/src/test/resources/test-documents/testRFC822_base64 Fri Dec  3 13:39:49 2010
@@ -1,8 +1,8 @@
-To: Nobody <no...@somewhere.com>
-From: Nowhere <no...@nowhere.com>
-Subject: This tests a base64 encoded body
-MIME-Version: 1.0
-Content-Type: text/plain; charset=ISO-8859-1
-Content-Transfer-Encoding: base64
-
-SGVyZSBpcyBzb21lIHRleHQsIHdpdGggaW50ZXJuYXRpb25hbCBjaGFyYWN0ZXJzLCB2b2ls4CE=
+To: Nobody <no...@somewhere.com>
+From: Nowhere <no...@nowhere.com>
+Subject: This tests a base64 encoded body
+MIME-Version: 1.0
+Content-Type: text/plain; charset=ISO-8859-1
+Content-Transfer-Encoding: base64
+
+SGVyZSBpcyBzb21lIHRleHQsIHdpdGggaW50ZXJuYXRpb25hbCBjaGFyYWN0ZXJzLCB2b2ls4CE=

Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testRFC822_base64
------------------------------------------------------------------------------
    svn:eol-style = CRLF

Modified: tika/trunk/tika-parsers/src/test/resources/test-documents/testRFC822_i18nheaders
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testRFC822_i18nheaders?rev=1041811&r1=1041810&r2=1041811&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/resources/test-documents/testRFC822_i18nheaders (original)
+++ tika/trunk/tika-parsers/src/test/resources/test-documents/testRFC822_i18nheaders Fri Dec  3 13:39:49 2010
@@ -1,9 +1,9 @@
-From: =?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= <ke...@dkuug.dk>
-To: Nobody in Particular <a....@example.com>
-Subject: =?ISO-8859-1?B?SWYgeW91IGNhbiByZWFkIHRoaXM=?=
- =?ISO-8859-2?B?eW91IHVuZGVyc3RhbmQgdGhlIGV4YW1wbGUu?=
-MIME-Version: 1.0
-Content-type: text/plain
-Content-transfer-encoding: 7bit
-
-Examples taken from RFC 2047. 
+From: =?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= <ke...@dkuug.dk>
+To: Nobody in Particular <a....@example.com>
+Subject: =?ISO-8859-1?B?SWYgeW91IGNhbiByZWFkIHRoaXMgeW8=?=
+ =?ISO-8859-2?B?dSB1bmRlcnN0YW5kIHRoZSBleGFtcGxlLg==?=
+MIME-Version: 1.0
+Content-type: text/plain
+Content-transfer-encoding: 7bit
+
+Examples taken from RFC 2047. 

Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testRFC822_i18nheaders
------------------------------------------------------------------------------
    svn:eol-style = CRLF

Modified: tika/trunk/tika-parsers/src/test/resources/test-documents/testRFC822_quoted
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testRFC822_quoted?rev=1041811&r1=1041810&r2=1041811&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/resources/test-documents/testRFC822_quoted (original)
+++ tika/trunk/tika-parsers/src/test/resources/test-documents/testRFC822_quoted Fri Dec  3 13:39:49 2010
@@ -1,13 +1,13 @@
-Delivered-To: a.person@example.com
-Return-Path: <an...@another-example.com>
-MIME-Version: 1.0
-Date: Fri, 26 Nov 2010 19:57:53 +0000
-Subject: Sample with Quoted Printable Text
-From: Another Person <an...@another-example.com>
-To: A. Person <a....@example.com>
-Content-Type: text/plain; charset=UTF-8
-Content-Transfer-Encoding: quoted-printable
-
-D=C3=BCsseldorf has non-ascii. Lines can be spl=
-it like this. Spaces at the end of a line=20
-must be encoded.
+Delivered-To: a.person@example.com
+Return-Path: <an...@another-example.com>
+MIME-Version: 1.0
+Date: Fri, 26 Nov 2010 19:57:53 +0000
+Subject: Sample with Quoted Printable Text
+From: Another Person <an...@another-example.com>
+To: A. Person <a....@example.com>
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: quoted-printable
+
+D=C3=BCsseldorf has non-ascii. Lines can be spl=
+it like this. Spaces at the end of a line=20
+must be encoded.

Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testRFC822_quoted
------------------------------------------------------------------------------
    svn:eol-style = CRLF