You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2010/12/01 14:00:58 UTC

svn commit: r1040992 - in /tika/trunk/tika-parsers: ./ src/main/java/org/apache/tika/parser/mail/ src/main/resources/META-INF/services/ src/test/java/org/apache/tika/parser/mail/ src/test/resources/test-documents/

Author: jukka
Date: Wed Dec  1 13:00:57 2010
New Revision: 1040992

URL: http://svn.apache.org/viewvc?rev=1040992&view=rev
Log:
TIKA-461: RFC822 messages not parsed

Patch by Benjamin Douglas and Julien Nioche

Added:
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java   (with props)
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mail/
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java   (with props)
    tika/trunk/tika-parsers/src/test/resources/test-documents/testRFC822
    tika/trunk/tika-parsers/src/test/resources/test-documents/testRFC822-multipart
    tika/trunk/tika-parsers/src/test/resources/test-documents/testRFC822_base64
    tika/trunk/tika-parsers/src/test/resources/test-documents/testRFC822_i18nheaders
    tika/trunk/tika-parsers/src/test/resources/test-documents/testRFC822_quoted
Modified:
    tika/trunk/tika-parsers/pom.xml
    tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser

Modified: tika/trunk/tika-parsers/pom.xml
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/pom.xml?rev=1040992&r1=1040991&r2=1040992&view=diff
==============================================================================
--- tika/trunk/tika-parsers/pom.xml (original)
+++ tika/trunk/tika-parsers/pom.xml Wed Dec  1 13:00:57 2010
@@ -55,6 +55,11 @@
       <version>3.1</version>
     </dependency>
     <dependency>
+      <groupId>org.apache.james</groupId>
+      <artifactId>apache-mime4j</artifactId>
+      <version>0.6</version>
+    </dependency>
+    <dependency>
       <groupId>org.apache.commons</groupId>
       <artifactId>commons-compress</artifactId>
       <version>1.1</version>

Added: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java?rev=1040992&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java (added)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java Wed Dec  1 13:00:57 2010
@@ -0,0 +1,260 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mail;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.Set;
+
+import org.apache.james.mime4j.MimeException;
+import org.apache.james.mime4j.descriptor.BodyDescriptor;
+import org.apache.james.mime4j.parser.Field;
+import org.apache.james.mime4j.parser.MimeStreamParser;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Parser for RFC 822 email messages ({@code message/rfc822}) built on
+ * apache-mime4j. Every body is handed to an {@link AutoDetectParser} and
+ * its output is written into the XHTML stream; each part of a multipart
+ * message is wrapped in {@code <div class="email-entry"><p>} elements.
+ *
+ * @author jnioche@digitalpebble.com
+ **/
+public class RFC822Parser implements Parser {
+
+    private static final Set<MediaType> SUPPORTED_TYPES = Collections
+            .singleton(MediaType.parse("message/rfc822"));
+
+    /**
+     * @return the single supported type, {@code message/rfc822}
+     */
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    /**
+     * Streams the message through mime4j and forwards its events to the
+     * given SAX handler as XHTML. The From and Subject headers of the
+     * message are added to the given metadata.
+     *
+     * @throws SAXException if the SAX handler rejects an event
+     * @throws TikaException if the message or one of its bodies cannot be
+     *         parsed
+     */
+    public void parse(InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context) throws IOException,
+            SAXException, TikaException {
+        MimeStreamParser parser = new MimeStreamParser();
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+
+        MailContentHandler mch = new MailContentHandler(xhtml, metadata);
+        parser.setContentHandler(mch);
+        try {
+            parser.parse(stream);
+        } catch (MimeException e) {
+            // MailContentHandler tunnels SAX and Tika failures through
+            // MimeException; unwrap them so callers see the real error
+            Throwable cause = e.getCause();
+            if (cause instanceof TikaException) {
+                throw (TikaException) cause;
+            } else if (cause instanceof SAXException) {
+                throw (SAXException) cause;
+            }
+            // anything else is reported as a TikaException with its cause kept
+            throw new TikaException(e.getMessage(), e);
+        }
+    }
+
+    /**
+     * Convenience overload that parses with a fresh, empty
+     * {@link ParseContext}.
+     */
+    public void parse(InputStream stream, ContentHandler handler,
+            Metadata metadata) throws IOException, SAXException, TikaException {
+        parse(stream, handler, metadata, new ParseContext());
+    }
+
+}
+
+/**
+ * Same as {@link BodyContentHandler}, except that the startDocument and
+ * endDocument events are dropped as well, so that a sub-parser cannot open
+ * or close the enclosing document a second time.
+ **/
+class StrictBodyContentHandler extends BodyContentHandler {
+
+    public StrictBodyContentHandler(XHTMLContentHandler handler) {
+        super(handler);
+    }
+
+    @Override
+    public void startDocument() throws SAXException {
+        // intentionally empty: the enclosing handler owns the document
+    }
+
+    @Override
+    public void endDocument() throws SAXException {
+        // intentionally empty: the enclosing handler owns the document
+    }
+
+}
+
+/**
+ * Bridge between mime4j's content handler and the generic SAX content handler
+ * used by Tika. SAX and Tika failures are wrapped in {@link MimeException},
+ * the checked exception these callbacks are declared to throw, so that they
+ * abort the parse and reach the caller instead of being lost.
+ *
+ * @see <a href="http://james.apache.org/mime4j/apidocs/org/apache/james/mime4j/parser/ContentHandler.html">mime4j ContentHandler</a>
+ */
+class MailContentHandler implements
+        org.apache.james.mime4j.parser.ContentHandler {
+
+    private final XHTMLContentHandler handler;
+    private final Metadata metadata;
+
+    // number of multipart entities currently open; a counter rather than a
+    // flag so that the end of a nested multipart does not make the
+    // remaining parts of the enclosing one look like the top-level message
+    private int multipartDepth = 0;
+
+    MailContentHandler(XHTMLContentHandler xhtml, Metadata metadata) {
+        this.handler = xhtml;
+        this.metadata = metadata;
+    }
+
+    /**
+     * Parses the content of one body with an {@link AutoDetectParser} and
+     * writes the result into the enclosing XHTML document.
+     */
+    public void body(BodyDescriptor body, InputStream is) throws MimeException,
+            IOException {
+        // call the underlying parser for the part
+        // TODO how to retrieve a non-default config?
+        AutoDetectParser parser = new AutoDetectParser();
+
+        // use a different metadata object in order to specify the mime type
+        // of the sub part without damaging the main metadata
+        Metadata submd = new Metadata();
+        submd.set(Metadata.CONTENT_TYPE, body.getMimeType());
+        submd.set(Metadata.CONTENT_ENCODING, body.getCharset());
+
+        // filter the events coming from the underlying parser
+        // to prevent getting multiple </body> </html> or </title>
+        // in the output
+        StrictBodyContentHandler bch = new StrictBodyContentHandler(handler);
+
+        try {
+            parser.parse(is, bch, submd);
+        } catch (SAXException e) {
+            throw new MimeException(e);
+        } catch (TikaException e) {
+            throw new MimeException(e);
+        }
+    }
+
+    public void endBodyPart() throws MimeException {
+        try {
+            handler.endElement("p");
+            handler.endElement("div");
+        } catch (SAXException e) {
+            throw new MimeException(e);
+        }
+    }
+
+    public void endHeader() throws MimeException {
+    }
+
+    public void startMessage() throws MimeException {
+        try {
+            handler.startDocument();
+        } catch (SAXException e) {
+            throw new MimeException(e);
+        }
+    }
+
+    public void endMessage() throws MimeException {
+        try {
+            handler.endDocument();
+        } catch (SAXException e) {
+            throw new MimeException(e);
+        }
+    }
+
+    public void endMultipart() throws MimeException {
+        multipartDepth--;
+    }
+
+    public void epilogue(InputStream is) throws MimeException, IOException {
+    }
+
+    /**
+     * Header for the whole message or its parts. Only the From and Subject
+     * headers seen outside any multipart entity are recorded.
+     *
+     * @see <a href="http://james.apache.org/mime4j/apidocs/org/apache/james/mime4j/parser/Field.html">mime4j Field</a>
+     **/
+    public void field(Field field) throws MimeException {
+        // a non-zero depth means these headers belong to a part of the
+        // message, not to the message as a whole
+        if (multipartDepth > 0) {
+            return;
+        }
+        // TODO add metadata to the parts later
+        String fieldname = field.getName();
+        // TODO value could be parsed and/or encoded
+        String value = field.getBody();
+        if (fieldname.equalsIgnoreCase("From")) {
+            metadata.add(Metadata.AUTHOR, value);
+        } else if (fieldname.equalsIgnoreCase("Subject")) {
+            metadata.add(Metadata.SUBJECT, value);
+        }
+    }
+
+    public void preamble(InputStream is) throws MimeException, IOException {
+    }
+
+    public void raw(InputStream is) throws MimeException, IOException {
+    }
+
+    public void startBodyPart() throws MimeException {
+        try {
+            handler.startElement("div", "class", "email-entry");
+            handler.startElement("p");
+        } catch (SAXException e) {
+            throw new MimeException(e);
+        }
+    }
+
+    public void startHeader() throws MimeException {
+    }
+
+    public void startMultipart(BodyDescriptor descr) throws MimeException {
+        multipartDepth++;
+    }
+
+}

Propchange: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser?rev=1040992&r1=1040991&r2=1040992&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser (original)
+++ tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser Wed Dec  1 13:00:57 2010
@@ -26,6 +26,7 @@ org.apache.tika.parser.image.TiffParser
 org.apache.tika.parser.iwork.IWorkParser
 org.apache.tika.parser.iwork.IWorkPackageParser
 org.apache.tika.parser.jpeg.JpegParser
+org.apache.tika.parser.mail.RFC822Parser
 org.apache.tika.parser.mbox.MboxParser
 org.apache.tika.parser.microsoft.OfficeParser
 org.apache.tika.parser.microsoft.ooxml.OOXMLParser

Added: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java?rev=1040992&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java (added)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java Wed Dec  1 13:00:57 2010
@@ -0,0 +1,119 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mail;
+
+import static org.mockito.Matchers.any;
+import static org.mockito.Matchers.eq;
+import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.never;
+import static org.mockito.Mockito.verify;
+
+import java.io.InputStream;
+
+import junit.framework.TestCase;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.helpers.DefaultHandler;
+
+/**
+ * Tests for {@link RFC822Parser}, run against the sample messages in
+ * test-documents.
+ */
+public class RFC822ParserTest extends TestCase {
+
+    public void testSimple() throws Exception {
+        Metadata metadata = new Metadata();
+        ContentHandler handler = mock(DefaultHandler.class);
+
+        parseResource("test-documents/testRFC822", handler, metadata);
+
+        verify(handler).startDocument();
+        //just one body
+        verify(handler).startElement(eq(XHTMLContentHandler.XHTML), eq("p"), eq("p"), any(Attributes.class));
+        verify(handler).endElement(XHTMLContentHandler.XHTML, "p", "p");
+        //no multi-part body parts
+        verify(handler, never()).startElement(eq(XHTMLContentHandler.XHTML), eq("div"), eq("div"), any(Attributes.class));
+        verify(handler, never()).endElement(XHTMLContentHandler.XHTML, "div", "div");
+        verify(handler).endDocument();
+        //note no leading spaces
+        assertEquals("\"Julien Nioche (JIRA)\" <ji...@apache.org>", metadata.get(Metadata.AUTHOR));
+        assertEquals("[jira] Commented: (TIKA-461) RFC822 messages not parsed", metadata.get(Metadata.SUBJECT));
+    }
+
+    public void testQuotedPrintable() throws Exception {
+        ContentHandler handler = mock(DefaultHandler.class);
+
+        parseResource("test-documents/testRFC822_quoted", handler, new Metadata());
+
+        //tests correct decoding of quoted printable text, including UTF-8 bytes into Unicode
+        char[] expected = ("D\u00FCsseldorf has non-ascii. "
+                + "Lines can be split like this. Spaces at the end of a line \r\n"
+                + "must be encoded.\r\n").toCharArray();
+        verify(handler).characters(expected, 0, expected.length);
+    }
+
+    public void testBase64() throws Exception {
+        ContentHandler handler = mock(DefaultHandler.class);
+
+        parseResource("test-documents/testRFC822_base64", handler, new Metadata());
+
+        //tests correct decoding of base64 text, including ISO-8859-1 bytes into Unicode
+        char[] expected =
+                "Here is some text, with international characters, voil\u00E0!\r\n".toCharArray();
+        verify(handler).characters(expected, 0, expected.length);
+    }
+
+    public void testI18NHeaders() throws Exception {
+        Metadata metadata = new Metadata();
+        ContentHandler handler = mock(DefaultHandler.class);
+
+        parseResource("test-documents/testRFC822_i18nheaders", handler, metadata);
+
+        //tests correct decoding of internationalized headers, both
+        //quoted-printable (Q) and Base64 (B).
+        assertEquals("Keld J\u00F8rn Simonsen <ke...@dkuug.dk>", metadata.get(Metadata.AUTHOR));
+        assertEquals("If you can read this you understand the example.", metadata.get(Metadata.SUBJECT));
+    }
+
+    /**
+     * Parses the named test resource with a new RFC822Parser. Exceptions
+     * propagate so that JUnit reports them with their full stack trace, and
+     * the stream is closed whatever the outcome.
+     */
+    private static void parseResource(String name, ContentHandler handler,
+            Metadata metadata) throws Exception {
+        InputStream stream = getStream(name);
+        assertNotNull("Test resource not found: " + name, stream);
+        try {
+            Parser parser = new RFC822Parser();
+            parser.parse(stream, handler, metadata, new ParseContext());
+        } finally {
+            stream.close();
+        }
+    }
+
+    private static InputStream getStream(String name) {
+        return Thread.currentThread().getContextClassLoader()
+                .getResourceAsStream(name);
+    }
+
+}

Propchange: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testRFC822
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testRFC822?rev=1040992&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/test/resources/test-documents/testRFC822 (added)
+++ tika/trunk/tika-parsers/src/test/resources/test-documents/testRFC822 Wed Dec  1 13:00:57 2010
@@ -0,0 +1,41 @@
+From: "Julien Nioche (JIRA)" <ji...@apache.org>
+To: dev@tika.apache.org
+Subject: [jira] Commented: (TIKA-461) RFC822 messages not parsed
+Reply-To: dev@tika.apache.org
+Delivered-To: mailing list dev@tika.apache.org
+Date: Mon, 6 Sep 2010 05:25:34 -0400 (EDT)
+In-Reply-To: <60...@thor>
+MIME-Version: 1.0
+Content-Type: text/plain; charset=utf-8
+Content-Transfer-Encoding: 7bit
+X-JIRA-FingerPrint: 30527f35849b9dde25b450d4833f0394
+X-Virus-Checked: Checked by ClamAV on apache.org
+
+
+    [ https://issues.apache.org/jira/browse/TIKA-461?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=12906468#action_12906468 ] 
+
+Julien Nioche commented on TIKA-461:
+------------------------------------
+
+I'll have a look at mime4j and try to use it in Tika
+
+> RFC822 messages not parsed
+> --------------------------
+>
+>                 Key: TIKA-461
+>                 URL: https://issues.apache.org/jira/browse/TIKA-461
+>             Project: Tika
+>          Issue Type: Bug
+>          Components: parser
+>    Affects Versions: 0.7
+>            Reporter: Joshua Turner
+>            Assignee: Julien Nioche
+>
+> Presented with an RFC822 message exported from Thunderbird, AutodetectParser produces an empty body, and a Metadata containing only one key-value pair: "Content-Type=message/rfc822". Directly calling MboxParser likewise gives an empty body, but with two metadata pairs: "Content-Encoding=us-ascii Content-Type=application/mbox".
+> A quick peek at the source of MboxParser shows that the implementation is pretty naive. If the wiring can be sorted out, something like Apache James' mime4j might be a better bet.
+
+-- 
+This message is automatically generated by JIRA.
+-
+You can reply to this email to add a comment to the issue online.
+

Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testRFC822-multipart
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testRFC822-multipart?rev=1040992&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/test/resources/test-documents/testRFC822-multipart (added)
+++ tika/trunk/tika-parsers/src/test/resources/test-documents/testRFC822-multipart Wed Dec  1 13:00:57 2010
@@ -0,0 +1,110 @@
+                                                                                                                                                                                                                                                               
+MIME-Version: 1.0
+Sender: digitalpebble@googlemail.com
+Received: by 10.231.31.200 with HTTP; Mon, 27 Sep 2010 06:29:16 -0700 (PDT)
+Date: Mon, 27 Sep 2010 14:29:16 +0100
+Delivered-To: digitalpebble@gmail.com
+X-Google-Sender-Auth: it4o8JYLKcQ5bHJbTpqDhZv46vk
+Message-ID: <AA...@mail.gmail.com>
+Subject: Test Multi Part Message
+From: DigitalPebble <ju...@digitalpebble.com>
+To: lists.digitalpebble@gmail.com
+Content-Type: multipart/mixed; boundary=0016e64606800312ee04913db790
+
+--0016e64606800312ee04913db790
+Content-Type: multipart/alternative; boundary=0016e64606800312ea04913db78e
+
+--0016e64606800312ea04913db78e
+Content-Type: text/plain; charset=UTF-8
+
+This is a test for parsing multi-part mails. With some funky HTML code an a
+picture attached.
+
+-- 
+**
+*
+Open Source Solutions for Text Engineering
+
+http://digitalpebble.blogspot.com
+http://www.digitalpebble.com*
+
+--0016e64606800312ea04913db78e
+Content-Type: text/html; charset=UTF-8
+Content-Transfer-Encoding: quoted-printable
+
+This is a test for parsing multi-part mails. With<span style=3D"color: rgb(=
+204, 0, 0);"> some funky HTML code</span> an a picture attached.<br clear=
+=3D"all"><br>-- <br><font face=3D"arial, helvetica, sans-serif"><b><span st=
+yle=3D"font-family: arial; font-weight: normal;"><b style=3D"color: rgb(0, =
+0, 0); font-family: arial,helvetica,sans-serif;"><img src=3D"http://digital=
+pebble.com/img/logo.gif" height=3D"38" width=3D"200"></b></span></b></font>=
+<div>
+<font face=3D"arial, helvetica, sans-serif"><b><span style=3D"font-family: =
+arial; font-weight: normal;"><b style=3D"color: rgb(0, 0, 0); font-family: =
+arial,helvetica,sans-serif;"><span style=3D"font-size: x-small;">=C2=A0</sp=
+an><br style=3D"font-family: arial,helvetica,sans-serif;">
+</b><span style=3D"color: rgb(102, 102, 102); font-family: arial,helvetica,=
+sans-serif;"><span style=3D"color: rgb(51, 51, 51);">Open Source Solutions =
+for Text Engineering</span><br>
+<span style=3D"font-size: x-small;">=C2=A0</span><br>
+</span></span><span style=3D"color: rgb(102, 102, 102);"><span style=3D"fon=
+t-weight: normal;"><a href=3D"http://digitalpebble.blogspot.com" target=3D"=
+_blank">http://digitalpebble.blogspot.com</a></span></span><span style=3D"f=
+ont-weight: normal;"><br style=3D"color: rgb(102, 102, 102);">
+</span>
+<span style=3D"color: rgb(102, 102, 102);"><span style=3D"font-weight: norm=
+al;"><a href=3D"http://www.digitalpebble.com" target=3D"_blank">http://www.=
+digitalpebble.com</a></span></span></b></font></div><br>
+
+--0016e64606800312ea04913db78e--
+--0016e64606800312ee04913db790
+Content-Type: image/gif; name="logo.gif"
+Content-Disposition: attachment; filename="logo.gif"
+Content-Transfer-Encoding: base64
+X-Attachment-Id: f_geldjvqq0
+
+R0lGODlhNgE8AMQAALxlVPv19JmZmaysrNnZ2cR4acXFxaWlpd2yqeXl5dWelObFv82Lfu7Y1Ozs
+7L+/v////8BuXtGVibKysszMzPLi39/f3/fs6tmonsiBdOrPyeK7tP4BAgAAAAAAAAAAACH5BAUU
+ABwALAAAAAA2ATwAAAX/ICSOZGmeaKqubOu+cCzPdG3feK7vfO//wKBwSCwaj8ikcslsOp/QqHRK
+rVqv2KwW2mj0KprfZbEtm7eBBoIBACByjY0kAmDsKgtFoY0lTP4TDiYOgBMERISAh1IGhScLCJES
+DJSVEpEaFUxtnG44bJ12OQidfFcUAqkCCSYJqgIURK6qsSoGrEUTryegpb6cEgsBSL5vN71tojik
+pVioqrgks6m1QtOwKQSp0UO6qry/4ZwRGMNFxZ+lyjfMnc6v3CLX1UHzJ4TQRt6p4OL+EZqIoDum
+Tkc7Tu/ylbA3hGGJZ9v07TKBzF84L0MG2qi4zsZBU1YgrjIRgIJJCvF+/5Q8mRKCyJY/9gnoZ1Fc
+hAsZSxnbWHCUr4QRy7yU+I2iry5dNCCYI05Bzk47a3A0+PMUPDNDc00sUREAigV7fpkDolFqz2VV
+Q14VupaITJoIUQSQ8ItMkLI0pvpsZlXhlqxut5LoqiJA2E4YhOCdoRct3yUWKAAyQGAsYGkJMo9V
+QaDRH8qbM2s+ITpB6MwPXqEUXTjyg0IGULZ4a/QxCgy+OpoYg6ESAwTCWqC7sEFBJQwbArKYGmBB
+b0oKEGB08dHriwrFLSEIE0PkK1jDLstTXTj1d1rDrqV0COG8exQEBrj/TmGzCdpc06JokDtFg66d
+SDAdCr4ooIA4BSBgX/9tnGSAAB2/RKAATixU5wJYNk3YggXzvTJASW2NwF4JHHaYygAOqNcKeSOY
+WBQJDhzg4isHCIICfoPpdwJ/Z5FwAYC/SLDgCDVFyB0KQFoUFQoWrtDAYUquoM2MAhxgQIjj0QIf
+ldRgCQF7VJbgAJffDQkBjiMQpgKPoZhQAYRFAgAQgXH6khiSdapjpghNprBAngBkYOY1qUywGgHm
+ubceiwud98BqFCR63qJajnCSTI+eREIA5w1AgQWiRerojYKlqaMJ1S3ZAJx1zmkCoKXcySCsGajQ
+5wkbwBrokDKqckBKAcjkl4iMktBrKr+2cqyXI4og3giSjnSPfK8Miab/CGqmAGUbdonwZjgZULJt
+gwsiSEkG/hyZY4SV+CMrqqeO8Gc4lbDaiVMP0bhntNJKU6yzHu5JLbP/AjzsCJx6WcJ3LV0LQba4
+/kKhCOiWEoGCPlbnSQm/aEiCBnT5EsGCXTEwIAQV4PbLySPc6qO9ABTQrbchl3LysjamMHBQ/lZK
+wnc5n5AwwT6T8KxLr0zAwpVFj+AwxJtqXGvLIitXwredjMxxrCloADO+6zaYQgUVd1IAk/E+7Iug
+X/U3QompPMACoZRSQ+IrBrDgXd3YmHC0SBawAHffJTzty8Qoa4ABzNySALPVJszbyQZbQ6WC5OPY
+19gJWHfCMgQue+tL/wF7QnBgKcoxnUrQKSzLNz0QSFp6i0Tb7bfCIgSQAOwoNHtmqdjqysk6mLcB
+trY9QrBYfqXMrHabthZ4Quim++L8bhrt7ILq/RLb9LEHuCDT6yccLYPvhgt/Ewk1cwL59Dou/7H0
+YSezQgCjw2/bCPZG0EJjSXPB4Mj3M1XIrQXP8p35XOAARNHnBOmDVQEgt62zsUADvrCa/EjgNlNB
+TwVQ49OpLkC/FahsHLl7xQFZMKaDfalYvitf7QhnNNz1jgCRElbBfvei+tVJSK8S3lE4qBMWjIt5
+w2PB6TwHr/1BgE1CjEsMTzBDekyxUS5UoA1H4EAy8Y6H/JhVkRTwOf8oRhEAA9qgB93hQ900kYkl
+CJ3GhJcl27Wgij2z4wpUhEU9WmqL3uHSFyPYIHX8Rl0mMGMUNVjEFUBtc2iz2RvjQoI5wsp/L2ya
+CvDoPT+mgI95pOEfXTiCnc3HUCnaISFBEgNFCrFynFgSXFj5vCSuoDosk+MZbXnFhWURhjuU4S81
+iTRSQsCUhXqUaUI5SODVkpKt3GUbpkakRqqgbG3A5BrtZ8IMTpKWoJMmAGSlQhcMrXt1pOE5V7iC
+BO7wb995AEzS2cwebhOcLiChNW2gRhF0MHgfTEEIw+nE4qHxBh5yASg7KcqEtkBSBMwXKb+zCM6o
+0pkDdcE/a9DPCtj/CYncVIG9qFnJUylSljGQyewMxjOGwk6ld+SkRFuaTnamQCT1DCNI8ekCbMpp
+pSzoZ3WcB8lElrCkTsRf8mQgkoqqYGcRZakAnIqCwaEzk54sJk21KoDAsWBnOZ2JGK0zgxPGEgdq
+NIwvNLfUErSvDYgUoRMh4NOD1uCc4dsjw1akSX2tQFhRHeVWFziCFhJzlWSVgUct9j6SqECNS+TE
+8QBqy7aVwoLfTOwIclUKtt0vBZLK21P32kdRxq6cKeDeVrXoQpx+9YH3wWjaXNCVCMS1kpg9gS8k
+IJfIcoJlFSEdCji7T6R24n0BgFkGGps7CUwWRhQF7aT4mtV4Dkm1/6t9Z8GukVfpxhOCsp3rC1xZ
+hwUo5wINWFzj6NRZDZjjAopj3HOfmc0FUCgNCBhXzFJQHcpl1ngamBgYloi4Eqj2AfZJADKv6rsp
++YoCNgoAAZYlUxJYNSXWJQ2FCwVee1IWmmXVVW5hKcEhJalIzCWo2VBQ1zGO9juGooABdgZV6ppW
+BPzqUI1L+0XDnsccgTQAqBLQmWXtTGmx9TB9NTsD38bpev5UH3NPbBEojwCDR0VYi2tS4BIsuFMD
+tPEXcUwlHFbYWPOJhg47hGDtJVmnPmRyk2nFXkAtV6C6su396LWgAGzZXe100QR0d2YSbdhXrBAJ
+67B6YwhYtWg5Pv/PIgD3ZrHuVM4z0IB+6dXlKGfzreEoBwjvVSQGdNoETgbAfPnEuFCvNAA5PsAi
+FkpPwT1gYAd4gFe5SppgfmnDNo3PfDy1mQAWLrxs1MEC/hwzDKRYeZygXANA3YYCYODUYePtBRTA
+OAl9Ti4+fZfQ8uuPDGwA2707CQEWjQSHxcABJ0HJkHQX72WeYVMNgEQkgNMAoO7bv7lTw74bgO4S
+6BtjI4jDvjNBA6UAB6giQO8G9r2db99bBPvo7sU3zvGO14ACA3hUZWLKYY+b/OQoF2Yqdp2CwY05
+5TCP+RYGZ9MT7IzdMs+5zqkQXe+WfOdAD3oUovWARU8YaEJPutI9l3DKPxyaqkuPutR94OMyT/3q
+WO9BpGGM86x7/evmnLF7BmCAroP97GhXQYoyY/a0u/3tcI+73OdOd46HAAA7
+--0016e64606800312ee04913db790--

Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testRFC822_base64
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testRFC822_base64?rev=1040992&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/test/resources/test-documents/testRFC822_base64 (added)
+++ tika/trunk/tika-parsers/src/test/resources/test-documents/testRFC822_base64 Wed Dec  1 13:00:57 2010
@@ -0,0 +1,8 @@
+To: Nobody <no...@somewhere.com>
+From: Nowhere <no...@nowhere.com>
+Subject: This tests a base64 encoded body
+MIME-Version: 1.0
+Content-Type: text/plain; charset=ISO-8859-1
+Content-Transfer-Encoding: base64
+
+SGVyZSBpcyBzb21lIHRleHQsIHdpdGggaW50ZXJuYXRpb25hbCBjaGFyYWN0ZXJzLCB2b2ls4CE=

Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testRFC822_i18nheaders
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testRFC822_i18nheaders?rev=1040992&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/test/resources/test-documents/testRFC822_i18nheaders (added)
+++ tika/trunk/tika-parsers/src/test/resources/test-documents/testRFC822_i18nheaders Wed Dec  1 13:00:57 2010
@@ -0,0 +1,9 @@
+From: =?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= <ke...@dkuug.dk>
+To: Nobody in Particular <a....@example.com>
+Subject: =?ISO-8859-1?B?SWYgeW91IGNhbiByZWFkIHRoaXM=?=
+ =?ISO-8859-2?B?eW91IHVuZGVyc3RhbmQgdGhlIGV4YW1wbGUu?=
+MIME-Version: 1.0
+Content-type: text/plain
+Content-transfer-encoding: 7bit
+
+Examples taken from RFC 2047. 

Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testRFC822_quoted
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testRFC822_quoted?rev=1040992&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/test/resources/test-documents/testRFC822_quoted (added)
+++ tika/trunk/tika-parsers/src/test/resources/test-documents/testRFC822_quoted Wed Dec  1 13:00:57 2010
@@ -0,0 +1,13 @@
+Delivered-To: a.person@example.com
+Return-Path: <an...@another-example.com>
+MIME-Version: 1.0
+Date: Fri, 26 Nov 2010 19:57:53 +0000
+Subject: Sample with Quoted Printable Text
+From: Another Person <an...@another-example.com>
+To: A. Person <a....@example.com>
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: quoted-printable
+
+D=C3=BCsseldorf has non-ascii. Lines can be spl=
+it like this. Spaces at the end of a line=20
+must be encoded.