You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2010/12/01 14:00:58 UTC
svn commit: r1040992 - in /tika/trunk/tika-parsers: ./
src/main/java/org/apache/tika/parser/mail/
src/main/resources/META-INF/services/
src/test/java/org/apache/tika/parser/mail/ src/test/resources/test-documents/
Author: jukka
Date: Wed Dec 1 13:00:57 2010
New Revision: 1040992
URL: http://svn.apache.org/viewvc?rev=1040992&view=rev
Log:
TIKA-461: RFC822 messages not parsed
Patch by Benjamin Douglas and Julien Nioche
Added:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java (with props)
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mail/
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java (with props)
tika/trunk/tika-parsers/src/test/resources/test-documents/testRFC822
tika/trunk/tika-parsers/src/test/resources/test-documents/testRFC822-multipart
tika/trunk/tika-parsers/src/test/resources/test-documents/testRFC822_base64
tika/trunk/tika-parsers/src/test/resources/test-documents/testRFC822_i18nheaders
tika/trunk/tika-parsers/src/test/resources/test-documents/testRFC822_quoted
Modified:
tika/trunk/tika-parsers/pom.xml
tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
Modified: tika/trunk/tika-parsers/pom.xml
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/pom.xml?rev=1040992&r1=1040991&r2=1040992&view=diff
==============================================================================
--- tika/trunk/tika-parsers/pom.xml (original)
+++ tika/trunk/tika-parsers/pom.xml Wed Dec 1 13:00:57 2010
@@ -55,6 +55,11 @@
<version>3.1</version>
</dependency>
<dependency>
+ <groupId>org.apache.james</groupId>
+ <artifactId>apache-mime4j</artifactId>
+ <version>0.6</version>
+ </dependency>
+ <dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-compress</artifactId>
<version>1.1</version>
Added: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java?rev=1040992&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java (added)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java Wed Dec 1 13:00:57 2010
@@ -0,0 +1,225 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mail;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.Set;
+
+import org.apache.james.mime4j.MimeException;
+import org.apache.james.mime4j.descriptor.BodyDescriptor;
+import org.apache.james.mime4j.parser.Field;
+import org.apache.james.mime4j.parser.MimeStreamParser;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Uses apache-mime4j to parse emails. Each part is treated with the
+ * corresponding parser and displayed within {@code <p>} elements.
+ *
+ * @author jnioche@digitalpebble.com
+ **/
+public class RFC822Parser implements Parser {
+
+    private static final Set<MediaType> SUPPORTED_TYPES = Collections
+            .singleton(MediaType.parse("message/rfc822"));
+
+    /**
+     * Returns the media types handled by this parser, i.e. only
+     * <code>message/rfc822</code>.
+     */
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    /**
+     * Feeds the stream to a mime4j {@link MimeStreamParser} whose events are
+     * translated into XHTML SAX events and metadata entries by a
+     * {@link MailContentHandler}.
+     *
+     * @param stream the raw message
+     * @param handler receiver of the XHTML SAX events
+     * @param metadata receiver of the message level metadata
+     * @param context parse context (not used by this method)
+     * @throws IOException if the stream can not be read
+     * @throws SAXException declared by the Parser interface
+     * @throws TikaException if mime4j reports a {@link MimeException}; the
+     *         original exception is available as the cause
+     */
+    public void parse(InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context) throws IOException,
+            SAXException, TikaException {
+        MimeStreamParser parser = new MimeStreamParser();
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+
+        MailContentHandler mch = new MailContentHandler(xhtml, metadata);
+        parser.setContentHandler(mch);
+        try {
+            parser.parse(stream);
+        } catch (MimeException e) {
+            // same message as before, but keep the mime4j exception as the
+            // cause so that its stack trace is not lost
+            throw new TikaException(e.getMessage(), e);
+        }
+    }
+
+    /**
+     * Convenience variant of
+     * {@link #parse(InputStream, ContentHandler, Metadata, ParseContext)}
+     * that uses a new, empty parse context.
+     */
+    public void parse(InputStream stream, ContentHandler handler,
+            Metadata metadata) throws IOException, SAXException, TikaException {
+        parse(stream, handler, metadata, new ParseContext());
+    }
+
+}
+
+/**
+ * A {@link BodyContentHandler} that, in addition, does not pass the
+ * startDocument and endDocument events on to the wrapped handler.
+ **/
+class StrictBodyContentHandler extends BodyContentHandler {
+
+    public StrictBodyContentHandler(XHTMLContentHandler handler) {
+        super(handler);
+    }
+
+    @Override
+    public void startDocument() throws SAXException {
+        // intentionally empty: the event is not propagated
+    }
+
+    @Override
+    public void endDocument() throws SAXException {
+        // intentionally empty: the event is not propagated
+    }
+
+}
+
+/**
+ * Bridge between mime4j's content handler and the generic SAX content handler
+ * used by Tika. SAX and Tika failures raised while handling an event are
+ * rethrown as {@link MimeException}s (with the original exception as the
+ * cause) instead of being printed and ignored. See
+ * http://james.apache.org/mime4j/apidocs/org/apache/james/mime4j/parser/ContentHandler.html
+ */
+class MailContentHandler implements
+        org.apache.james.mime4j.parser.ContentHandler {
+
+    private final XHTMLContentHandler handler;
+    private final Metadata metadata;
+
+    // Number of multipart entities that are currently open. A counter is
+    // used rather than a flag so that the end of a nested multipart does not
+    // make the headers of the remaining outer parts look like message headers.
+    private int multipartDepth = 0;
+
+    MailContentHandler(XHTMLContentHandler xhtml, Metadata metadata) {
+        this.handler = xhtml;
+        this.metadata = metadata;
+    }
+
+    /**
+     * Parses the content of a body with an {@link AutoDetectParser} and
+     * forwards the resulting SAX events to the XHTML handler.
+     *
+     * @throws MimeException if the underlying parser fails with a
+     *         SAXException or a TikaException
+     * @throws IOException if the body can not be read
+     */
+    public void body(BodyDescriptor body, InputStream is) throws MimeException,
+            IOException {
+        // call the underlying parser for the part
+        // TODO how to retrieve a non-default config?
+        AutoDetectParser parser = new AutoDetectParser();
+
+        // use a different metadata object in order to specify the mime type
+        // of the sub part without damaging the main metadata
+        Metadata submd = new Metadata();
+        submd.set(Metadata.CONTENT_TYPE, body.getMimeType());
+        submd.set(Metadata.CONTENT_ENCODING, body.getCharset());
+
+        // filter the events coming from the underlying parser to prevent
+        // getting multiple </body> </html> or </title> in the output
+        StrictBodyContentHandler bch = new StrictBodyContentHandler(handler);
+
+        try {
+            parser.parse(is, bch, submd);
+        } catch (SAXException e) {
+            throw new MimeException(e);
+        } catch (TikaException e) {
+            throw new MimeException(e);
+        }
+    }
+
+    /** Closes the elements opened by {@link #startBodyPart()}. */
+    public void endBodyPart() throws MimeException {
+        try {
+            handler.endElement("p");
+            handler.endElement("div");
+        } catch (SAXException e) {
+            throw new MimeException(e);
+        }
+    }
+
+    public void endHeader() throws MimeException {
+    }
+
+    /** Starts the XHTML document. */
+    public void startMessage() throws MimeException {
+        try {
+            handler.startDocument();
+        } catch (SAXException e) {
+            throw new MimeException(e);
+        }
+    }
+
+    /** Ends the XHTML document. */
+    public void endMessage() throws MimeException {
+        try {
+            handler.endDocument();
+        } catch (SAXException e) {
+            throw new MimeException(e);
+        }
+    }
+
+    public void endMultipart() throws MimeException {
+        if (multipartDepth > 0) {
+            multipartDepth--;
+        }
+    }
+
+    public void epilogue(InputStream is) throws MimeException, IOException {
+    }
+
+    /**
+     * Header for the whole message or its parts. Only headers seen outside
+     * of any multipart are recorded: From is added as the author and Subject
+     * as the subject of the metadata.
+     *
+     * @see <a href="http://james.apache.org/mime4j/apidocs/org/apache/james/mime4j/parser/Field.html">Field</a>
+     **/
+    public void field(Field field) throws MimeException {
+        // a non-zero depth indicates that these metadata correspond to a
+        // part and not to the whole message
+        if (multipartDepth > 0) {
+            return;
+        }
+        // TODO add metadata to the parts later
+        String fieldname = field.getName();
+        // TODO value could be parsed and/or encoded
+        String value = field.getBody();
+        if (fieldname.equalsIgnoreCase("From")) {
+            metadata.add(Metadata.AUTHOR, value);
+        } else if (fieldname.equalsIgnoreCase("Subject")) {
+            metadata.add(Metadata.SUBJECT, value);
+        }
+    }
+
+    public void preamble(InputStream is) throws MimeException, IOException {
+    }
+
+    public void raw(InputStream is) throws MimeException, IOException {
+    }
+
+    /**
+     * Opens a <code>div</code> with the class "email-entry" and a
+     * <code>p</code> element for a part of a multipart message.
+     */
+    public void startBodyPart() throws MimeException {
+        try {
+            handler.startElement("div", "class", "email-entry");
+            handler.startElement("p");
+        } catch (SAXException e) {
+            throw new MimeException(e);
+        }
+    }
+
+    public void startHeader() throws MimeException {
+    }
+
+    public void startMultipart(BodyDescriptor descr) throws MimeException {
+        multipartDepth++;
+    }
+
+}
Propchange: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified: tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser?rev=1040992&r1=1040991&r2=1040992&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser (original)
+++ tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser Wed Dec 1 13:00:57 2010
@@ -26,6 +26,7 @@ org.apache.tika.parser.image.TiffParser
org.apache.tika.parser.iwork.IWorkParser
org.apache.tika.parser.iwork.IWorkPackageParser
org.apache.tika.parser.jpeg.JpegParser
+org.apache.tika.parser.mail.RFC822Parser
org.apache.tika.parser.mbox.MboxParser
org.apache.tika.parser.microsoft.OfficeParser
org.apache.tika.parser.microsoft.ooxml.OOXMLParser
Added: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java?rev=1040992&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java (added)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java Wed Dec 1 13:00:57 2010
@@ -0,0 +1,119 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mail;
+
+import static org.mockito.Matchers.any;
+import static org.mockito.Matchers.eq;
+import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.never;
+import static org.mockito.Mockito.verify;
+
+import java.io.InputStream;
+
+import junit.framework.TestCase;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.helpers.DefaultHandler;
+
+public class RFC822ParserTest extends TestCase {
+
+ public void testSimple() {
+ Parser parser = new RFC822Parser();
+ Metadata metadata = new Metadata();
+ InputStream stream = getStream("test-documents/testRFC822");
+ ContentHandler handler = mock(DefaultHandler.class);
+
+ try {
+ parser.parse(stream, handler, metadata, new ParseContext());
+ verify(handler).startDocument();
+ //just one body
+ verify(handler).startElement(eq(XHTMLContentHandler.XHTML), eq("p"), eq("p"), any(Attributes.class));
+ verify(handler).endElement(XHTMLContentHandler.XHTML, "p", "p");
+ //no multi-part body parts
+ verify(handler, never()).startElement(eq(XHTMLContentHandler.XHTML), eq("div"), eq("div"), any(Attributes.class));
+ verify(handler, never()).endElement(XHTMLContentHandler.XHTML, "div", "div");
+ verify(handler).endDocument();
+ //note no leading spaces
+ assertEquals("\"Julien Nioche (JIRA)\" <ji...@apache.org>", metadata.get(Metadata.AUTHOR));
+ assertEquals("[jira] Commented: (TIKA-461) RFC822 messages not parsed", metadata.get(Metadata.SUBJECT));
+ } catch (Exception e) {
+ fail("Exception thrown: " + e.getMessage());
+ }
+ }
+
+ public void testQuotedPrintable() {
+ Parser parser = new RFC822Parser();
+ Metadata metadata = new Metadata();
+ InputStream stream = getStream("test-documents/testRFC822_quoted");
+ ContentHandler handler = mock(DefaultHandler.class);
+
+ try {
+ parser.parse(stream, handler, metadata, new ParseContext());
+ //tests correct decoding of quoted printable text, including UTF-8 bytes into Unicode
+ verify(handler).characters(new String("D\u00FCsseldorf has non-ascii. "
+ + "Lines can be split like this. Spaces at the end of a line \r\n"
+ + "must be encoded.\r\n").toCharArray(), 0, 104);
+ } catch (Exception e) {
+ fail("Exception thrown: " + e.getMessage());
+ }
+ }
+
+ public void testBase64() {
+ Parser parser = new RFC822Parser();
+ Metadata metadata = new Metadata();
+ InputStream stream = getStream("test-documents/testRFC822_base64");
+ ContentHandler handler = mock(DefaultHandler.class);
+
+ try {
+ parser.parse(stream, handler, metadata, new ParseContext());
+ //tests correct decoding of base64 text, including ISO-8859-1 bytes into Unicode
+ verify(handler).characters(new String(
+ "Here is some text, with international characters, voil\u00E0!\r\n"
+ ).toCharArray(), 0, 58);
+ } catch (Exception e) {
+ fail("Exception thrown: " + e.getMessage());
+ }
+ }
+
+ public void testI18NHeaders() {
+ Parser parser = new RFC822Parser();
+ Metadata metadata = new Metadata();
+ InputStream stream = getStream("test-documents/testRFC822_i18nheaders");
+ ContentHandler handler = mock(DefaultHandler.class);
+
+ try {
+ parser.parse(stream, handler, metadata, new ParseContext());
+ //tests correct decoding of internationalized headers, both
+ //quoted-printable (Q) and Base64 (B).
+ assertEquals("Keld J\u00F8rn Simonsen <ke...@dkuug.dk>", metadata.get(Metadata.AUTHOR));
+ assertEquals("If you can read this you understand the example.", metadata.get(Metadata.SUBJECT));
+ } catch (Exception e) {
+ fail("Exception thrown: " + e.getMessage());
+ }
+ }
+
+ private static InputStream getStream(String name) {
+ return Thread.currentThread().getContextClassLoader()
+ .getResourceAsStream(name);
+ }
+
+}
Propchange: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testRFC822
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testRFC822?rev=1040992&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/test/resources/test-documents/testRFC822 (added)
+++ tika/trunk/tika-parsers/src/test/resources/test-documents/testRFC822 Wed Dec 1 13:00:57 2010
@@ -0,0 +1,41 @@
+From: "Julien Nioche (JIRA)" <ji...@apache.org>
+To: dev@tika.apache.org
+Subject: [jira] Commented: (TIKA-461) RFC822 messages not parsed
+Reply-To: dev@tika.apache.org
+Delivered-To: mailing list dev@tika.apache.org
+Date: Mon, 6 Sep 2010 05:25:34 -0400 (EDT)
+In-Reply-To: <60...@thor>
+MIME-Version: 1.0
+Content-Type: text/plain; charset=utf-8
+Content-Transfer-Encoding: 7bit
+X-JIRA-FingerPrint: 30527f35849b9dde25b450d4833f0394
+X-Virus-Checked: Checked by ClamAV on apache.org
+
+
+ [ https://issues.apache.org/jira/browse/TIKA-461?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=12906468#action_12906468 ]
+
+Julien Nioche commented on TIKA-461:
+------------------------------------
+
+I'll have a look at mime4j and try to use it in Tika
+
+> RFC822 messages not parsed
+> --------------------------
+>
+> Key: TIKA-461
+> URL: https://issues.apache.org/jira/browse/TIKA-461
+> Project: Tika
+> Issue Type: Bug
+> Components: parser
+> Affects Versions: 0.7
+> Reporter: Joshua Turner
+> Assignee: Julien Nioche
+>
+> Presented with an RFC822 message exported from Thunderbird, AutodetectParser produces an empty body, and a Metadata containing only one key-value pair: "Content-Type=message/rfc822". Directly calling MboxParser likewise gives an empty body, but with two metadata pairs: "Content-Encoding=us-ascii Content-Type=application/mbox".
+> A quick peek at the source of MboxParser shows that the implementation is pretty naive. If the wiring can be sorted out, something like Apache James' mime4j might be a better bet.
+
+--
+This message is automatically generated by JIRA.
+-
+You can reply to this email to add a comment to the issue online.
+
Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testRFC822-multipart
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testRFC822-multipart?rev=1040992&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/test/resources/test-documents/testRFC822-multipart (added)
+++ tika/trunk/tika-parsers/src/test/resources/test-documents/testRFC822-multipart Wed Dec 1 13:00:57 2010
@@ -0,0 +1,110 @@
+
+MIME-Version: 1.0
+Sender: digitalpebble@googlemail.com
+Received: by 10.231.31.200 with HTTP; Mon, 27 Sep 2010 06:29:16 -0700 (PDT)
+Date: Mon, 27 Sep 2010 14:29:16 +0100
+Delivered-To: digitalpebble@gmail.com
+X-Google-Sender-Auth: it4o8JYLKcQ5bHJbTpqDhZv46vk
+Message-ID: <AA...@mail.gmail.com>
+Subject: Test Multi Part Message
+From: DigitalPebble <ju...@digitalpebble.com>
+To: lists.digitalpebble@gmail.com
+Content-Type: multipart/mixed; boundary=0016e64606800312ee04913db790
+
+--0016e64606800312ee04913db790
+Content-Type: multipart/alternative; boundary=0016e64606800312ea04913db78e
+
+--0016e64606800312ea04913db78e
+Content-Type: text/plain; charset=UTF-8
+
+This is a test for parsing multi-part mails. With some funky HTML code an a
+picture attached.
+
+--
+**
+*
+Open Source Solutions for Text Engineering
+
+http://digitalpebble.blogspot.com
+http://www.digitalpebble.com*
+
+--0016e64606800312ea04913db78e
+Content-Type: text/html; charset=UTF-8
+Content-Transfer-Encoding: quoted-printable
+
+This is a test for parsing multi-part mails. With<span style=3D"color: rgb(=
+204, 0, 0);"> some funky HTML code</span> an a picture attached.<br clear=
+=3D"all"><br>-- <br><font face=3D"arial, helvetica, sans-serif"><b><span st=
+yle=3D"font-family: arial; font-weight: normal;"><b style=3D"color: rgb(0, =
+0, 0); font-family: arial,helvetica,sans-serif;"><img src=3D"http://digital=
+pebble.com/img/logo.gif" height=3D"38" width=3D"200"></b></span></b></font>=
+<div>
+<font face=3D"arial, helvetica, sans-serif"><b><span style=3D"font-family: =
+arial; font-weight: normal;"><b style=3D"color: rgb(0, 0, 0); font-family: =
+arial,helvetica,sans-serif;"><span style=3D"font-size: x-small;">=C2=A0</sp=
+an><br style=3D"font-family: arial,helvetica,sans-serif;">
+</b><span style=3D"color: rgb(102, 102, 102); font-family: arial,helvetica,=
+sans-serif;"><span style=3D"color: rgb(51, 51, 51);">Open Source Solutions =
+for Text Engineering</span><br>
+<span style=3D"font-size: x-small;">=C2=A0</span><br>
+</span></span><span style=3D"color: rgb(102, 102, 102);"><span style=3D"fon=
+t-weight: normal;"><a href=3D"http://digitalpebble.blogspot.com" target=3D"=
+_blank">http://digitalpebble.blogspot.com</a></span></span><span style=3D"f=
+ont-weight: normal;"><br style=3D"color: rgb(102, 102, 102);">
+</span>
+<span style=3D"color: rgb(102, 102, 102);"><span style=3D"font-weight: norm=
+al;"><a href=3D"http://www.digitalpebble.com" target=3D"_blank">http://www.=
+digitalpebble.com</a></span></span></b></font></div><br>
+
+--0016e64606800312ea04913db78e--
+--0016e64606800312ee04913db790
+Content-Type: image/gif; name="logo.gif"
+Content-Disposition: attachment; filename="logo.gif"
+Content-Transfer-Encoding: base64
+X-Attachment-Id: f_geldjvqq0
+
+R0lGODlhNgE8AMQAALxlVPv19JmZmaysrNnZ2cR4acXFxaWlpd2yqeXl5dWelObFv82Lfu7Y1Ozs
+7L+/v////8BuXtGVibKysszMzPLi39/f3/fs6tmonsiBdOrPyeK7tP4BAgAAAAAAAAAAACH5BAUU
+ABwALAAAAAA2ATwAAAX/ICSOZGmeaKqubOu+cCzPdG3feK7vfO//wKBwSCwaj8ikcslsOp/QqHRK
+rVqv2KwW2mj0KprfZbEtm7eBBoIBACByjY0kAmDsKgtFoY0lTP4TDiYOgBMERISAh1IGhScLCJES
+DJSVEpEaFUxtnG44bJ12OQidfFcUAqkCCSYJqgIURK6qsSoGrEUTryegpb6cEgsBSL5vN71tojik
+pVioqrgks6m1QtOwKQSp0UO6qry/4ZwRGMNFxZ+lyjfMnc6v3CLX1UHzJ4TQRt6p4OL+EZqIoDum
+Tkc7Tu/ylbA3hGGJZ9v07TKBzF84L0MG2qi4zsZBU1YgrjIRgIJJCvF+/5Q8mRKCyJY/9gnoZ1Fc
+hAsZSxnbWHCUr4QRy7yU+I2iry5dNCCYI05Bzk47a3A0+PMUPDNDc00sUREAigV7fpkDolFqz2VV
+Q14VupaITJoIUQSQ8ItMkLI0pvpsZlXhlqxut5LoqiJA2E4YhOCdoRct3yUWKAAyQGAsYGkJMo9V
+QaDRH8qbM2s+ITpB6MwPXqEUXTjyg0IGULZ4a/QxCgy+OpoYg6ESAwTCWqC7sEFBJQwbArKYGmBB
+b0oKEGB08dHriwrFLSEIE0PkK1jDLstTXTj1d1rDrqV0COG8exQEBrj/TmGzCdpc06JokDtFg66d
+SDAdCr4ooIA4BSBgX/9tnGSAAB2/RKAATixU5wJYNk3YggXzvTJASW2NwF4JHHaYygAOqNcKeSOY
+WBQJDhzg4isHCIICfoPpdwJ/Z5FwAYC/SLDgCDVFyB0KQFoUFQoWrtDAYUquoM2MAhxgQIjj0QIf
+ldRgCQF7VJbgAJffDQkBjiMQpgKPoZhQAYRFAgAQgXH6khiSdapjpghNprBAngBkYOY1qUywGgHm
+ubceiwud98BqFCR63qJajnCSTI+eREIA5w1AgQWiRerojYKlqaMJ1S3ZAJx1zmkCoKXcySCsGajQ
+5wkbwBrokDKqckBKAcjkl4iMktBrKr+2cqyXI4og3giSjnSPfK8Miab/CGqmAGUbdonwZjgZULJt
+gwsiSEkG/hyZY4SV+CMrqqeO8Gc4lbDaiVMP0bhntNJKU6yzHu5JLbP/AjzsCJx6WcJ3LV0LQba4
+/kKhCOiWEoGCPlbnSQm/aEiCBnT5EsGCXTEwIAQV4PbLySPc6qO9ABTQrbchl3LysjamMHBQ/lZK
+wnc5n5AwwT6T8KxLr0zAwpVFj+AwxJtqXGvLIitXwredjMxxrCloADO+6zaYQgUVd1IAk/E+7Iug
+X/U3QompPMACoZRSQ+IrBrDgXd3YmHC0SBawAHffJTzty8Qoa4ABzNySALPVJszbyQZbQ6WC5OPY
+19gJWHfCMgQue+tL/wF7QnBgKcoxnUrQKSzLNz0QSFp6i0Tb7bfCIgSQAOwoNHtmqdjqysk6mLcB
+trY9QrBYfqXMrHabthZ4Quim++L8bhrt7ILq/RLb9LEHuCDT6yccLYPvhgt/Ewk1cwL59Dou/7H0
+YSezQgCjw2/bCPZG0EJjSXPB4Mj3M1XIrQXP8p35XOAARNHnBOmDVQEgt62zsUADvrCa/EjgNlNB
+TwVQ49OpLkC/FahsHLl7xQFZMKaDfalYvitf7QhnNNz1jgCRElbBfvei+tVJSK8S3lE4qBMWjIt5
+w2PB6TwHr/1BgE1CjEsMTzBDekyxUS5UoA1H4EAy8Y6H/JhVkRTwOf8oRhEAA9qgB93hQ900kYkl
+CJ3GhJcl27Wgij2z4wpUhEU9WmqL3uHSFyPYIHX8Rl0mMGMUNVjEFUBtc2iz2RvjQoI5wsp/L2ya
+CvDoPT+mgI95pOEfXTiCnc3HUCnaISFBEgNFCrFynFgSXFj5vCSuoDosk+MZbXnFhWURhjuU4S81
+iTRSQsCUhXqUaUI5SODVkpKt3GUbpkakRqqgbG3A5BrtZ8IMTpKWoJMmAGSlQhcMrXt1pOE5V7iC
+BO7wb995AEzS2cwebhOcLiChNW2gRhF0MHgfTEEIw+nE4qHxBh5yASg7KcqEtkBSBMwXKb+zCM6o
+0pkDdcE/a9DPCtj/CYncVIG9qFnJUylSljGQyewMxjOGwk6ld+SkRFuaTnamQCT1DCNI8ekCbMpp
+pSzoZ3WcB8lElrCkTsRf8mQgkoqqYGcRZakAnIqCwaEzk54sJk21KoDAsWBnOZ2JGK0zgxPGEgdq
+NIwvNLfUErSvDYgUoRMh4NOD1uCc4dsjw1akSX2tQFhRHeVWFziCFhJzlWSVgUct9j6SqECNS+TE
+8QBqy7aVwoLfTOwIclUKtt0vBZLK21P32kdRxq6cKeDeVrXoQpx+9YH3wWjaXNCVCMS1kpg9gS8k
+IJfIcoJlFSEdCji7T6R24n0BgFkGGps7CUwWRhQF7aT4mtV4Dkm1/6t9Z8GukVfpxhOCsp3rC1xZ
+hwUo5wINWFzj6NRZDZjjAopj3HOfmc0FUCgNCBhXzFJQHcpl1ngamBgYloi4Eqj2AfZJADKv6rsp
++YoCNgoAAZYlUxJYNSXWJQ2FCwVee1IWmmXVVW5hKcEhJalIzCWo2VBQ1zGO9juGooABdgZV6ppW
+BPzqUI1L+0XDnsccgTQAqBLQmWXtTGmx9TB9NTsD38bpev5UH3NPbBEojwCDR0VYi2tS4BIsuFMD
+tPEXcUwlHFbYWPOJhg47hGDtJVmnPmRyk2nFXkAtV6C6su396LWgAGzZXe100QR0d2YSbdhXrBAJ
+67B6YwhYtWg5Pv/PIgD3ZrHuVM4z0IB+6dXlKGfzreEoBwjvVSQGdNoETgbAfPnEuFCvNAA5PsAi
+FkpPwT1gYAd4gFe5SppgfmnDNo3PfDy1mQAWLrxs1MEC/hwzDKRYeZygXANA3YYCYODUYePtBRTA
+OAl9Ti4+fZfQ8uuPDGwA2707CQEWjQSHxcABJ0HJkHQX72WeYVMNgEQkgNMAoO7bv7lTw74bgO4S
+6BtjI4jDvjNBA6UAB6giQO8G9r2db99bBPvo7sU3zvGO14ACA3hUZWLKYY+b/OQoF2Yqdp2CwY05
+5TCP+RYGZ9MT7IzdMs+5zqkQXe+WfOdAD3oUovWARU8YaEJPutI9l3DKPxyaqkuPutR94OMyT/3q
+WO9BpGGM86x7/evmnLF7BmCAroP97GhXQYoyY/a0u/3tcI+73OdOd46HAAA7
+--0016e64606800312ee04913db790--
Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testRFC822_base64
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testRFC822_base64?rev=1040992&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/test/resources/test-documents/testRFC822_base64 (added)
+++ tika/trunk/tika-parsers/src/test/resources/test-documents/testRFC822_base64 Wed Dec 1 13:00:57 2010
@@ -0,0 +1,8 @@
+To: Nobody <no...@somewhere.com>
+From: Nowhere <no...@nowhere.com>
+Subject: This tests a base64 encoded body
+MIME-Version: 1.0
+Content-Type: text/plain; charset=ISO-8859-1
+Content-Transfer-Encoding: base64
+
+SGVyZSBpcyBzb21lIHRleHQsIHdpdGggaW50ZXJuYXRpb25hbCBjaGFyYWN0ZXJzLCB2b2ls4CE=
Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testRFC822_i18nheaders
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testRFC822_i18nheaders?rev=1040992&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/test/resources/test-documents/testRFC822_i18nheaders (added)
+++ tika/trunk/tika-parsers/src/test/resources/test-documents/testRFC822_i18nheaders Wed Dec 1 13:00:57 2010
@@ -0,0 +1,9 @@
+From: =?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= <ke...@dkuug.dk>
+To: Nobody in Particular <a....@example.com>
+Subject: =?ISO-8859-1?B?SWYgeW91IGNhbiByZWFkIHRoaXM=?=
+ =?ISO-8859-2?B?eW91IHVuZGVyc3RhbmQgdGhlIGV4YW1wbGUu?=
+MIME-Version: 1.0
+Content-type: text/plain
+Content-transfer-encoding: 7bit
+
+Examples taken from RFC 2047.
Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testRFC822_quoted
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testRFC822_quoted?rev=1040992&view=auto
==============================================================================
--- tika/trunk/tika-parsers/src/test/resources/test-documents/testRFC822_quoted (added)
+++ tika/trunk/tika-parsers/src/test/resources/test-documents/testRFC822_quoted Wed Dec 1 13:00:57 2010
@@ -0,0 +1,13 @@
+Delivered-To: a.person@example.com
+Return-Path: <an...@another-example.com>
+MIME-Version: 1.0
+Date: Fri, 26 Nov 2010 19:57:53 +0000
+Subject: Sample with Quoted Printable Text
+From: Another Person <an...@another-example.com>
+To: A. Person <a....@example.com>
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: quoted-printable
+
+D=C3=BCsseldorf has non-ascii. Lines can be spl=
+it like this. Spaces at the end of a line=20
+must be encoded.