You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2018/02/01 16:48:01 UTC
[tika] branch branch_1x updated: TIKA-2547: RFC822 with multipart/mixed, first text element should be treated as the main body of the email, not an attachment.
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/branch_1x by this push:
new 6829643 TIKA-2547: RFC822 with multipart/mixed, first text element should be treated as the main body of the email, not an attachment.
6829643 is described below
commit 68296437b23052ebc4415a9cec9aadc14141f634
Author: tballison <ta...@mitre.org>
AuthorDate: Wed Jan 31 13:33:08 2018 -0500
TIKA-2547: RFC822 with multipart/mixed, first text element should be treated as the main body of the email, not an attachment.
---
CHANGES.txt | 8 ++++
.../tika/parser/mail/MailContentHandler.java | 52 ++++++++++++++++++----
.../apache/tika/parser/mail/RFC822ParserTest.java | 19 ++++++++
.../resources/test-documents/testRFC822-txt-body | 35 +++++++++++++++
4 files changed, 106 insertions(+), 8 deletions(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index 1399520..b1ca828 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,7 +1,15 @@
Release 1.18 - ???
+
+ * RFC822 with multipart/mixed, first text element should be treated
+ as the main body of the email, not an attachment (TIKA-2547).
+
* Swap out com.tdunning:json for com.github.openjson:openjson to avoid
jar conflicts (TIKA-2556).
+ * No longer hardcode HtmlParser for XML files in tika-server (TIKA-2551).
+
+ * Require Java 8 (TIKA-2553).
+
* Add a parser for XPS (TIKA-2524).
* Mime magic for Dolby Digital AC3 and EAC3 files
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java b/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
index 40db8f3..ddc32b8 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
@@ -33,6 +33,7 @@ import org.apache.james.mime4j.message.MaximalBodyDescriptor;
import org.apache.james.mime4j.parser.ContentHandler;
import org.apache.james.mime4j.stream.BodyDescriptor;
import org.apache.james.mime4j.stream.Field;
+import org.apache.tika.detect.Detector;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
@@ -147,6 +148,7 @@ class MailContentHandler implements ContentHandler {
private boolean strictParsing = false;
private final boolean extractAllAlternatives;
private final EmbeddedDocumentExtractor extractor;
+ private final Detector detector;
//this is used to buffer a multipart body that
//keeps track of multipart/alternative and its children
@@ -167,6 +169,7 @@ class MailContentHandler implements ContentHandler {
// Was an EmbeddedDocumentExtractor explicitly supplied?
this.extractor = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
+ this.detector = new EmbeddedDocumentUtil(context).getDetector();
}
@Override
@@ -184,16 +187,16 @@ class MailContentHandler implements ContentHandler {
if (parts.size() > 0) {
submd.set(Message.MULTIPART_SUBTYPE, parts.peek().getSubType());
submd.set(Message.MULTIPART_BOUNDARY, parts.peek().getBoundary());
- }
+ }
if (body instanceof MaximalBodyDescriptor) {
MaximalBodyDescriptor maximalBody = (MaximalBodyDescriptor) body;
String contentDispositionType = maximalBody.getContentDispositionType();
if (contentDispositionType != null && !contentDispositionType.isEmpty()) {
- StringBuilder contentDisposition = new StringBuilder( contentDispositionType );
+ StringBuilder contentDisposition = new StringBuilder(contentDispositionType);
Map<String, String> contentDispositionParameters = maximalBody.getContentDispositionParameters();
- for ( Entry<String, String> param : contentDispositionParameters.entrySet() ) {
+ for (Entry<String, String> param : contentDispositionParameters.entrySet()) {
contentDisposition.append("; ")
- .append(param.getKey()).append("=\"").append(param.getValue()).append('"');
+ .append(param.getKey()).append("=\"").append(param.getValue()).append('"');
}
String contentDispositionFileName = maximalBody.getContentDispositionFilename();
@@ -201,15 +204,31 @@ class MailContentHandler implements ContentHandler {
submd.set( Metadata.RESOURCE_NAME_KEY, contentDispositionFileName );
}
- submd.set( Metadata.CONTENT_DISPOSITION, contentDisposition.toString() );
+ submd.set(Metadata.CONTENT_DISPOSITION, contentDisposition.toString());
}
}
//if we're in a multipart/alternative or any one of its children
//add the bodypart to the latest that was added
- if (! extractAllAlternatives && alternativePartBuffer.size() > 0) {
+ if (!extractAllAlternatives && alternativePartBuffer.size() > 0) {
ByteArrayOutputStream bos = new ByteArrayOutputStream();
IOUtils.copy(is, bos);
alternativePartBuffer.peek().children.add(new BodyContents(submd, bos.toByteArray()));
+ } else if (!extractAllAlternatives && parts.size() == 1) {
+ //if you're at the first level of embedding
+ //and you're not in an alternative part block
+ //and you're a text type (e.g. plain or html), put that in the body of the email
+ //otherwise treat as a regular attachment
+ ByteArrayOutputStream bos = new ByteArrayOutputStream();
+ IOUtils.copy(is, bos);
+ byte[] bytes = bos.toByteArray();
+ if (isTextOrHtml(submd, bytes)) {
+ handleInlineBodyPart(new BodyContents(submd, bos.toByteArray()));
+ } else {
+ //else handle as you would any other embedded content
+ try (TikaInputStream tis = TikaInputStream.get(bytes)) {
+ handleEmbedded(tis, submd);
+ }
+ }
} else {
//else handle as you would any other embedded content
try (TikaInputStream tis = TikaInputStream.get(is)) {
@@ -218,6 +237,22 @@ class MailContentHandler implements ContentHandler {
}
}
+ private boolean isTextOrHtml(Metadata submd, byte[] bytes) { //true if the declared or detected media type starts with "text"
+ String mediaTypeString = submd.get(Metadata.CONTENT_TYPE); //type declared in the part's metadata; may be null
+ if (mediaTypeString != null && mediaTypeString.startsWith("text")) { //prefix match, so any text/* subtype qualifies
+ return true;
+ }
+ try (TikaInputStream tis = TikaInputStream.get(bytes)) { //declared type was missing or non-text: fall back to detection on the bytes
+ MediaType mediaType = detector.detect(tis, submd);
+ if (mediaType != null && mediaType.toString().startsWith("text")) {
+ return true;
+ }
+ } catch (IOException e) {
+ //swallowed on purpose: if detection fails, fall through and report the part as non-text
+ }
+ return false;
+ }
+
private void handleEmbedded(TikaInputStream tis, Metadata metadata) throws MimeException, IOException {
String disposition = metadata.get(Metadata.CONTENT_DISPOSITION);
@@ -516,7 +551,7 @@ class MailContentHandler implements ContentHandler {
}
if (part instanceof BodyContents) {
- handlePart((BodyContents)part);
+ handleInlineBodyPart((BodyContents)part);
return;
}
@@ -539,7 +574,7 @@ class MailContentHandler implements ContentHandler {
}
}
- private void handlePart(BodyContents part) throws MimeException, IOException {
+ private void handleInlineBodyPart(BodyContents part) throws MimeException, IOException {
String contentType = part.metadata.get(Metadata.CONTENT_TYPE);
Parser parser = null;
if (MediaType.TEXT_HTML.toString().equalsIgnoreCase(contentType)) {
@@ -555,6 +590,7 @@ class MailContentHandler implements ContentHandler {
if (parser == null) {
+ //back off and treat it as an embedded chunk
try (TikaInputStream tis = TikaInputStream.get(part.bytes)) {
handleEmbedded(tis, part.metadata);
}
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java
index 7b48f13..0e8c237 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java
@@ -260,6 +260,25 @@ public class RFC822ParserTest extends TikaTest {
metadata.get(Metadata.SUBJECT));
}
+ @Test
+ public void testMainBody() throws Exception {
+ //test that the first text or html chunk is processed in the main body
+ //not treated as an attachment. TIKA-2547
+ List<Metadata> metadataList = getRecursiveMetadata("testRFC822_oddfrom");
+ assertEquals(7, metadataList.size());
+ assertContains("Air Quality Planning", metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT));
+
+ //Make sure text alternative doesn't get treated as an attachment
+ metadataList = getRecursiveMetadata("testRFC822_normal_zip");
+ assertEquals(3, metadataList.size());
+ assertContains("This is the HTML part", metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT));
+ assertEquals("application/zip", metadataList.get(2).get(Metadata.CONTENT_TYPE));
+ //multipart/mixed fixture: a text/plain part followed by a jpeg attachment; the text should land in entry 0
+ metadataList = getRecursiveMetadata("testRFC822-txt-body");
+ assertEquals(2, metadataList.size());
+ assertContains("body 1", metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT));
+ }
+
/**
* Test for TIKA-640, increase header max beyond 10k bytes
*/
diff --git a/tika-parsers/src/test/resources/test-documents/testRFC822-txt-body b/tika-parsers/src/test/resources/test-documents/testRFC822-txt-body
new file mode 100644
index 0000000..de28397
--- /dev/null
+++ b/tika-parsers/src/test/resources/test-documents/testRFC822-txt-body
@@ -0,0 +1,35 @@
+MIME-Version: 1.0
+Received: by 10.103.33.199 with HTTP; Tue, 6 Jun 2017 14:48:27 -0700 (PDT)
+Bcc: emailtosalesforce@r-kub1lq8760pccrdt39x94qxtajhk3q4zb1fzikf15ygnugofn.6a-euhkuaa.na50.le.salesforce.com
+Date: Tue, 6 Jun 2017 14:48:27 -0700
+Delivered-To: john.doe@gmail.com
+Message-ID: <CA...@mail.gmail.com>
+Subject: Test BCCing email (rev 2)
+From: John Doe <jo...@gmail.com>
+To: john.smith@domain.com
+Content-Type: multipart/mixed; boundary="94eb2c03266668996305515194b6"
+
+This is a multipart message in MIME format.
+
+--94eb2c03266668996305515194b6
+Content-Type: text/plain; charset="UTF-8"
+Content-Transfer-Encoding: quoted-printable
+
+This is an email that will have some rich text and an attachment.
+
+*Because I've added some bold text here.*
+
+body 1
+*=E2=80=8B*
+*And here's some more text (still bold)*
+
+-- John
+
+--94eb2c03266668996305515194b6
+Content-Type: image/jpeg; name="mary-coffee.jpg"
+Content-Disposition: attachment; filename="mary-coffee.jpg"
+Content-Transfer-Encoding: base64
+X-Attachment-Id: f_j3m3jfpq1
+
+
+--94eb2c03266668996305515194b6--
\ No newline at end of file
--
To stop receiving notification emails like this one, please contact
tallison@apache.org.