You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by lf...@apache.org on 2020/11/26 21:16:19 UTC
[tika] branch branch_1x updated: TIKA-3004: Fix parsing of emails attached to other emails in PST files
This is an automated email from the ASF dual-hosted git repository.
lfcnassif pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/branch_1x by this push:
new a4018a9 TIKA-3004: Fix parsing of emails attached to other emails in PST files
a4018a9 is described below
commit a4018a98a62c1adf19fc850e50945f9deaab13c1
Author: Luis Nassif <lf...@gmail.com>
AuthorDate: Thu Nov 26 18:14:18 2020 -0300
TIKA-3004: Fix parsing of emails attached to other emails in PST files
---
CHANGES.txt | 2 +
.../apache/tika/parser/mbox/OutlookPSTParser.java | 45 ++++++++++++++--------
2 files changed, 30 insertions(+), 17 deletions(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index c620efc..caf5628 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,6 +1,8 @@
Release 1.26 - xx/xx/xxxx
* Great optimization in ForkParser (TIKA-3237).
+
+ * Fix parsing of emails attached to other emails in PST files (TIKA-3004).
Release 1.25 - 11/25/2020
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/mbox/OutlookPSTParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/mbox/OutlookPSTParser.java
index e23496e..af1fbc6 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/mbox/OutlookPSTParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/mbox/OutlookPSTParser.java
@@ -115,23 +115,7 @@ public class OutlookPSTParser extends AbstractParser {
if (pstFolder.getContentCount() > 0) {
PSTMessage pstMail = (PSTMessage) pstFolder.getNextChild();
while (pstMail != null) {
- AttributesImpl attributes = new AttributesImpl();
- attributes.addAttribute("", "class", "class", "CDATA", "embedded");
- attributes.addAttribute("", "id", "id", "CDATA", pstMail.getInternetMessageId());
- handler.startElement("div", attributes);
- handler.element("h1", pstMail.getSubject());
-
- final Metadata mailMetadata = new Metadata();
- //parse attachments first so that stream exceptions
- //in attachments can make it into mailMetadata.
- //RecursiveParserWrapper copies the metadata and thereby prevents
- //modifications to mailMetadata from making it into the
- //metadata objects cached by the RecursiveParserWrapper
- parseMailAttachments(handler, pstMail, mailMetadata, embeddedExtractor);
- parserMailItem(handler, pstMail, mailMetadata, embeddedExtractor);
-
- handler.endElement("div");
-
+ parseMailAndAttachments(handler, pstMail, embeddedExtractor);
pstMail = (PSTMessage) pstFolder.getNextChild();
}
}
@@ -145,6 +129,27 @@ public class OutlookPSTParser extends AbstractParser {
}
}
}
+
+ private void parseMailAndAttachments(XHTMLContentHandler handler, PSTMessage pstMail,
+ EmbeddedDocumentExtractor embeddedExtractor) throws SAXException, IOException, TikaException {
+
+ AttributesImpl attributes = new AttributesImpl();
+ attributes.addAttribute("", "class", "class", "CDATA", "embedded");
+ attributes.addAttribute("", "id", "id", "CDATA", pstMail.getInternetMessageId());
+ handler.startElement("div", attributes);
+ handler.element("h1", pstMail.getSubject());
+
+ final Metadata mailMetadata = new Metadata();
+ //parse attachments first so that stream exceptions
+ //in attachments can make it into mailMetadata.
+ //RecursiveParserWrapper copies the metadata and thereby prevents
+ //modifications to mailMetadata from making it into the
+ //metadata objects cached by the RecursiveParserWrapper
+ parseMailAttachments(handler, pstMail, mailMetadata, embeddedExtractor);
+ parserMailItem(handler, pstMail, mailMetadata, embeddedExtractor);
+
+ handler.endElement("div");
+ }
private void parserMailItem(XHTMLContentHandler handler, PSTMessage pstMail, Metadata mailMetadata,
EmbeddedDocumentExtractor embeddedExtractor) throws SAXException, IOException {
@@ -230,6 +235,12 @@ public class OutlookPSTParser extends AbstractParser {
for (int i = 0; i < numberOfAttachments; i++) {
try {
PSTAttachment attach = email.getAttachment(i);
+
+ PSTMessage attachedEmail = attach.getEmbeddedPSTMessage();
+ if(attachedEmail != null) {
+ parseMailAndAttachments(xhtml, attachedEmail, embeddedExtractor);
+ continue;
+ }
// Get the filename; both long and short filenames can be used for attachments
String filename = attach.getLongFilename();