You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by lf...@apache.org on 2020/11/26 22:28:19 UTC
[tika] 01/02: TIKA-3004: Fix parsing of emails attached to other emails in PST files
This is an automated email from the ASF dual-hosted git repository.
lfcnassif pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
commit 02bf521ba11f22d5de636c7de41fe8643497246a
Author: Luis Nassif <lf...@gmail.com>
AuthorDate: Thu Nov 26 18:14:18 2020 -0300
TIKA-3004: Fix parsing of emails attached to other emails in PST files
---
CHANGES.txt | 2 +
.../parser/microsoft/pst/OutlookPSTParser.java | 57 +++++++++++++---------
2 files changed, 36 insertions(+), 23 deletions(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index 6591f41..faf8913 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -16,6 +16,8 @@ Release 2.0.0 - ???
Release 1.26 - ???
* Great optimization in ForkParser (TIKA-3237).
+
+ * Fix parsing of emails attached to other emails in PST files (TIKA-3004).
Release 1.25 - 11/25/2020
diff --git a/tika-parser-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParser.java b/tika-parser-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParser.java
index e90077d..360e4a2 100644
--- a/tika-parser-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParser.java
+++ b/tika-parser-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParser.java
@@ -25,12 +25,6 @@ import java.io.IOException;
import java.io.InputStream;
import java.util.Set;
-import com.pff.PSTAttachment;
-import com.pff.PSTException;
-import com.pff.PSTFile;
-import com.pff.PSTFolder;
-import com.pff.PSTMessage;
-import com.pff.PSTRecipient;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
@@ -48,6 +42,13 @@ import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;
+import com.pff.PSTAttachment;
+import com.pff.PSTException;
+import com.pff.PSTFile;
+import com.pff.PSTFolder;
+import com.pff.PSTMessage;
+import com.pff.PSTRecipient;
+
/**
* Parser for MS Outlook PST email storage files
*/
@@ -115,23 +116,7 @@ public class OutlookPSTParser extends AbstractParser {
if (pstFolder.getContentCount() > 0) {
PSTMessage pstMail = (PSTMessage) pstFolder.getNextChild();
while (pstMail != null) {
- AttributesImpl attributes = new AttributesImpl();
- attributes.addAttribute("", "class", "class", "CDATA", "embedded");
- attributes.addAttribute("", "id", "id", "CDATA", pstMail.getInternetMessageId());
- handler.startElement("div", attributes);
- handler.element("h1", pstMail.getSubject());
-
- final Metadata mailMetadata = new Metadata();
- //parse attachments first so that stream exceptions
- //in attachments can make it into mailMetadata.
- //RecursiveParserWrapper copies the metadata and thereby prevents
- //modifications to mailMetadata from making it into the
- //metadata objects cached by the RecursiveParserWrapper
- parseMailAttachments(handler, pstMail, mailMetadata, embeddedExtractor);
- parserMailItem(handler, pstMail, mailMetadata, embeddedExtractor);
-
- handler.endElement("div");
-
+ parseMailAndAttachments(handler, pstMail, embeddedExtractor);
pstMail = (PSTMessage) pstFolder.getNextChild();
}
}
@@ -146,6 +131,26 @@ public class OutlookPSTParser extends AbstractParser {
}
}
+ private void parseMailAndAttachments(XHTMLContentHandler handler, PSTMessage pstMail,
+ EmbeddedDocumentExtractor embeddedExtractor) throws SAXException, IOException, TikaException {
+ AttributesImpl attributes = new AttributesImpl();
+ attributes.addAttribute("", "class", "class", "CDATA", "embedded");
+ attributes.addAttribute("", "id", "id", "CDATA", pstMail.getInternetMessageId());
+ handler.startElement("div", attributes);
+ handler.element("h1", pstMail.getSubject());
+
+ final Metadata mailMetadata = new Metadata();
+ // Parse the attachments before the mail item itself so that stream
+ // exceptions raised by attachments can be recorded in mailMetadata.
+ // RecursiveParserWrapper copies the metadata, which prevents later
+ // modifications to mailMetadata from reaching the metadata objects
+ // that the RecursiveParserWrapper has cached.
+ parseMailAttachments(handler, pstMail, mailMetadata, embeddedExtractor);
+ parserMailItem(handler, pstMail, mailMetadata, embeddedExtractor);
+
+ handler.endElement("div");
+ }
+
private void parserMailItem(XHTMLContentHandler handler, PSTMessage pstMail, Metadata mailMetadata,
EmbeddedDocumentExtractor embeddedExtractor) throws SAXException, IOException {
mailMetadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, pstMail.getInternetMessageId());
@@ -231,6 +236,12 @@ public class OutlookPSTParser extends AbstractParser {
try {
PSTAttachment attach = email.getAttachment(i);
+ PSTMessage attachedEmail = attach.getEmbeddedPSTMessage();
+ if (attachedEmail != null) {
+ parseMailAndAttachments(xhtml, attachedEmail, embeddedExtractor);
+ continue;
+ }
+
// Get the filename; both long and short filenames can be used for attachments
String filename = attach.getLongFilename();
if (filename.isEmpty()) {