You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by lf...@apache.org on 2020/11/26 22:28:19 UTC

[tika] 01/02: TIKA-3004: Fix parsing of emails attached to other emails in PST files

This is an automated email from the ASF dual-hosted git repository.

lfcnassif pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 02bf521ba11f22d5de636c7de41fe8643497246a
Author: Luis Nassif <lf...@gmail.com>
AuthorDate: Thu Nov 26 18:14:18 2020 -0300

    TIKA-3004: Fix parsing of emails attached to other emails in PST files
---
 CHANGES.txt                                        |  2 +
 .../parser/microsoft/pst/OutlookPSTParser.java     | 57 +++++++++++++---------
 2 files changed, 36 insertions(+), 23 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index 6591f41..faf8913 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -16,6 +16,8 @@ Release 2.0.0 - ???
 Release 1.26 - ???
 
    * Great optimization in ForkParser (TIKA-3237).
+   
+   * Fix parsing of emails attached to other emails in PST files (TIKA-3004).
 
 Release 1.25 - 11/25/2020
 
diff --git a/tika-parser-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParser.java b/tika-parser-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParser.java
index e90077d..360e4a2 100644
--- a/tika-parser-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParser.java
+++ b/tika-parser-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParser.java
@@ -25,12 +25,6 @@ import java.io.IOException;
 import java.io.InputStream;
 import java.util.Set;
 
-import com.pff.PSTAttachment;
-import com.pff.PSTException;
-import com.pff.PSTFile;
-import com.pff.PSTFolder;
-import com.pff.PSTMessage;
-import com.pff.PSTRecipient;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.extractor.EmbeddedDocumentExtractor;
 import org.apache.tika.extractor.EmbeddedDocumentUtil;
@@ -48,6 +42,13 @@ import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 import org.xml.sax.helpers.AttributesImpl;
 
+import com.pff.PSTAttachment;
+import com.pff.PSTException;
+import com.pff.PSTFile;
+import com.pff.PSTFolder;
+import com.pff.PSTMessage;
+import com.pff.PSTRecipient;
+
 /**
  * Parser for MS Outlook PST email storage files
  */
@@ -115,23 +116,7 @@ public class OutlookPSTParser extends AbstractParser {
         if (pstFolder.getContentCount() > 0) {
             PSTMessage pstMail = (PSTMessage) pstFolder.getNextChild();
             while (pstMail != null) {
-                AttributesImpl attributes = new AttributesImpl();
-                attributes.addAttribute("", "class", "class", "CDATA", "embedded");
-                attributes.addAttribute("", "id", "id", "CDATA", pstMail.getInternetMessageId());
-                handler.startElement("div", attributes);
-                handler.element("h1", pstMail.getSubject());
-
-                final Metadata mailMetadata = new Metadata();
-                //parse attachments first so that stream exceptions
-                //in attachments can make it into mailMetadata.
-                //RecursiveParserWrapper copies the metadata and thereby prevents
-                //modifications to mailMetadata from making it into the
-                //metadata objects cached by the RecursiveParserWrapper
-                parseMailAttachments(handler, pstMail, mailMetadata, embeddedExtractor);
-                parserMailItem(handler, pstMail, mailMetadata, embeddedExtractor);
-
-                handler.endElement("div");
-
+                parseMailAndAttachments(handler, pstMail, embeddedExtractor);
                 pstMail = (PSTMessage) pstFolder.getNextChild();
             }
         }
@@ -146,6 +131,26 @@ public class OutlookPSTParser extends AbstractParser {
         }
     }
 
+    private void parseMailAndAttachments(XHTMLContentHandler handler, PSTMessage pstMail,
+            EmbeddedDocumentExtractor embeddedExtractor) throws SAXException, IOException, TikaException {
+        AttributesImpl attributes = new AttributesImpl();
+        attributes.addAttribute("", "class", "class", "CDATA", "embedded");
+        attributes.addAttribute("", "id", "id", "CDATA", pstMail.getInternetMessageId());
+        handler.startElement("div", attributes);
+        handler.element("h1", pstMail.getSubject());
+
+        final Metadata mailMetadata = new Metadata();
+        // parse attachments first so that stream exceptions
+        // in attachments can make it into mailMetadata.
+        // RecursiveParserWrapper copies the metadata and thereby prevents
+        // modifications to mailMetadata from making it into the
+        // metadata objects cached by the RecursiveParserWrapper
+        parseMailAttachments(handler, pstMail, mailMetadata, embeddedExtractor);
+        parserMailItem(handler, pstMail, mailMetadata, embeddedExtractor);
+
+        handler.endElement("div");
+    }
+
     private void parserMailItem(XHTMLContentHandler handler, PSTMessage pstMail, Metadata mailMetadata,
                                 EmbeddedDocumentExtractor embeddedExtractor) throws SAXException, IOException {
         mailMetadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, pstMail.getInternetMessageId());
@@ -231,6 +236,12 @@ public class OutlookPSTParser extends AbstractParser {
             try {
                 PSTAttachment attach = email.getAttachment(i);
 
+                PSTMessage attachedEmail = attach.getEmbeddedPSTMessage();
+                if (attachedEmail != null) {
+                    parseMailAndAttachments(xhtml, attachedEmail, embeddedExtractor);
+                    continue;
+                }
+
                 // Get the filename; both long and short filenames can be used for attachments
                 String filename = attach.getLongFilename();
                 if (filename.isEmpty()) {