You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by lf...@apache.org on 2020/11/26 21:16:19 UTC

[tika] branch branch_1x updated: TIKA-3004: Fix parsing of emails attached to other emails in PST files

This is an automated email from the ASF dual-hosted git repository.

lfcnassif pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/branch_1x by this push:
     new a4018a9  TIKA-3004: Fix parsing of emails attached to other emails in PST files
a4018a9 is described below

commit a4018a98a62c1adf19fc850e50945f9deaab13c1
Author: Luis Nassif <lf...@gmail.com>
AuthorDate: Thu Nov 26 18:14:18 2020 -0300

    TIKA-3004: Fix parsing of emails attached to other emails in PST files
---
 CHANGES.txt                                        |  2 +
 .../apache/tika/parser/mbox/OutlookPSTParser.java  | 45 ++++++++++++++--------
 2 files changed, 30 insertions(+), 17 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index c620efc..caf5628 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,6 +1,8 @@
 Release 1.26 - xx/xx/xxxx
 
    * Great optimization in ForkParser (TIKA-3237).
+   
+   * Fix parsing of emails attached to other emails in PST files (TIKA-3004).
 
 Release 1.25 - 11/25/2020
 
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/mbox/OutlookPSTParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/mbox/OutlookPSTParser.java
index e23496e..af1fbc6 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/mbox/OutlookPSTParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/mbox/OutlookPSTParser.java
@@ -115,23 +115,7 @@ public class OutlookPSTParser extends AbstractParser {
         if (pstFolder.getContentCount() > 0) {
             PSTMessage pstMail = (PSTMessage) pstFolder.getNextChild();
             while (pstMail != null) {
-                AttributesImpl attributes = new AttributesImpl();
-                attributes.addAttribute("", "class", "class", "CDATA", "embedded");
-                attributes.addAttribute("", "id", "id", "CDATA", pstMail.getInternetMessageId());
-                handler.startElement("div", attributes);
-                handler.element("h1", pstMail.getSubject());
-
-                final Metadata mailMetadata = new Metadata();
-                //parse attachments first so that stream exceptions
-                //in attachments can make it into mailMetadata.
-                //RecursiveParserWrapper copies the metadata and thereby prevents
-                //modifications to mailMetadata from making it into the
-                //metadata objects cached by the RecursiveParserWrapper
-                parseMailAttachments(handler, pstMail, mailMetadata, embeddedExtractor);
-                parserMailItem(handler, pstMail, mailMetadata, embeddedExtractor);
-
-                handler.endElement("div");
-
+                parseMailAndAttachments(handler, pstMail, embeddedExtractor);
                 pstMail = (PSTMessage) pstFolder.getNextChild();
             }
         }
@@ -145,6 +129,27 @@ public class OutlookPSTParser extends AbstractParser {
             }
         }
     }
+    
+    private void parseMailAndAttachments(XHTMLContentHandler handler, PSTMessage pstMail,
+            EmbeddedDocumentExtractor embeddedExtractor) throws SAXException, IOException, TikaException {
+        
+        AttributesImpl attributes = new AttributesImpl();
+        attributes.addAttribute("", "class", "class", "CDATA", "embedded");
+        attributes.addAttribute("", "id", "id", "CDATA", pstMail.getInternetMessageId());
+        handler.startElement("div", attributes);
+        handler.element("h1", pstMail.getSubject());
+
+        final Metadata mailMetadata = new Metadata();
+        //parse attachments first so that stream exceptions
+        //in attachments can make it into mailMetadata.
+        //RecursiveParserWrapper copies the metadata and thereby prevents
+        //modifications to mailMetadata from making it into the
+        //metadata objects cached by the RecursiveParserWrapper
+        parseMailAttachments(handler, pstMail, mailMetadata, embeddedExtractor);
+        parserMailItem(handler, pstMail, mailMetadata, embeddedExtractor);
+
+        handler.endElement("div");
+    }
 
     private void parserMailItem(XHTMLContentHandler handler, PSTMessage pstMail, Metadata mailMetadata,
                                 EmbeddedDocumentExtractor embeddedExtractor) throws SAXException, IOException {
@@ -230,6 +235,12 @@ public class OutlookPSTParser extends AbstractParser {
         for (int i = 0; i < numberOfAttachments; i++) {
             try {
                 PSTAttachment attach = email.getAttachment(i);
+                
+                PSTMessage attachedEmail = attach.getEmbeddedPSTMessage();
+                if(attachedEmail != null) {
+                    parseMailAndAttachments(xhtml, attachedEmail, embeddedExtractor);
+                    continue;
+                }
 
                 // Get the filename; both long and short filenames can be used for attachments
                 String filename = attach.getLongFilename();