You are viewing a plain-text version of this content; the canonical link to the original was not preserved in this copy.
Posted to commits@tika.apache.org by lf...@apache.org on 2020/11/26 22:28:18 UTC

[tika] branch main updated (d8b0d1f -> ce55ab9)

This is an automated email from the ASF dual-hosted git repository.

lfcnassif pushed a change to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git.


    from d8b0d1f  TIKA-3237: great optimization in ForkParser
     new 02bf521  TIKA-3004: Fix parsing of emails attached to other emails in PST files
     new ce55ab9  TIKA-3004: update test to search for attached mail body

The 2 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 CHANGES.txt                                        |   2 +
 .../parser/microsoft/pst/OutlookPSTParser.java     |  57 ++++++++++++---------
 .../parser/microsoft/pst/OutlookPSTParserTest.java |   5 +-
 .../src/test/resources/test-documents/testPST.pst  | Bin 271360 -> 2302976 bytes
 4 files changed, 39 insertions(+), 25 deletions(-)


[tika] 01/02: TIKA-3004: Fix parsing of emails attached to other emails in PST files

Posted by lf...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

lfcnassif pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 02bf521ba11f22d5de636c7de41fe8643497246a
Author: Luis Nassif <lf...@gmail.com>
AuthorDate: Thu Nov 26 18:14:18 2020 -0300

    TIKA-3004: Fix parsing of emails attached to other emails in PST files
---
 CHANGES.txt                                        |  2 +
 .../parser/microsoft/pst/OutlookPSTParser.java     | 57 +++++++++++++---------
 2 files changed, 36 insertions(+), 23 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index 6591f41..faf8913 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -16,6 +16,8 @@ Release 2.0.0 - ???
 Release 1.26 - ???
 
    * Great optimization in ForkParser (TIKA-3237).
+   
+   * Fix parsing of emails attached to other emails in PST files (TIKA-3004).
 
 Release 1.25 - 11/25/2020
 
diff --git a/tika-parser-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParser.java b/tika-parser-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParser.java
index e90077d..360e4a2 100644
--- a/tika-parser-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParser.java
+++ b/tika-parser-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParser.java
@@ -25,12 +25,6 @@ import java.io.IOException;
 import java.io.InputStream;
 import java.util.Set;
 
-import com.pff.PSTAttachment;
-import com.pff.PSTException;
-import com.pff.PSTFile;
-import com.pff.PSTFolder;
-import com.pff.PSTMessage;
-import com.pff.PSTRecipient;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.extractor.EmbeddedDocumentExtractor;
 import org.apache.tika.extractor.EmbeddedDocumentUtil;
@@ -48,6 +42,13 @@ import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 import org.xml.sax.helpers.AttributesImpl;
 
+import com.pff.PSTAttachment;
+import com.pff.PSTException;
+import com.pff.PSTFile;
+import com.pff.PSTFolder;
+import com.pff.PSTMessage;
+import com.pff.PSTRecipient;
+
 /**
  * Parser for MS Outlook PST email storage files
  */
@@ -115,23 +116,7 @@ public class OutlookPSTParser extends AbstractParser {
         if (pstFolder.getContentCount() > 0) {
             PSTMessage pstMail = (PSTMessage) pstFolder.getNextChild();
             while (pstMail != null) {
-                AttributesImpl attributes = new AttributesImpl();
-                attributes.addAttribute("", "class", "class", "CDATA", "embedded");
-                attributes.addAttribute("", "id", "id", "CDATA", pstMail.getInternetMessageId());
-                handler.startElement("div", attributes);
-                handler.element("h1", pstMail.getSubject());
-
-                final Metadata mailMetadata = new Metadata();
-                //parse attachments first so that stream exceptions
-                //in attachments can make it into mailMetadata.
-                //RecursiveParserWrapper copies the metadata and thereby prevents
-                //modifications to mailMetadata from making it into the
-                //metadata objects cached by the RecursiveParserWrapper
-                parseMailAttachments(handler, pstMail, mailMetadata, embeddedExtractor);
-                parserMailItem(handler, pstMail, mailMetadata, embeddedExtractor);
-
-                handler.endElement("div");
-
+                parseMailAndAttachments(handler, pstMail, embeddedExtractor);
                 pstMail = (PSTMessage) pstFolder.getNextChild();
             }
         }
@@ -146,6 +131,26 @@ public class OutlookPSTParser extends AbstractParser {
         }
     }
 
+    private void parseMailAndAttachments(XHTMLContentHandler handler, PSTMessage pstMail,
+            EmbeddedDocumentExtractor embeddedExtractor) throws SAXException, IOException, TikaException {
+        AttributesImpl attributes = new AttributesImpl();
+        attributes.addAttribute("", "class", "class", "CDATA", "embedded");
+        attributes.addAttribute("", "id", "id", "CDATA", pstMail.getInternetMessageId());
+        handler.startElement("div", attributes);
+        handler.element("h1", pstMail.getSubject());
+
+        final Metadata mailMetadata = new Metadata();
+        // parse attachments first so that stream exceptions
+        // in attachments can make it into mailMetadata.
+        // RecursiveParserWrapper copies the metadata and thereby prevents
+        // modifications to mailMetadata from making it into the
+        // metadata objects cached by the RecursiveParserWrapper
+        parseMailAttachments(handler, pstMail, mailMetadata, embeddedExtractor);
+        parserMailItem(handler, pstMail, mailMetadata, embeddedExtractor);
+
+        handler.endElement("div");
+    }
+
     private void parserMailItem(XHTMLContentHandler handler, PSTMessage pstMail, Metadata mailMetadata,
                                 EmbeddedDocumentExtractor embeddedExtractor) throws SAXException, IOException {
         mailMetadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, pstMail.getInternetMessageId());
@@ -231,6 +236,12 @@ public class OutlookPSTParser extends AbstractParser {
             try {
                 PSTAttachment attach = email.getAttachment(i);
 
+                PSTMessage attachedEmail = attach.getEmbeddedPSTMessage();
+                if (attachedEmail != null) {
+                    parseMailAndAttachments(xhtml, attachedEmail, embeddedExtractor);
+                    continue;
+                }
+
                 // Get the filename; both long and short filenames can be used for attachments
                 String filename = attach.getLongFilename();
                 if (filename.isEmpty()) {


[tika] 02/02: TIKA-3004: update test to search for attached mail body

Posted by lf...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

lfcnassif pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git

commit ce55ab9f9212809842629bca20259db7c2c5ef72
Author: Luis Nassif <lf...@gmail.com>
AuthorDate: Thu Nov 26 19:25:05 2020 -0300

    TIKA-3004: update test to search for attached mail body
---
 .../parser/microsoft/pst/OutlookPSTParserTest.java |   5 +++--
 .../src/test/resources/test-documents/testPST.pst  | Bin 271360 -> 2302976 bytes
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/tika-parser-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParserTest.java b/tika-parser-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParserTest.java
index fe1826e..90fca86 100644
--- a/tika-parser-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParserTest.java
+++ b/tika-parser-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParserTest.java
@@ -66,7 +66,7 @@ public class OutlookPSTParserTest extends TikaTest {
         String output = handler.toString();
 
         assertFalse(output.isEmpty());
-        assertTrue(output.contains("<meta name=\"Content-Length\" content=\"271360\">"));
+        assertTrue(output.contains("<meta name=\"Content-Length\" content=\"2302976\">"));
         assertTrue(output.contains("<meta name=\"Content-Type\" content=\"application/vnd.ms-outlook-pst\">"));
 
         assertTrue(output.contains("<body><div class=\"email-folder\"><h1>"));
@@ -76,9 +76,10 @@ public class OutlookPSTParserTest extends TikaTest {
 
         assertTrue(output.contains("<div class=\"email-folder\"><h1>Racine (pour la recherche)</h1>"));
 
+        assertTrue(output.contains("This is a docx attachment."));
 
         List<Metadata> metaList = trackingExtrator.trackingMetadata;
-        assertEquals(6, metaList.size());
+        assertEquals(9, metaList.size());
 
         Metadata firstMail = metaList.get(0);
         assertEquals("Jörn Kottmann", firstMail.get(TikaCoreProperties.CREATOR));
diff --git a/tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testPST.pst b/tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testPST.pst
index dc4b673..8ccc695 100644
Binary files a/tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testPST.pst and b/tika-parser-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testPST.pst differ