You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2017/11/13 15:29:07 UTC

[tika] branch master updated: TIKA-2488 -- catch potential npe in getting attachment's inputstream

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/master by this push:
     new 9c2e1b9  TIKA-2488 -- catch potential npe in getting attachment's inputstream
9c2e1b9 is described below

commit 9c2e1b9d839cffbc0820b789ffdd17a9a0b10759
Author: tballison <ta...@mitre.org>
AuthorDate: Mon Nov 13 10:28:57 2017 -0500

    TIKA-2488 -- catch potential npe in getting attachment's inputstream
---
 CHANGES.txt                                        |  3 ++
 .../apache/tika/parser/mbox/OutlookPSTParser.java  | 35 ++++++++++++++--------
 2 files changed, 26 insertions(+), 12 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index 7cd9784..8cb683c 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,8 @@
 Release 1.17 - ???
 
+  * Catch potential NPE in getting InputStream for attachments
+    in PST file (TIKA-2488).
+
   * Upgrade to PDFBox 2.0.8 (TIKA-2489).
 
   * Allow configuration of markLimit in EncodingDetectors
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/mbox/OutlookPSTParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/mbox/OutlookPSTParser.java
index 7a6ada8..61d7bac 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/mbox/OutlookPSTParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/mbox/OutlookPSTParser.java
@@ -125,8 +125,14 @@ public class OutlookPSTParser extends AbstractParser {
                 handler.startElement("div", attributes);
                 handler.element("h1", pstMail.getSubject());
 
-                parserMailItem(handler, pstMail, embeddedExtractor);
-                parseMailAttachments(handler, pstMail, embeddedExtractor);
+                final Metadata mailMetadata = new Metadata();
+                //parse attachments first so that stream exceptions
+                //in attachments can make it into mailMetadata.
+                //RecursiveParserWrapper copies the metadata and thereby prevents
+                //modifications to mailMetadata from making it into the
+                //metadata objects cached by the RecursiveParserWrapper
+                parseMailAttachments(handler, pstMail, mailMetadata, embeddedExtractor);
+                parserMailItem(handler, pstMail, mailMetadata, embeddedExtractor);
 
                 handler.endElement("div");
 
@@ -144,8 +150,8 @@ public class OutlookPSTParser extends AbstractParser {
         }
     }
 
-    private void parserMailItem(XHTMLContentHandler handler, PSTMessage pstMail, EmbeddedDocumentExtractor embeddedExtractor) throws SAXException, IOException {
-        Metadata mailMetadata = new Metadata();
+    private void parserMailItem(XHTMLContentHandler handler, PSTMessage pstMail, Metadata mailMetadata,
+                                EmbeddedDocumentExtractor embeddedExtractor) throws SAXException, IOException {
         mailMetadata.set(Metadata.RESOURCE_NAME_KEY, pstMail.getInternetMessageId());
         mailMetadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, pstMail.getInternetMessageId());
         mailMetadata.set(TikaCoreProperties.IDENTIFIER, pstMail.getInternetMessageId());
@@ -217,11 +223,12 @@ public class OutlookPSTParser extends AbstractParser {
         embeddedExtractor.parseEmbedded(new ByteArrayInputStream(mailContent), handler, mailMetadata, true);
     }
 
-    private void parseMailAttachments(XHTMLContentHandler xhtml, PSTMessage email, EmbeddedDocumentExtractor embeddedExtractor)
+    private void parseMailAttachments(XHTMLContentHandler xhtml, PSTMessage email,
+                                     final Metadata mailMetadata,
+                                      EmbeddedDocumentExtractor embeddedExtractor)
             throws TikaException {
         int numberOfAttachments = email.getNumberOfAttachments();
         for (int i = 0; i < numberOfAttachments; i++) {
-            File tempFile = null;
             try {
                 PSTAttachment attach = email.getAttachment(i);
 
@@ -241,21 +248,25 @@ public class OutlookPSTParser extends AbstractParser {
                 attributes.addAttribute("", "id", "id", "CDATA", filename);
                 xhtml.startElement("div", attributes);
                 if (embeddedExtractor.shouldParseEmbedded(attachMeta)) {
-                    TemporaryResources tmp = new TemporaryResources();
+                    TikaInputStream tis = null;
+                    try {
+                        tis = TikaInputStream.get(attach.getFileInputStream());
+                    } catch (NullPointerException e) {//TIKA-2488
+                        EmbeddedDocumentUtil.recordEmbeddedStreamException(e, mailMetadata);
+                        continue;
+                    }
+
                     try {
-                        TikaInputStream tis = TikaInputStream.get(attach.getFileInputStream(), tmp);
                         embeddedExtractor.parseEmbedded(tis, xhtml, attachMeta, true);
                     } finally {
-                        tmp.dispose();
+
+                        tis.close();
                     }
                 }
                 xhtml.endElement("div");
 
             } catch (Exception e) {
                 throw new TikaException("Unable to unpack document stream", e);
-            } finally {
-                if (tempFile != null)
-                    tempFile.delete();
             }
         }
     }

-- 
To stop receiving notification emails like this one, please contact
['"commits@tika.apache.org" <co...@tika.apache.org>'].