You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2018/03/08 19:10:46 UTC

[tika] branch master updated: TIKA-2530 -- temporary workaround -- check for zero length byte array in rtf body to avoid buffer underflow from POI, via Pascal Essiembre.

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/master by this push:
     new 8ecfeef  TIKA-2530 -- temporary workaround -- check for zero length byte array in rtf body to avoid buffer underflow from POI, via Pascal Essiembre.
8ecfeef is described below

commit 8ecfeeff86e991fa9b6c3fa12b89ec96aef00684
Author: tballison <ta...@mitre.org>
AuthorDate: Thu Mar 8 14:10:36 2018 -0500

    TIKA-2530 -- temporary workaround -- check for zero length byte array in
    rtf body to avoid buffer underflow from POI, via Pascal Essiembre.
---
 .../tika/parser/microsoft/OutlookExtractor.java    | 30 +++++++++++++---------
 1 file changed, 18 insertions(+), 12 deletions(-)

diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
index e8ce7f0..a9a6090 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
@@ -74,6 +74,7 @@ import org.apache.tika.parser.txt.CharsetMatch;
 import org.apache.tika.sax.BodyContentHandler;
 import org.apache.tika.sax.EmbeddedContentHandler;
 import org.apache.tika.sax.XHTMLContentHandler;
+import org.bouncycastle.cms.Recipient;
 import org.xml.sax.SAXException;
 
 /**
@@ -321,19 +322,24 @@ public class OutlookExtractor extends AbstractPOIFSExtractor {
         }
         if (rtfChunk != null && (extractAllAlternatives || !doneBody)) {
             ByteChunk chunk = (ByteChunk) rtfChunk;
-            MAPIRtfAttribute rtf = new MAPIRtfAttribute(
-                    MAPIProperty.RTF_COMPRESSED, Types.BINARY.getId(), chunk.getValue()
-            );
-            Parser rtfParser =
-                    EmbeddedDocumentUtil.tryToFindExistingLeafParser(RTFParser.class, parseContext);
-            if (rtfParser == null) {
-                rtfParser = new RTFParser();
+            //Avoid a buffer underflow on an empty RTF body (TIKA-2530).
+            //TODO -- find an example file that triggers this and determine
+            //whether it is a bug in POI or a genuinely zero-length chunk.
+            if (chunk.getValue() != null && chunk.getValue().length > 0) {
+                MAPIRtfAttribute rtf = new MAPIRtfAttribute(
+                        MAPIProperty.RTF_COMPRESSED, Types.BINARY.getId(), chunk.getValue()
+                );
+                Parser rtfParser =
+                        EmbeddedDocumentUtil.tryToFindExistingLeafParser(RTFParser.class, parseContext);
+                if (rtfParser == null) {
+                    rtfParser = new RTFParser();
+                }
+                rtfParser.parse(
+                        new ByteArrayInputStream(rtf.getData()),
+                        new EmbeddedContentHandler(new BodyContentHandler(xhtml)),
+                        new Metadata(), parseContext);
+                doneBody = true;
             }
-            rtfParser.parse(
-                    new ByteArrayInputStream(rtf.getData()),
-                    new EmbeddedContentHandler(new BodyContentHandler(xhtml)),
-                    new Metadata(), parseContext);
-            doneBody = true;
         }
         if (textChunk != null && (extractAllAlternatives || !doneBody)) {
             xhtml.element("p", ((StringChunk) textChunk).getValue());

-- 
To stop receiving notification emails like this one, please contact
tallison@apache.org.