You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2017/04/20 01:20:15 UTC

[tika] 02/03: TIKA-2331 -- Upgrade RTFParser to use new TikaMemoryLimitException

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 9e89b442bd2b211c328eb563e42ed902f9e0ae6e
Author: tballison <ta...@mitre.org>
AuthorDate: Wed Apr 19 21:19:46 2017 -0400

    TIKA-2331 -- Upgrade RTFParser to use new TikaMemoryLimitException
---
 .../apache/tika/parser/rtf/RTFEmbObjHandler.java   | 14 +++++++---
 .../java/org/apache/tika/parser/rtf/RTFParser.java | 30 +++++++++++++++++++++-
 .../org/apache/tika/parser/rtf/TextExtractor.java  |  6 ++---
 .../org/apache/tika/parser/rtf/RTFParserTest.java  | 15 +++++++++++
 .../org/apache/tika/parser/rtf/tika-config.xml     | 26 +++++++++++++++++++
 5 files changed, 83 insertions(+), 8 deletions(-)

diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFEmbObjHandler.java b/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFEmbObjHandler.java
index 5e2ab25..42900fc 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFEmbObjHandler.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFEmbObjHandler.java
@@ -24,6 +24,7 @@ import java.util.concurrent.atomic.AtomicInteger;
 import org.apache.commons.io.FilenameUtils;
 import org.apache.commons.io.IOUtils;
 import org.apache.tika.exception.TikaException;
+import org.apache.tika.exception.TikaMemoryLimitException;
 import org.apache.tika.extractor.EmbeddedDocumentUtil;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
@@ -70,11 +71,13 @@ class RTFEmbObjHandler {
     private StringBuilder sb = new StringBuilder();
     private Metadata metadata;
     private EMB_STATE state = EMB_STATE.NADA;
+    private final int memoryLimitInKb;
 
-    protected RTFEmbObjHandler(ContentHandler handler, Metadata metadata, ParseContext context) {
+    protected RTFEmbObjHandler(ContentHandler handler, Metadata metadata, ParseContext context, int memoryLimitInKb) {
         this.handler = handler;
         this.embeddedDocumentUtil = new EmbeddedDocumentUtil(context);
         os = new ByteArrayOutputStream();
+        this.memoryLimitInKb = memoryLimitInKb;
     }
 
     protected void startPict() {
@@ -145,8 +148,13 @@ class RTFEmbObjHandler {
     }
 
     protected void writeBytes(InputStream is, int len) throws IOException, TikaException {
-        if (len < 0 || len > RTFParser.getMaxBytesForEmbeddedObject()) {
-            throw new IOException("length of bytes to read out of bounds: " + len);
+        if (len < 0) {
+            throw new TikaException("Requesting I read < 0 bytes ?!");
+        }
+        if (len > memoryLimitInKb) {
+            throw new TikaMemoryLimitException("File embedded in RTF caused this (" + len +
+                    ") bytes), but maximum allowed is ("+memoryLimitInKb+")."+
+                    "If this is a valid RTF file, consider increasing the memory limit via TikaConfig.");
         }
 
         byte[] bytes = new byte[len];
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFParser.java
index d2c448b..567a7a8 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFParser.java
@@ -22,6 +22,7 @@ import java.util.Collections;
 import java.util.Set;
 
 import org.apache.commons.io.input.TaggedInputStream;
+import org.apache.tika.config.Field;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
@@ -53,6 +54,7 @@ public class RTFParser extends AbstractParser {
      *
      * @return maximum number of bytes allowed for an embedded object.
      */
+    @Deprecated
     public static int getMaxBytesForEmbeddedObject() {
         return EMB_OBJ_MAX_BYTES;
     }
@@ -65,15 +67,24 @@ public class RTFParser extends AbstractParser {
      *
      * @param max maximum number of bytes to allow for embedded objects.  If
      *            the embedded object has more than this number of bytes, skip it.
+     * @deprecated use {@link #setMemoryLimitInKb(int)} instead
      */
+    @Deprecated
     public static void setMaxBytesForEmbeddedObject(int max) {
         EMB_OBJ_MAX_BYTES = max;
+        USE_STATIC = true;
     }
 
+    //get rid of this once we get rid of the other static maxbytes...
+    private static volatile boolean USE_STATIC = false;
+
     public Set<MediaType> getSupportedTypes(ParseContext context) {
         return SUPPORTED_TYPES;
     }
 
+    @Field
+    private int memoryLimitInKb = EMB_OBJ_MAX_BYTES;
+
     public void parse(
             InputStream stream, ContentHandler handler,
             Metadata metadata, ParseContext context)
@@ -82,7 +93,7 @@ public class RTFParser extends AbstractParser {
         TaggedInputStream tagged = new TaggedInputStream(stream);
         try {
             XHTMLContentHandler xhtmlHandler = new XHTMLContentHandler(handler, metadata);
-            RTFEmbObjHandler embObjHandler = new RTFEmbObjHandler(xhtmlHandler, metadata, context);
+            RTFEmbObjHandler embObjHandler = new RTFEmbObjHandler(xhtmlHandler, metadata, context, getMemoryLimitInKb());
             final TextExtractor ert = new TextExtractor(xhtmlHandler, metadata, embObjHandler);
             ert.extract(stream);
         } catch (IOException e) {
@@ -90,4 +101,21 @@ public class RTFParser extends AbstractParser {
             throw new TikaException("Error parsing an RTF document", e);
         }
     }
+
+    @Field
+    public void setMemoryLimitInKb(int memoryLimitInKb) {
+        this.memoryLimitInKb = memoryLimitInKb;
+        USE_STATIC = false;
+    }
+
+    private int getMemoryLimitInKb() {
+        //there's a race condition here, but it shouldn't matter.
+        if (USE_STATIC) {
+            if (EMB_OBJ_MAX_BYTES < 0) {
+                return EMB_OBJ_MAX_BYTES;
+            }
+            return EMB_OBJ_MAX_BYTES/1024;
+        }
+        return memoryLimitInKb;
+    }
 }
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java
index 8ba8961..b07a3a0 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java
@@ -947,10 +947,8 @@ final class TextExtractor {
                 if (groupState.pictDepth == 1) {
                     try {
                         embObjHandler.writeBytes(in, param);
-                    } catch (IOException e) {
-                        //param was out of bounds or something went wrong during writing.
-                        //skip this obj and move on
-                        //TODO: log.warn
+                    } catch (IOException|TikaException e) {
+                        EmbeddedDocumentUtil.recordEmbeddedStreamException(e, metadata);
                         embObjHandler.reset();
                     }
                 } else {
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java
index b957b8c..aed6cf5 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java
@@ -35,6 +35,7 @@ import java.util.Set;
 import org.apache.commons.io.FilenameUtils;
 import org.apache.tika.Tika;
 import org.apache.tika.TikaTest;
+import org.apache.tika.config.TikaConfig;
 import org.apache.tika.extractor.ContainerExtractor;
 import org.apache.tika.extractor.ParserContainerExtractor;
 import org.apache.tika.io.TikaInputStream;
@@ -524,6 +525,20 @@ public class RTFParserTest extends TikaTest {
         assertEquals(2, tracker.filenames.size());
     }
 
+    @Test
+    public void testConfig() throws Exception {
+        //Test that memory allocation for the bin element is limited
+        //via the config file.  The bin embedding in this test file contains only 10 bytes,
+        //so memoryLimitInKb has to be set to 0 in the config for the embedding to exceed the limit.
+        InputStream is = getClass().getResourceAsStream("/org/apache/tika/parser/rtf/tika-config.xml");
+        assertNotNull(is);
+        TikaConfig tikaConfig = new TikaConfig(is);
+        Parser p = new AutoDetectParser(tikaConfig);
+        List<Metadata> metadataList = getRecursiveMetadata("testBinControlWord.rtf", p);
+        assertEquals(1, metadataList.size());
+        assertContains("TikaMemoryLimitException", metadataList.get(0).get(TikaCoreProperties.TIKA_META_EXCEPTION_EMBEDDED_STREAM));
+    }
+
     private Result getResult(String filename) throws Exception {
         File file = getResourceAsFile("/test-documents/" + filename);
 
diff --git a/tika-parsers/src/test/resources/org/apache/tika/parser/rtf/tika-config.xml b/tika-parsers/src/test/resources/org/apache/tika/parser/rtf/tika-config.xml
new file mode 100644
index 0000000..1f53a78
--- /dev/null
+++ b/tika-parsers/src/test/resources/org/apache/tika/parser/rtf/tika-config.xml
@@ -0,0 +1,26 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<properties>
+    <parsers>
+        <parser class="org.apache.tika.parser.rtf.RTFParser">
+            <params>
+                <param name="memoryLimitInKb" type="int">0</param>
+            </params>
+        </parser>
+    </parsers>
+</properties>

-- 
To stop receiving notification emails like this one, please contact
"commits@tika.apache.org" <co...@tika.apache.org>.