You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2017/04/20 01:20:15 UTC
[tika] 02/03: TIKA-2331 -- Upgrade RTFParser to use new TikaMemoryLimitException
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git
commit 9e89b442bd2b211c328eb563e42ed902f9e0ae6e
Author: tballison <ta...@mitre.org>
AuthorDate: Wed Apr 19 21:19:46 2017 -0400
TIKA-2331 -- Upgrade RTFParser to use new TikaMemoryLimitException
---
.../apache/tika/parser/rtf/RTFEmbObjHandler.java | 14 +++++++---
.../java/org/apache/tika/parser/rtf/RTFParser.java | 30 +++++++++++++++++++++-
.../org/apache/tika/parser/rtf/TextExtractor.java | 6 ++---
.../org/apache/tika/parser/rtf/RTFParserTest.java | 15 +++++++++++
.../org/apache/tika/parser/rtf/tika-config.xml | 26 +++++++++++++++++++
5 files changed, 83 insertions(+), 8 deletions(-)
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFEmbObjHandler.java b/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFEmbObjHandler.java
index 5e2ab25..42900fc 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFEmbObjHandler.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFEmbObjHandler.java
@@ -24,6 +24,7 @@ import java.util.concurrent.atomic.AtomicInteger;
import org.apache.commons.io.FilenameUtils;
import org.apache.commons.io.IOUtils;
import org.apache.tika.exception.TikaException;
+import org.apache.tika.exception.TikaMemoryLimitException;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
@@ -70,11 +71,13 @@ class RTFEmbObjHandler {
private StringBuilder sb = new StringBuilder();
private Metadata metadata;
private EMB_STATE state = EMB_STATE.NADA;
+ private final int memoryLimitInKb;
- protected RTFEmbObjHandler(ContentHandler handler, Metadata metadata, ParseContext context) {
+ protected RTFEmbObjHandler(ContentHandler handler, Metadata metadata, ParseContext context, int memoryLimitInKb) {
this.handler = handler;
this.embeddedDocumentUtil = new EmbeddedDocumentUtil(context);
os = new ByteArrayOutputStream();
+ this.memoryLimitInKb = memoryLimitInKb;
}
protected void startPict() {
@@ -145,8 +148,13 @@ class RTFEmbObjHandler {
}
protected void writeBytes(InputStream is, int len) throws IOException, TikaException {
- if (len < 0 || len > RTFParser.getMaxBytesForEmbeddedObject()) {
- throw new IOException("length of bytes to read out of bounds: " + len);
+ if (len < 0) {
+ throw new TikaException("Requesting I read < 0 bytes ?!");
+ }
+ if (len > memoryLimitInKb) {
+ throw new TikaMemoryLimitException("File embedded in RTF caused this (" + len +
+ ") bytes), but maximum allowed is ("+memoryLimitInKb+")."+
+ "If this is a valid RTF file, consider increasing the memory limit via TikaConfig.");
}
byte[] bytes = new byte[len];
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFParser.java
index d2c448b..567a7a8 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFParser.java
@@ -22,6 +22,7 @@ import java.util.Collections;
import java.util.Set;
import org.apache.commons.io.input.TaggedInputStream;
+import org.apache.tika.config.Field;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
@@ -53,6 +54,7 @@ public class RTFParser extends AbstractParser {
*
* @return maximum number of bytes allowed for an embedded object.
*/
+ @Deprecated
public static int getMaxBytesForEmbeddedObject() {
return EMB_OBJ_MAX_BYTES;
}
@@ -65,15 +67,24 @@ public class RTFParser extends AbstractParser {
*
* @param max maximum number of bytes to allow for embedded objects. If
* the embedded object has more than this number of bytes, skip it.
+ * @deprecated use {@link #setMemoryLimitInKb(int)} instead
*/
+ @Deprecated
public static void setMaxBytesForEmbeddedObject(int max) {
EMB_OBJ_MAX_BYTES = max;
+ USE_STATIC = true;
}
+ //get rid of this once we get rid of the other static maxbytes...
+ private static volatile boolean USE_STATIC = false;
+
public Set<MediaType> getSupportedTypes(ParseContext context) {
return SUPPORTED_TYPES;
}
+ @Field
+ private int memoryLimitInKb = EMB_OBJ_MAX_BYTES;
+
public void parse(
InputStream stream, ContentHandler handler,
Metadata metadata, ParseContext context)
@@ -82,7 +93,7 @@ public class RTFParser extends AbstractParser {
TaggedInputStream tagged = new TaggedInputStream(stream);
try {
XHTMLContentHandler xhtmlHandler = new XHTMLContentHandler(handler, metadata);
- RTFEmbObjHandler embObjHandler = new RTFEmbObjHandler(xhtmlHandler, metadata, context);
+ RTFEmbObjHandler embObjHandler = new RTFEmbObjHandler(xhtmlHandler, metadata, context, getMemoryLimitInKb());
final TextExtractor ert = new TextExtractor(xhtmlHandler, metadata, embObjHandler);
ert.extract(stream);
} catch (IOException e) {
@@ -90,4 +101,21 @@ public class RTFParser extends AbstractParser {
throw new TikaException("Error parsing an RTF document", e);
}
}
+
+ @Field
+ public void setMemoryLimitInKb(int memoryLimitInKb) {
+ this.memoryLimitInKb = memoryLimitInKb;
+ USE_STATIC = false;
+ }
+
+ private int getMemoryLimitInKb() {
+ //there's a race condition here, but it shouldn't matter.
+ if (USE_STATIC) {
+ if (EMB_OBJ_MAX_BYTES < 0) {
+ return EMB_OBJ_MAX_BYTES;
+ }
+ return EMB_OBJ_MAX_BYTES/1024;
+ }
+ return memoryLimitInKb;
+ }
}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java
index 8ba8961..b07a3a0 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java
@@ -947,10 +947,8 @@ final class TextExtractor {
if (groupState.pictDepth == 1) {
try {
embObjHandler.writeBytes(in, param);
- } catch (IOException e) {
- //param was out of bounds or something went wrong during writing.
- //skip this obj and move on
- //TODO: log.warn
+ } catch (IOException|TikaException e) {
+ EmbeddedDocumentUtil.recordEmbeddedStreamException(e, metadata);
embObjHandler.reset();
}
} else {
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java
index b957b8c..aed6cf5 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java
@@ -35,6 +35,7 @@ import java.util.Set;
import org.apache.commons.io.FilenameUtils;
import org.apache.tika.Tika;
import org.apache.tika.TikaTest;
+import org.apache.tika.config.TikaConfig;
import org.apache.tika.extractor.ContainerExtractor;
import org.apache.tika.extractor.ParserContainerExtractor;
import org.apache.tika.io.TikaInputStream;
@@ -524,6 +525,20 @@ public class RTFParserTest extends TikaTest {
assertEquals(2, tracker.filenames.size());
}
+ @Test
+ public void testConfig() throws Exception {
+ //test that memory allocation of the bin element is limited
+ //via the config file. Unfortunately, this test file's bin embedding contains 10 bytes
+ //so we had to set the config to 0.
+ InputStream is = getClass().getResourceAsStream("/org/apache/tika/parser/rtf/tika-config.xml");
+ assertNotNull(is);
+ TikaConfig tikaConfig = new TikaConfig(is);
+ Parser p = new AutoDetectParser(tikaConfig);
+ List<Metadata> metadataList = getRecursiveMetadata("testBinControlWord.rtf", p);
+ assertEquals(1, metadataList.size());
+ assertContains("TikaMemoryLimitException", metadataList.get(0).get(TikaCoreProperties.TIKA_META_EXCEPTION_EMBEDDED_STREAM));
+ }
+
private Result getResult(String filename) throws Exception {
File file = getResourceAsFile("/test-documents/" + filename);
diff --git a/tika-parsers/src/test/resources/org/apache/tika/parser/rtf/tika-config.xml b/tika-parsers/src/test/resources/org/apache/tika/parser/rtf/tika-config.xml
new file mode 100644
index 0000000..1f53a78
--- /dev/null
+++ b/tika-parsers/src/test/resources/org/apache/tika/parser/rtf/tika-config.xml
@@ -0,0 +1,26 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<properties>
+ <parsers>
+ <parser class="org.apache.tika.parser.rtf.RTFParser">
+ <params>
+ <param name="memoryLimitInKb" type="int">0</param>
+ </params>
+ </parser>
+ </parsers>
+</properties>
--
To stop receiving notification emails like this one, please contact
"commits@tika.apache.org" <co...@tika.apache.org>.