You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2021/04/28 16:05:01 UTC
[tika] 04/05: TIKA-3372 -- fix write limit in recursive parser
handler
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git
commit bc551b18da2fb540a360e3300660490a046b457b
Author: tballison <ta...@apache.org>
AuthorDate: Wed Apr 28 12:04:33 2021 -0400
TIKA-3372 -- fix write limit in recursive parser handler
---
.../src/main/java/org/apache/tika/cli/TikaCLI.java | 2 +-
.../batch/fs/RecursiveParserWrapperFSConsumer.java | 2 +-
.../apache/tika/parser/RecursiveParserWrapper.java | 15 +++++++------
.../sax/AbstractRecursiveParserWrapperHandler.java | 11 ++++++++--
.../tika/sax/RecursiveParserWrapperHandler.java | 9 ++++----
.../tika/parser/RecursiveParserWrapperTest.java | 2 +-
.../server/resource/RecursiveMetadataResource.java | 4 ++--
.../tika/server/RecursiveMetadataResourceTest.java | 25 ++++++++++++++++++----
.../org/apache/tika/server/TikaResourceTest.java | 13 +++++++++++
9 files changed, 61 insertions(+), 22 deletions(-)
diff --git a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
index 4b6c164..b115ef2 100644
--- a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
+++ b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
@@ -526,7 +526,7 @@ public class TikaCLI {
RecursiveParserWrapper wrapper = new RecursiveParserWrapper(parser);
RecursiveParserWrapperHandler handler =
new RecursiveParserWrapperHandler(getContentHandlerFactory(type),
- -1, config.getMetadataFilter());
+ -1, -1, config.getMetadataFilter());
try (InputStream input = TikaInputStream.get(url, metadata)) {
wrapper.parse(input, handler, metadata, context);
}
diff --git a/tika-batch/src/main/java/org/apache/tika/batch/fs/RecursiveParserWrapperFSConsumer.java b/tika-batch/src/main/java/org/apache/tika/batch/fs/RecursiveParserWrapperFSConsumer.java
index 9732781..aa8af8f 100644
--- a/tika-batch/src/main/java/org/apache/tika/batch/fs/RecursiveParserWrapperFSConsumer.java
+++ b/tika-batch/src/main/java/org/apache/tika/batch/fs/RecursiveParserWrapperFSConsumer.java
@@ -100,7 +100,7 @@ public class RecursiveParserWrapperFSConsumer extends AbstractFSConsumer {
List<Metadata> metadataList = null;
Metadata containerMetadata = fileResource.getMetadata();
RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(contentHandlerFactory,
- -1, metadataFilter);
+ -1, -1, metadataFilter);
try {
parse(fileResource.getResourceId(), parser, is, handler,
containerMetadata, context);
diff --git a/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java b/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
index 021fca3..95899a6 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
@@ -227,17 +227,15 @@ public class RecursiveParserWrapper extends ParserDecorator {
long started = System.currentTimeMillis();
parserState.recursiveParserWrapperHandler.startDocument();
TemporaryResources tmp = new TemporaryResources();
- int writeLimit = -1;
- //TODO -- rely on a new interface WriteLimiting...?
- //It'd be better not to tie this to a specific class
- if (recursiveParserWrapperHandler instanceof BasicContentHandlerFactory) {
- writeLimit =
- ((BasicContentHandlerFactory)recursiveParserWrapperHandler).getWriteLimit();
+ int totalWriteLimit = -1;
+ if (recursiveParserWrapperHandler instanceof AbstractRecursiveParserWrapperHandler) {
+ totalWriteLimit =
+ ((AbstractRecursiveParserWrapperHandler)recursiveParserWrapperHandler).getTotalWriteLimit();
}
try {
TikaInputStream tis = TikaInputStream.get(stream, tmp);
RecursivelySecureContentHandler secureContentHandler =
- new RecursivelySecureContentHandler(localHandler, tis, writeLimit);
+ new RecursivelySecureContentHandler(localHandler, tis, totalWriteLimit);
context.set(RecursivelySecureContentHandler.class, secureContentHandler);
getWrappedParser().parse(tis, secureContentHandler, metadata, context);
} catch (SAXException e) {
@@ -493,6 +491,7 @@ public class RecursiveParserWrapper extends ParserDecorator {
}
int availableLength = Math.min(totalWriteLimit - totalChars, length);
super.characters(ch, start, availableLength);
+ totalChars += availableLength;
if (availableLength < length) {
throw new WriteLimitReached();
}
@@ -506,9 +505,11 @@ public class RecursiveParserWrapper extends ParserDecorator {
}
int availableLength = Math.min(totalWriteLimit - totalChars, length);
super.ignorableWhitespace(ch, start, availableLength);
+
if (availableLength < length) {
throw new WriteLimitReached();
}
+ totalChars += availableLength;
}
}
diff --git a/tika-core/src/main/java/org/apache/tika/sax/AbstractRecursiveParserWrapperHandler.java b/tika-core/src/main/java/org/apache/tika/sax/AbstractRecursiveParserWrapperHandler.java
index 8515f09..50c1eca 100644
--- a/tika-core/src/main/java/org/apache/tika/sax/AbstractRecursiveParserWrapperHandler.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/AbstractRecursiveParserWrapperHandler.java
@@ -63,16 +63,19 @@ public abstract class AbstractRecursiveParserWrapperHandler extends DefaultHandl
private static final int MAX_DEPTH = 100;
private final int maxEmbeddedResources;
+ private final int totalWriteLimit;
private int embeddedResources = 0;
private int embeddedDepth = 0;
public AbstractRecursiveParserWrapperHandler(ContentHandlerFactory contentHandlerFactory) {
- this(contentHandlerFactory, -1);
+ this(contentHandlerFactory, -1, -1);
}
- public AbstractRecursiveParserWrapperHandler(ContentHandlerFactory contentHandlerFactory, int maxEmbeddedResources) {
+ public AbstractRecursiveParserWrapperHandler(ContentHandlerFactory contentHandlerFactory,
+ int maxEmbeddedResources, int totalWriteLimit) {
this.contentHandlerFactory = contentHandlerFactory;
this.maxEmbeddedResources = maxEmbeddedResources;
+ this.totalWriteLimit = totalWriteLimit;
}
public ContentHandler getNewContentHandler() {
@@ -142,4 +145,8 @@ public abstract class AbstractRecursiveParserWrapperHandler extends DefaultHandl
public ContentHandlerFactory getContentHandlerFactory() {
return contentHandlerFactory;
}
+
+ public int getTotalWriteLimit() {
+ return totalWriteLimit;
+ }
}
diff --git a/tika-core/src/main/java/org/apache/tika/sax/RecursiveParserWrapperHandler.java b/tika-core/src/main/java/org/apache/tika/sax/RecursiveParserWrapperHandler.java
index 50f0fb8..a0d620e 100644
--- a/tika-core/src/main/java/org/apache/tika/sax/RecursiveParserWrapperHandler.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/RecursiveParserWrapperHandler.java
@@ -49,7 +49,7 @@ public class RecursiveParserWrapperHandler extends AbstractRecursiveParserWrappe
* Create a handler with no limit on the number of embedded resources
*/
public RecursiveParserWrapperHandler(ContentHandlerFactory contentHandlerFactory) {
- this(contentHandlerFactory, -1, NoOpFilter.NOOP_FILTER);
+ this(contentHandlerFactory, -1, -1, NoOpFilter.NOOP_FILTER);
}
/**
@@ -58,12 +58,13 @@ public class RecursiveParserWrapperHandler extends AbstractRecursiveParserWrappe
* @param maxEmbeddedResources number of embedded resources that will be parsed
*/
public RecursiveParserWrapperHandler(ContentHandlerFactory contentHandlerFactory, int maxEmbeddedResources) {
- this(contentHandlerFactory, maxEmbeddedResources, NoOpFilter.NOOP_FILTER);
+ this(contentHandlerFactory, maxEmbeddedResources, -1, NoOpFilter.NOOP_FILTER);
}
- public RecursiveParserWrapperHandler(ContentHandlerFactory contentHandlerFactory, int maxEmbeddedResources,
+ public RecursiveParserWrapperHandler(ContentHandlerFactory contentHandlerFactory,
+ int maxEmbeddedResources, int maxWriteLimit,
MetadataFilter metadataFilter) {
- super(contentHandlerFactory, maxEmbeddedResources);
+ super(contentHandlerFactory, maxEmbeddedResources, maxWriteLimit);
this.metadataFilter = metadataFilter;
}
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
index 349f271..e9b74fc 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
@@ -381,7 +381,7 @@ public class RecursiveParserWrapperTest extends TikaTest {
-1);
RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(contentHandlerFactory,
- -1, tikaConfig.getMetadataFilter());
+ -1, -1, tikaConfig.getMetadataFilter());
try (InputStream is = getClass().getResourceAsStream(path)) {
wrapper.parse(is, handler, metadata, context);
}
diff --git a/tika-server/src/main/java/org/apache/tika/server/resource/RecursiveMetadataResource.java b/tika-server/src/main/java/org/apache/tika/server/resource/RecursiveMetadataResource.java
index ec37779..664f776 100644
--- a/tika-server/src/main/java/org/apache/tika/server/resource/RecursiveMetadataResource.java
+++ b/tika-server/src/main/java/org/apache/tika/server/resource/RecursiveMetadataResource.java
@@ -146,13 +146,13 @@ public class RecursiveMetadataResource {
int maxEmbeddedResources = -1;
if (httpHeaders.containsKey("maxEmbeddedResources")) {
- maxEmbeddedResources = Integer.parseInt(httpHeaders.getFirst("maxEmbeddedResources"));
+ maxEmbeddedResources = Integer.parseInt(httpHeaders.getFirst("maxEmbeddedResources"));
}
BasicContentHandlerFactory.HANDLER_TYPE type =
BasicContentHandlerFactory.parseHandlerType(handlerTypeName, DEFAULT_HANDLER_TYPE);
RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(
- new BasicContentHandlerFactory(type, writeLimit), maxEmbeddedResources,
+ new BasicContentHandlerFactory(type, writeLimit), maxEmbeddedResources, writeLimit,
TikaResource.getConfig().getMetadataFilter());
try {
TikaResource.parse(wrapper, LOG, info.getPath(), is, handler, metadata, context);
diff --git a/tika-server/src/test/java/org/apache/tika/server/RecursiveMetadataResourceTest.java b/tika-server/src/test/java/org/apache/tika/server/RecursiveMetadataResourceTest.java
index d0c84c7..6a70d34 100644
--- a/tika-server/src/test/java/org/apache/tika/server/RecursiveMetadataResourceTest.java
+++ b/tika-server/src/test/java/org/apache/tika/server/RecursiveMetadataResourceTest.java
@@ -18,7 +18,6 @@
package org.apache.tika.server;
import static java.nio.charset.StandardCharsets.UTF_8;
-import static org.apache.tika.TikaTest.assertNotContained;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertNull;
@@ -29,6 +28,7 @@ import javax.ws.rs.core.Response;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
+import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
@@ -342,6 +342,22 @@ public class RecursiveMetadataResourceTest extends CXFTestBase {
}
@Test
+ public void testJsonWriteLimitEmbedded() throws Exception {
+ for (int i = 500; i < 8500; i += 500) {
+ Response response = WebClient.create(endPoint + META_PATH + "/text").accept("application/json")
+ .header("writeLimit",
+ Integer.toString(i)).put(ClassLoader.getSystemResourceAsStream(TEST_RECURSIVE_DOC));
+ List<Metadata> metadata = JsonMetadataList.fromJson(
+ new InputStreamReader(((InputStream) response.getEntity()), StandardCharsets.UTF_8));
+ int len = 0;
+ for (Metadata m : metadata) {
+ len += m.get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT).length();
+ }
+ assertEquals(i, len);
+ }
+ }
+
+ @Test
public void testWriteLimit() throws Exception {
int writeLimit = 10;
Response response = WebClient.create(endPoint + META_PATH).accept("application/json")
@@ -356,7 +372,7 @@ public class RecursiveMetadataResourceTest extends CXFTestBase {
assertEquals("true", metadataList.get(0).get(AbstractRecursiveParserWrapperHandler.WRITE_LIMIT_REACHED));
//now try with a write limit of 1000
- writeLimit = 200;
+ writeLimit = 500;
response = WebClient.create(endPoint + META_PATH).accept("application/json")
.header("writeLimit", Integer.toString(writeLimit))
.put(ClassLoader.getSystemResourceAsStream(TEST_RECURSIVE_DOC));
@@ -365,9 +381,10 @@ public class RecursiveMetadataResourceTest extends CXFTestBase {
// Check results
reader = new InputStreamReader((InputStream) response.getEntity(), UTF_8);
metadataList = JsonMetadataList.fromJson(reader);
- assertEquals(12, metadataList.size());
+ assertEquals(10, metadataList.size());
+
assertEquals("true", metadataList.get(6).get(AbstractRecursiveParserWrapperHandler.WRITE_LIMIT_REACHED));
- assertContains("When in the Course of human events it becomes necessary for one people",
+ assertContains("When in the Course of human events it becomes necessary",
metadataList.get(6).get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT));
TikaTest.assertNotContained("We hold these truths",
metadataList.get(6).get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT));
diff --git a/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java b/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java
index 68d537d..7511821 100644
--- a/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java
+++ b/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java
@@ -626,6 +626,19 @@ public class TikaResourceTest extends CXFTestBase {
assertEquals(400, response.getStatus());
}
+ @Test
+ public void testJsonWriteLimitEmbedded() throws Exception {
+ for (int i = 0; i < 8500; i += 500) {
+ Response response = WebClient.create(endPoint + TIKA_PATH + "/text").accept("application/json")
+ .header("writeLimit",
+ Integer.toString(i)).put(ClassLoader.getSystemResourceAsStream(TEST_RECURSIVE_DOC));
+ Metadata metadata = JsonMetadata.fromJson(
+ new InputStreamReader(((InputStream) response.getEntity()), StandardCharsets.UTF_8));
+ int len = metadata.get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT).length();
+ assertEquals(i, len);
+ }
+ }
+
private MultipartBody testPDFLowerCaseOCRConfigPOSTBody() throws FileNotFoundException, URISyntaxException {
ContentDisposition cd = new ContentDisposition(
"form-data; name=\"input\"; filename=\"testOCR.pdf\"");