You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2021/04/28 16:05:01 UTC

[tika] 04/05: TIKA-3372 -- fix write limit in recursive parser handler

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git

commit bc551b18da2fb540a360e3300660490a046b457b
Author: tballison <ta...@apache.org>
AuthorDate: Wed Apr 28 12:04:33 2021 -0400

    TIKA-3372 -- fix write limit in recursive parser handler
---
 .../src/main/java/org/apache/tika/cli/TikaCLI.java |  2 +-
 .../batch/fs/RecursiveParserWrapperFSConsumer.java |  2 +-
 .../apache/tika/parser/RecursiveParserWrapper.java | 15 +++++++------
 .../sax/AbstractRecursiveParserWrapperHandler.java | 11 ++++++++--
 .../tika/sax/RecursiveParserWrapperHandler.java    |  9 ++++----
 .../tika/parser/RecursiveParserWrapperTest.java    |  2 +-
 .../server/resource/RecursiveMetadataResource.java |  4 ++--
 .../tika/server/RecursiveMetadataResourceTest.java | 25 ++++++++++++++++++----
 .../org/apache/tika/server/TikaResourceTest.java   | 13 +++++++++++
 9 files changed, 61 insertions(+), 22 deletions(-)

diff --git a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
index 4b6c164..b115ef2 100644
--- a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
+++ b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
@@ -526,7 +526,7 @@ public class TikaCLI {
         RecursiveParserWrapper wrapper = new RecursiveParserWrapper(parser);
         RecursiveParserWrapperHandler handler =
                 new RecursiveParserWrapperHandler(getContentHandlerFactory(type),
-                        -1, config.getMetadataFilter());
+                        -1, -1, config.getMetadataFilter());
         try (InputStream input = TikaInputStream.get(url, metadata)) {
             wrapper.parse(input, handler, metadata, context);
         }
diff --git a/tika-batch/src/main/java/org/apache/tika/batch/fs/RecursiveParserWrapperFSConsumer.java b/tika-batch/src/main/java/org/apache/tika/batch/fs/RecursiveParserWrapperFSConsumer.java
index 9732781..aa8af8f 100644
--- a/tika-batch/src/main/java/org/apache/tika/batch/fs/RecursiveParserWrapperFSConsumer.java
+++ b/tika-batch/src/main/java/org/apache/tika/batch/fs/RecursiveParserWrapperFSConsumer.java
@@ -100,7 +100,7 @@ public class RecursiveParserWrapperFSConsumer extends AbstractFSConsumer {
         List<Metadata> metadataList = null;
         Metadata containerMetadata = fileResource.getMetadata();
         RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(contentHandlerFactory,
-                -1, metadataFilter);
+                -1, -1, metadataFilter);
         try {
             parse(fileResource.getResourceId(), parser, is, handler,
                     containerMetadata, context);
diff --git a/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java b/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
index 021fca3..95899a6 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
@@ -227,17 +227,15 @@ public class RecursiveParserWrapper extends ParserDecorator {
         long started = System.currentTimeMillis();
         parserState.recursiveParserWrapperHandler.startDocument();
         TemporaryResources tmp = new TemporaryResources();
-        int writeLimit = -1;
-        //TODO -- rely on a new interface WriteLimiting...?
-        //It'd be better not to tie this to a specific class
-        if (recursiveParserWrapperHandler instanceof BasicContentHandlerFactory) {
-            writeLimit =
-                    ((BasicContentHandlerFactory)recursiveParserWrapperHandler).getWriteLimit();
+        int totalWriteLimit = -1;
+        if (recursiveParserWrapperHandler instanceof AbstractRecursiveParserWrapperHandler) {
+            totalWriteLimit =
+                    ((AbstractRecursiveParserWrapperHandler)recursiveParserWrapperHandler).getTotalWriteLimit();
         }
         try {
             TikaInputStream tis = TikaInputStream.get(stream, tmp);
             RecursivelySecureContentHandler secureContentHandler =
-                        new RecursivelySecureContentHandler(localHandler, tis, writeLimit);
+                        new RecursivelySecureContentHandler(localHandler, tis, totalWriteLimit);
             context.set(RecursivelySecureContentHandler.class, secureContentHandler);
             getWrappedParser().parse(tis, secureContentHandler, metadata, context);
         } catch (SAXException e) {
@@ -493,6 +491,7 @@ public class RecursiveParserWrapper extends ParserDecorator {
             }
             int availableLength = Math.min(totalWriteLimit - totalChars, length);
             super.characters(ch, start, availableLength);
+            totalChars += availableLength;
             if (availableLength < length) {
                 throw new WriteLimitReached();
             }
@@ -506,9 +505,11 @@ public class RecursiveParserWrapper extends ParserDecorator {
             }
             int availableLength = Math.min(totalWriteLimit - totalChars, length);
             super.ignorableWhitespace(ch, start, availableLength);
+
             if (availableLength < length) {
                 throw new WriteLimitReached();
             }
+            totalChars += availableLength;
         }
     }
 
diff --git a/tika-core/src/main/java/org/apache/tika/sax/AbstractRecursiveParserWrapperHandler.java b/tika-core/src/main/java/org/apache/tika/sax/AbstractRecursiveParserWrapperHandler.java
index 8515f09..50c1eca 100644
--- a/tika-core/src/main/java/org/apache/tika/sax/AbstractRecursiveParserWrapperHandler.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/AbstractRecursiveParserWrapperHandler.java
@@ -63,16 +63,19 @@ public abstract class AbstractRecursiveParserWrapperHandler extends DefaultHandl
     private static final int MAX_DEPTH = 100;
 
     private final int maxEmbeddedResources;
+    private final int totalWriteLimit;
     private int embeddedResources = 0;
     private int embeddedDepth = 0;
 
     public AbstractRecursiveParserWrapperHandler(ContentHandlerFactory contentHandlerFactory) {
-        this(contentHandlerFactory, -1);
+        this(contentHandlerFactory, -1, -1);
     }
 
-    public AbstractRecursiveParserWrapperHandler(ContentHandlerFactory contentHandlerFactory, int maxEmbeddedResources) {
+    public AbstractRecursiveParserWrapperHandler(ContentHandlerFactory contentHandlerFactory,
+                                                 int maxEmbeddedResources, int totalWriteLimit) {
         this.contentHandlerFactory = contentHandlerFactory;
         this.maxEmbeddedResources = maxEmbeddedResources;
+        this.totalWriteLimit = totalWriteLimit;
     }
 
     public ContentHandler getNewContentHandler() {
@@ -142,4 +145,8 @@ public abstract class AbstractRecursiveParserWrapperHandler extends DefaultHandl
     public ContentHandlerFactory getContentHandlerFactory() {
         return contentHandlerFactory;
     }
+
+    public int getTotalWriteLimit() {
+        return totalWriteLimit;
+    }
 }
diff --git a/tika-core/src/main/java/org/apache/tika/sax/RecursiveParserWrapperHandler.java b/tika-core/src/main/java/org/apache/tika/sax/RecursiveParserWrapperHandler.java
index 50f0fb8..a0d620e 100644
--- a/tika-core/src/main/java/org/apache/tika/sax/RecursiveParserWrapperHandler.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/RecursiveParserWrapperHandler.java
@@ -49,7 +49,7 @@ public class RecursiveParserWrapperHandler extends AbstractRecursiveParserWrappe
      * Create a handler with no limit on the number of embedded resources
      */
     public RecursiveParserWrapperHandler(ContentHandlerFactory contentHandlerFactory) {
-        this(contentHandlerFactory, -1, NoOpFilter.NOOP_FILTER);
+        this(contentHandlerFactory, -1, -1, NoOpFilter.NOOP_FILTER);
     }
 
     /**
@@ -58,12 +58,13 @@ public class RecursiveParserWrapperHandler extends AbstractRecursiveParserWrappe
      * @param maxEmbeddedResources number of embedded resources that will be parsed
      */
     public RecursiveParserWrapperHandler(ContentHandlerFactory contentHandlerFactory, int maxEmbeddedResources) {
-        this(contentHandlerFactory, maxEmbeddedResources, NoOpFilter.NOOP_FILTER);
+        this(contentHandlerFactory, maxEmbeddedResources, -1, NoOpFilter.NOOP_FILTER);
     }
 
-    public RecursiveParserWrapperHandler(ContentHandlerFactory contentHandlerFactory, int maxEmbeddedResources,
+    public RecursiveParserWrapperHandler(ContentHandlerFactory contentHandlerFactory,
+                                         int maxEmbeddedResources, int maxWriteLimit,
                                          MetadataFilter metadataFilter) {
-        super(contentHandlerFactory, maxEmbeddedResources);
+        super(contentHandlerFactory, maxEmbeddedResources, maxWriteLimit);
         this.metadataFilter = metadataFilter;
     }
 
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
index 349f271..e9b74fc 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
@@ -381,7 +381,7 @@ public class RecursiveParserWrapperTest extends TikaTest {
                         -1);
 
         RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(contentHandlerFactory,
-                -1, tikaConfig.getMetadataFilter());
+                -1, -1, tikaConfig.getMetadataFilter());
         try (InputStream is = getClass().getResourceAsStream(path)) {
             wrapper.parse(is, handler, metadata, context);
         }
diff --git a/tika-server/src/main/java/org/apache/tika/server/resource/RecursiveMetadataResource.java b/tika-server/src/main/java/org/apache/tika/server/resource/RecursiveMetadataResource.java
index ec37779..664f776 100644
--- a/tika-server/src/main/java/org/apache/tika/server/resource/RecursiveMetadataResource.java
+++ b/tika-server/src/main/java/org/apache/tika/server/resource/RecursiveMetadataResource.java
@@ -146,13 +146,13 @@ public class RecursiveMetadataResource {
 
         int maxEmbeddedResources = -1;
         if (httpHeaders.containsKey("maxEmbeddedResources")) {
-        maxEmbeddedResources = Integer.parseInt(httpHeaders.getFirst("maxEmbeddedResources"));
+            maxEmbeddedResources = Integer.parseInt(httpHeaders.getFirst("maxEmbeddedResources"));
         }
 
         BasicContentHandlerFactory.HANDLER_TYPE type =
                 BasicContentHandlerFactory.parseHandlerType(handlerTypeName, DEFAULT_HANDLER_TYPE);
 		RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(
-		        new BasicContentHandlerFactory(type, writeLimit), maxEmbeddedResources,
+		        new BasicContentHandlerFactory(type, writeLimit), maxEmbeddedResources, writeLimit,
                 TikaResource.getConfig().getMetadataFilter());
 		try {
             TikaResource.parse(wrapper, LOG, info.getPath(), is, handler, metadata, context);
diff --git a/tika-server/src/test/java/org/apache/tika/server/RecursiveMetadataResourceTest.java b/tika-server/src/test/java/org/apache/tika/server/RecursiveMetadataResourceTest.java
index d0c84c7..6a70d34 100644
--- a/tika-server/src/test/java/org/apache/tika/server/RecursiveMetadataResourceTest.java
+++ b/tika-server/src/test/java/org/apache/tika/server/RecursiveMetadataResourceTest.java
@@ -18,7 +18,6 @@
 package org.apache.tika.server;
 
 import static java.nio.charset.StandardCharsets.UTF_8;
-import static org.apache.tika.TikaTest.assertNotContained;
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertNotNull;
 import static org.junit.Assert.assertNull;
@@ -29,6 +28,7 @@ import javax.ws.rs.core.Response;
 import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.io.Reader;
+import java.nio.charset.StandardCharsets;
 import java.util.ArrayList;
 import java.util.List;
 
@@ -342,6 +342,22 @@ public class RecursiveMetadataResourceTest extends CXFTestBase {
     }
 
     @Test
+    public void testJsonWriteLimitEmbedded() throws Exception {
+        for (int i = 500; i < 8500; i += 500) {
+            Response response = WebClient.create(endPoint + META_PATH + "/text").accept("application/json")
+                    .header("writeLimit",
+                            Integer.toString(i)).put(ClassLoader.getSystemResourceAsStream(TEST_RECURSIVE_DOC));
+            List<Metadata> metadata = JsonMetadataList.fromJson(
+                    new InputStreamReader(((InputStream) response.getEntity()), StandardCharsets.UTF_8));
+            int len = 0;
+            for (Metadata m : metadata) {
+                len += m.get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT).length();
+            }
+            assertEquals(i, len);
+        }
+    }
+
+    @Test
     public void testWriteLimit() throws Exception {
         int writeLimit = 10;
         Response response = WebClient.create(endPoint + META_PATH).accept("application/json")
@@ -356,7 +372,7 @@ public class RecursiveMetadataResourceTest extends CXFTestBase {
         assertEquals("true", metadataList.get(0).get(AbstractRecursiveParserWrapperHandler.WRITE_LIMIT_REACHED));
 
         //now try with a write limit of 1000
-        writeLimit = 200;
+        writeLimit = 500;
         response = WebClient.create(endPoint + META_PATH).accept("application/json")
                 .header("writeLimit", Integer.toString(writeLimit))
                 .put(ClassLoader.getSystemResourceAsStream(TEST_RECURSIVE_DOC));
@@ -365,9 +381,10 @@ public class RecursiveMetadataResourceTest extends CXFTestBase {
         // Check results
         reader = new InputStreamReader((InputStream) response.getEntity(), UTF_8);
         metadataList = JsonMetadataList.fromJson(reader);
-        assertEquals(12, metadataList.size());
+        assertEquals(10, metadataList.size());
+
         assertEquals("true", metadataList.get(6).get(AbstractRecursiveParserWrapperHandler.WRITE_LIMIT_REACHED));
-        assertContains("When in the Course of human events it becomes necessary for one people",
+        assertContains("When in the Course of human events it becomes necessary",
                 metadataList.get(6).get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT));
         TikaTest.assertNotContained("We hold these truths",
                 metadataList.get(6).get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT));
diff --git a/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java b/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java
index 68d537d..7511821 100644
--- a/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java
+++ b/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java
@@ -626,6 +626,19 @@ public class TikaResourceTest extends CXFTestBase {
         assertEquals(400, response.getStatus());
     }
 
+    @Test
+    public void testJsonWriteLimitEmbedded() throws Exception {
+        for (int i = 0; i < 8500; i += 500) {
+            Response response = WebClient.create(endPoint + TIKA_PATH + "/text").accept("application/json")
+                    .header("writeLimit",
+                            Integer.toString(i)).put(ClassLoader.getSystemResourceAsStream(TEST_RECURSIVE_DOC));
+            Metadata metadata = JsonMetadata.fromJson(
+                    new InputStreamReader(((InputStream) response.getEntity()), StandardCharsets.UTF_8));
+            int len = metadata.get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT).length();
+            assertEquals(i, len);
+        }
+    }
+
     private MultipartBody testPDFLowerCaseOCRConfigPOSTBody() throws FileNotFoundException, URISyntaxException {
         ContentDisposition cd = new ContentDisposition(
                 "form-data; name=\"input\"; filename=\"testOCR.pdf\"");