You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2023/05/26 17:55:35 UTC

[tika] branch main updated: TIKA-4055 -- fix bug in writelimit checks in RecursiveParserWrapper and a separate bug in /rmeta (#1156)

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new f41d8c35a TIKA-4055 -- fix bug in writelimit checks in RecursiveParserWrapper and a separate bug in /rmeta (#1156)
f41d8c35a is described below

commit f41d8c35a78e845fc1adf548e8eea3df5463a63b
Author: Tim Allison <ta...@apache.org>
AuthorDate: Fri May 26 13:55:29 2023 -0400

    TIKA-4055 -- fix bug in writelimit checks in RecursiveParserWrapper and a separate bug in /rmeta (#1156)
---
 CHANGES.txt                                        |  6 +++
 .../apache/tika/parser/RecursiveParserWrapper.java |  2 +
 .../tika/parser/RecursiveParserWrapperTest.java    | 61 +++++++++++++++++++---
 .../src/test/resources/log4j.properties            |  2 +-
 .../core/resource/RecursiveMetadataResource.java   |  4 +-
 .../standard/RecursiveMetadataResourceTest.java    | 14 ++---
 6 files changed, 73 insertions(+), 16 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index b3ac0be3b..5526b5f86 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,3 +1,9 @@
+Release 2.8.1 - ???
+
+   * Fixed write limit bug in RecursiveParserWrapper (TIKA-4055).
+
+   * Add mime detection for many files (TIKA-3992).
+
 Release 2.8.0 - 5/11/2023
 
    * Enable counting and/or parsing of incremental updates in PDFs.  This
diff --git a/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java b/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
index 483181b0a..e8f029770 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
@@ -372,6 +372,7 @@ public class RecursiveParserWrapper extends ParserDecorator {
             }
             int availableLength = Math.min(totalWriteLimit - totalChars, length);
             super.characters(ch, start, availableLength);
+            totalChars += availableLength;
             if (availableLength < length) {
                 handleWriteLimitReached();
             }
@@ -389,6 +390,7 @@ public class RecursiveParserWrapper extends ParserDecorator {
             }
             int availableLength = Math.min(totalWriteLimit - totalChars, length);
             super.ignorableWhitespace(ch, start, availableLength);
+            totalChars += availableLength;
             if (availableLength < length) {
                 handleWriteLimitReached();
             }
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
index 24800926a..61eeab14d 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
@@ -93,14 +93,15 @@ public class RecursiveParserWrapperTest extends TikaTest {
 
         RecursiveParserWrapper wrapper = new RecursiveParserWrapper(AUTO_DETECT_PARSER);
         RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(
-                new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, 70));
+                new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT,
+                        70));
         try (InputStream stream =
                     getResourceAsStream("/test-documents/test_recursive_embedded.docx")) {
             wrapper.parse(stream, handler, metadata, context);
         }
         List<Metadata> list = handler.getMetadataList();
 
-        assertEquals(5, list.size());
+        assertEquals(2, list.size());
 
         int wlr = 0;
         for (Metadata m : list) {
@@ -112,15 +113,31 @@ public class RecursiveParserWrapperTest extends TikaTest {
         assertEquals(2, wlr);
     }
 
+    @Test
+    public void testOne() throws Exception {
+        ParseContext context = new ParseContext();
+        Metadata metadata = new Metadata();
+        int writeLimit = 100;
+        RecursiveParserWrapper wrapper = new RecursiveParserWrapper(AUTO_DETECT_PARSER);
+        RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(
+                new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT,
+                        writeLimit, false, context));
+        try (InputStream stream = getResourceAsStream(
+                "/test-documents/test_recursive_embedded" + ".docx")) {
+            wrapper.parse(stream, handler, metadata, context);
+        }
+        List<Metadata> list = handler.getMetadataList();
+        assertEquals(12, list.size());
+    }
     @Test
     public void testCharLimitNoThrowOnWriteLimit() throws Exception {
         ParseContext context = new ParseContext();
         Metadata metadata = new Metadata();
-
+        int writeLimit = 500;
         RecursiveParserWrapper wrapper = new RecursiveParserWrapper(AUTO_DETECT_PARSER);
         RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(
-                new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, 500,
-                        false, context));
+                new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT,
+                        writeLimit, false, context));
         try (InputStream stream = getResourceAsStream("/test-documents/test_recursive_embedded" +
                 ".docx")) {
             wrapper.parse(stream, handler, metadata, context);
@@ -131,11 +148,41 @@ public class RecursiveParserWrapperTest extends TikaTest {
 
         assertEquals("true", list.get(0).get(TikaCoreProperties.WRITE_LIMIT_REACHED));
 
-        assertContains("them to the separation", list.get(6).get(TikaCoreProperties.TIKA_CONTENT));
-        assertNotContained("unalienable Rights",
+        assertContains("dissolve the political", list.get(6).get(TikaCoreProperties.TIKA_CONTENT));
+        assertNotContained("them to the separation",
                 list.get(6).get(TikaCoreProperties.TIKA_CONTENT));
     }
 
+    @Test
+    public void testSpecificLimit() throws Exception {
+        int writeLimit = 60;
+
+        ParseContext context = new ParseContext();
+        Metadata metadata = new Metadata();
+
+        RecursiveParserWrapper wrapper = new RecursiveParserWrapper(AUTO_DETECT_PARSER);
+        RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(
+                new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT,
+                        writeLimit, false, context));
+        try (InputStream stream = getResourceAsStream("/test-documents/testRTFEmbeddedFiles.rtf")) {
+            wrapper.parse(stream, handler, metadata, context);
+        }
+        List<Metadata> list = handler.getMetadataList();
+        assertTrue(writeLimit >= getContentLength(list),
+                "writeLimit=" + writeLimit + " contentLength=" + getContentLength(list));
+    }
+
+    private int getContentLength(List<Metadata> metadataList) {
+        int sz = 0;
+        for (Metadata metadata : metadataList) {
+            String content = metadata.get(TikaCoreProperties.TIKA_CONTENT);
+            if (content != null) {
+                sz += content.length();
+            }
+        }
+        return sz;
+    }
+
     @Test
     public void testMaxEmbedded() throws Exception {
         int maxEmbedded = 4;
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/log4j.properties b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/log4j.properties
index 8c106427a..bd6faa7a8 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/log4j.properties
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/log4j.properties
@@ -15,7 +15,7 @@
 # limitations under the License.
 
 #info,debug, error,fatal ...
-log4j.rootLogger=info,stdout
+log4j.rootLogger=error,stdout
 
 #console
 log4j.appender.stdout=org.apache.log4j.ConsoleAppender
diff --git a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/RecursiveMetadataResource.java b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/RecursiveMetadataResource.java
index 76e24b926..ac4837110 100644
--- a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/RecursiveMetadataResource.java
+++ b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/RecursiveMetadataResource.java
@@ -69,9 +69,11 @@ public class RecursiveMetadataResource {
         fillMetadata(parser, metadata, httpHeaders);
         fillParseContext(httpHeaders, metadata, context);
         TikaResource.logRequest(LOG, "/rmeta", metadata);
+
         BasicContentHandlerFactory.HANDLER_TYPE type = handlerConfig.getType();
         RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(
-                new BasicContentHandlerFactory(type, handlerConfig.getWriteLimit()),
+                new BasicContentHandlerFactory(type, handlerConfig.getWriteLimit(),
+                        handlerConfig.isThrowOnWriteLimitReached(), context),
                 handlerConfig.getMaxEmbeddedResources(),
                 TikaResource.getConfig().getMetadataFilter());
         try {
diff --git a/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/RecursiveMetadataResourceTest.java b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/RecursiveMetadataResourceTest.java
index 3de5c0e65..691554edb 100644
--- a/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/RecursiveMetadataResourceTest.java
+++ b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/RecursiveMetadataResourceTest.java
@@ -345,8 +345,8 @@ public class RecursiveMetadataResourceTest extends CXFTestBase {
         assertEquals(1, metadataList.size());
         assertEquals("true", metadataList.get(0).get(TikaCoreProperties.WRITE_LIMIT_REACHED));
 
-        //now try with a write limit of 200
-        writeLimit = 200;
+        //now try with a write limit of 550
+        writeLimit = 550;
         response = WebClient.create(endPoint + META_PATH).accept("application/json")
                 .header("writeLimit", Integer.toString(writeLimit))
                 .put(ClassLoader.getSystemResourceAsStream(TEST_RECURSIVE_DOC));
@@ -390,11 +390,11 @@ public class RecursiveMetadataResourceTest extends CXFTestBase {
         // Check results
         Reader reader = new InputStreamReader((InputStream) response.getEntity(), UTF_8);
         List<Metadata> metadataList = JsonMetadataList.fromJson(reader);
-        assertEquals(10, metadataList.size());
+        assertEquals(12, metadataList.size());
         assertEquals("true", metadataList.get(0).get(TikaCoreProperties.WRITE_LIMIT_REACHED));
 
-        //now try with a write limit of 200
-        writeLimit = 200;
+        //now try with a write limit of 550
+        writeLimit = 550;
         response = WebClient.create(endPoint + META_PATH).accept("application/json")
                 .header("writeLimit", Integer.toString(writeLimit))
                 .header("throwOnWriteLimitReached", "false")
@@ -404,8 +404,8 @@ public class RecursiveMetadataResourceTest extends CXFTestBase {
         // Check results
         reader = new InputStreamReader((InputStream) response.getEntity(), UTF_8);
         metadataList = JsonMetadataList.fromJson(reader);
-        assertEquals(10, metadataList.size());
-        assertEquals("true", metadataList.get(6).get(TikaCoreProperties.WRITE_LIMIT_REACHED));
+        assertEquals(12, metadataList.size());
+        assertEquals("true", metadataList.get(0).get(TikaCoreProperties.WRITE_LIMIT_REACHED));
         assertContains("When in the Course of human events it becomes necessary for one people",
                 metadataList.get(6).get(TikaCoreProperties.TIKA_CONTENT));
         TikaTest.assertNotContained("We hold these truths",