You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2023/05/26 16:13:27 UTC

[tika] branch TIKA-4055 updated (a2267afc6 -> a61e785a8)

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a change to branch TIKA-4055
in repository https://gitbox.apache.org/repos/asf/tika.git


 discard a2267afc6 TIKA-4002 -- add mime type detection for pcapng
     new a61e785a8 TIKA-4055 -- fix bug in writelimit checks in RecursiveParserWrapper and a separate bug in /rmeta

This update added new revisions after undoing existing revisions.
That is to say, some revisions that were in the old version of the
branch are not in the new version.  This situation occurs
when a user --force pushes a change and generates a repository
containing something like this:

 * -- * -- B -- O -- O -- O   (a2267afc6)
            \
             N -- N -- N   refs/heads/TIKA-4055 (a61e785a8)

You should already have received notification emails for all of the O
revisions, and so the following emails describe only the N revisions
from the common base, B.

Any revisions marked "omit" are not gone; other references still
refer to them.  Any revisions marked "discard" are gone forever.

The 1 revision listed above as "new" is entirely new to this
repository and will be described in a separate email.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:


[tika] 01/01: TIKA-4055 -- fix bug in writelimit checks in RecursiveParserWrapper and a separate bug in /rmeta

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-4055
in repository https://gitbox.apache.org/repos/asf/tika.git

commit a61e785a81078687d3593b6e00c89f1a9b0e07c4
Author: tballison <ta...@apache.org>
AuthorDate: Fri May 26 12:09:52 2023 -0400

    TIKA-4055 -- fix bug in writelimit checks in RecursiveParserWrapper and a separate bug in /rmeta
---
 CHANGES.txt                                        |  6 +++
 .../apache/tika/parser/RecursiveParserWrapper.java |  2 +
 .../tika/parser/RecursiveParserWrapperTest.java    | 61 +++++++++++++++++++---
 .../src/test/resources/log4j.properties            |  2 +-
 .../core/resource/RecursiveMetadataResource.java   |  4 +-
 .../standard/RecursiveMetadataResourceTest.java    | 14 ++---
 6 files changed, 73 insertions(+), 16 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index b3ac0be3b..5526b5f86 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,3 +1,9 @@
+Release 2.8.1 - ???
+
+   * Fixed write limit bug in RecursiveParserWrapper (TIKA-4055).
+
+   * Add mime detection for many files (TIKA-3992).
+
 Release 2.8.0 - 5/11/2023
 
    * Enable counting and/or parsing of incremental updates in PDFs.  This
diff --git a/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java b/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
index 483181b0a..e8f029770 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
@@ -372,6 +372,7 @@ public class RecursiveParserWrapper extends ParserDecorator {
             }
             int availableLength = Math.min(totalWriteLimit - totalChars, length);
             super.characters(ch, start, availableLength);
+            totalChars += availableLength;
             if (availableLength < length) {
                 handleWriteLimitReached();
             }
@@ -389,6 +390,7 @@ public class RecursiveParserWrapper extends ParserDecorator {
             }
             int availableLength = Math.min(totalWriteLimit - totalChars, length);
             super.ignorableWhitespace(ch, start, availableLength);
+            totalChars += availableLength;
             if (availableLength < length) {
                 handleWriteLimitReached();
             }
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
index 24800926a..61eeab14d 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
@@ -93,14 +93,15 @@ public class RecursiveParserWrapperTest extends TikaTest {
 
         RecursiveParserWrapper wrapper = new RecursiveParserWrapper(AUTO_DETECT_PARSER);
         RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(
-                new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, 70));
+                new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT,
+                        70));
         try (InputStream stream =
                     getResourceAsStream("/test-documents/test_recursive_embedded.docx")) {
             wrapper.parse(stream, handler, metadata, context);
         }
         List<Metadata> list = handler.getMetadataList();
 
-        assertEquals(5, list.size());
+        assertEquals(2, list.size());
 
         int wlr = 0;
         for (Metadata m : list) {
@@ -112,15 +113,31 @@ public class RecursiveParserWrapperTest extends TikaTest {
         assertEquals(2, wlr);
     }
 
+    @Test
+    public void testOne() throws Exception {
+        ParseContext context = new ParseContext();
+        Metadata metadata = new Metadata();
+        int writeLimit = 100;
+        RecursiveParserWrapper wrapper = new RecursiveParserWrapper(AUTO_DETECT_PARSER);
+        RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(
+                new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT,
+                        writeLimit, false, context));
+        try (InputStream stream = getResourceAsStream(
+                "/test-documents/test_recursive_embedded" + ".docx")) {
+            wrapper.parse(stream, handler, metadata, context);
+        }
+        List<Metadata> list = handler.getMetadataList();
+        assertEquals(12, list.size());
+    }
     @Test
     public void testCharLimitNoThrowOnWriteLimit() throws Exception {
         ParseContext context = new ParseContext();
         Metadata metadata = new Metadata();
-
+        int writeLimit = 500;
         RecursiveParserWrapper wrapper = new RecursiveParserWrapper(AUTO_DETECT_PARSER);
         RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(
-                new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, 500,
-                        false, context));
+                new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT,
+                        writeLimit, false, context));
         try (InputStream stream = getResourceAsStream("/test-documents/test_recursive_embedded" +
                 ".docx")) {
             wrapper.parse(stream, handler, metadata, context);
@@ -131,11 +148,41 @@ public class RecursiveParserWrapperTest extends TikaTest {
 
         assertEquals("true", list.get(0).get(TikaCoreProperties.WRITE_LIMIT_REACHED));
 
-        assertContains("them to the separation", list.get(6).get(TikaCoreProperties.TIKA_CONTENT));
-        assertNotContained("unalienable Rights",
+        assertContains("dissolve the political", list.get(6).get(TikaCoreProperties.TIKA_CONTENT));
+        assertNotContained("them to the separation",
                 list.get(6).get(TikaCoreProperties.TIKA_CONTENT));
     }
 
+    @Test
+    public void testSpecificLimit() throws Exception {
+        int writeLimit = 60;
+
+        ParseContext context = new ParseContext();
+        Metadata metadata = new Metadata();
+
+        RecursiveParserWrapper wrapper = new RecursiveParserWrapper(AUTO_DETECT_PARSER);
+        RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(
+                new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT,
+                        writeLimit, false, context));
+        try (InputStream stream = getResourceAsStream("/test-documents/testRTFEmbeddedFiles.rtf")) {
+            wrapper.parse(stream, handler, metadata, context);
+        }
+        List<Metadata> list = handler.getMetadataList();
+        assertTrue(writeLimit >= getContentLength(list),
+                "writeLimit=" + writeLimit + " contentLength=" + getContentLength(list));
+    }
+
+    private int getContentLength(List<Metadata> metadataList) {
+        int sz = 0;
+        for (Metadata metadata : metadataList) {
+            String content = metadata.get(TikaCoreProperties.TIKA_CONTENT);
+            if (content != null) {
+                sz += content.length();
+            }
+        }
+        return sz;
+    }
+
     @Test
     public void testMaxEmbedded() throws Exception {
         int maxEmbedded = 4;
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/log4j.properties b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/log4j.properties
index 8c106427a..bd6faa7a8 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/log4j.properties
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/log4j.properties
@@ -15,7 +15,7 @@
 # limitations under the License.
 
 #info,debug, error,fatal ...
-log4j.rootLogger=info,stdout
+log4j.rootLogger=error,stdout
 
 #console
 log4j.appender.stdout=org.apache.log4j.ConsoleAppender
diff --git a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/RecursiveMetadataResource.java b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/RecursiveMetadataResource.java
index 76e24b926..ac4837110 100644
--- a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/RecursiveMetadataResource.java
+++ b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/RecursiveMetadataResource.java
@@ -69,9 +69,11 @@ public class RecursiveMetadataResource {
         fillMetadata(parser, metadata, httpHeaders);
         fillParseContext(httpHeaders, metadata, context);
         TikaResource.logRequest(LOG, "/rmeta", metadata);
+
         BasicContentHandlerFactory.HANDLER_TYPE type = handlerConfig.getType();
         RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(
-                new BasicContentHandlerFactory(type, handlerConfig.getWriteLimit()),
+                new BasicContentHandlerFactory(type, handlerConfig.getWriteLimit(),
+                        handlerConfig.isThrowOnWriteLimitReached(), context),
                 handlerConfig.getMaxEmbeddedResources(),
                 TikaResource.getConfig().getMetadataFilter());
         try {
diff --git a/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/RecursiveMetadataResourceTest.java b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/RecursiveMetadataResourceTest.java
index 3de5c0e65..691554edb 100644
--- a/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/RecursiveMetadataResourceTest.java
+++ b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/RecursiveMetadataResourceTest.java
@@ -345,8 +345,8 @@ public class RecursiveMetadataResourceTest extends CXFTestBase {
         assertEquals(1, metadataList.size());
         assertEquals("true", metadataList.get(0).get(TikaCoreProperties.WRITE_LIMIT_REACHED));
 
-        //now try with a write limit of 200
-        writeLimit = 200;
+        //now try with a write limit of 550
+        writeLimit = 550;
         response = WebClient.create(endPoint + META_PATH).accept("application/json")
                 .header("writeLimit", Integer.toString(writeLimit))
                 .put(ClassLoader.getSystemResourceAsStream(TEST_RECURSIVE_DOC));
@@ -390,11 +390,11 @@ public class RecursiveMetadataResourceTest extends CXFTestBase {
         // Check results
         Reader reader = new InputStreamReader((InputStream) response.getEntity(), UTF_8);
         List<Metadata> metadataList = JsonMetadataList.fromJson(reader);
-        assertEquals(10, metadataList.size());
+        assertEquals(12, metadataList.size());
         assertEquals("true", metadataList.get(0).get(TikaCoreProperties.WRITE_LIMIT_REACHED));
 
-        //now try with a write limit of 200
-        writeLimit = 200;
+        //now try with a write limit of 550
+        writeLimit = 550;
         response = WebClient.create(endPoint + META_PATH).accept("application/json")
                 .header("writeLimit", Integer.toString(writeLimit))
                 .header("throwOnWriteLimitReached", "false")
@@ -404,8 +404,8 @@ public class RecursiveMetadataResourceTest extends CXFTestBase {
         // Check results
         reader = new InputStreamReader((InputStream) response.getEntity(), UTF_8);
         metadataList = JsonMetadataList.fromJson(reader);
-        assertEquals(10, metadataList.size());
-        assertEquals("true", metadataList.get(6).get(TikaCoreProperties.WRITE_LIMIT_REACHED));
+        assertEquals(12, metadataList.size());
+        assertEquals("true", metadataList.get(0).get(TikaCoreProperties.WRITE_LIMIT_REACHED));
         assertContains("When in the Course of human events it becomes necessary for one people",
                 metadataList.get(6).get(TikaCoreProperties.TIKA_CONTENT));
         TikaTest.assertNotContained("We hold these truths",