You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2023/05/26 17:55:35 UTC
[tika] branch main updated: TIKA-4055 -- fix bug in writelimit checks in RecursiveParserWrapper and a separate bug in /rmeta (#1156)
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new f41d8c35a TIKA-4055 -- fix bug in writelimit checks in RecursiveParserWrapper and a separate bug in /rmeta (#1156)
f41d8c35a is described below
commit f41d8c35a78e845fc1adf548e8eea3df5463a63b
Author: Tim Allison <ta...@apache.org>
AuthorDate: Fri May 26 13:55:29 2023 -0400
TIKA-4055 -- fix bug in writelimit checks in RecursiveParserWrapper and a separate bug in /rmeta (#1156)
---
CHANGES.txt | 6 +++
.../apache/tika/parser/RecursiveParserWrapper.java | 2 +
.../tika/parser/RecursiveParserWrapperTest.java | 61 +++++++++++++++++++---
.../src/test/resources/log4j.properties | 2 +-
.../core/resource/RecursiveMetadataResource.java | 4 +-
.../standard/RecursiveMetadataResourceTest.java | 14 ++---
6 files changed, 73 insertions(+), 16 deletions(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index b3ac0be3b..5526b5f86 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,3 +1,9 @@
+Release 2.8.1 - ???
+
+ * Fixed write limit bug in RecursiveParserWrapper (TIKA-4055).
+
+ * Add mime detection for many files (TIKA-3992).
+
Release 2.8.0 - 5/11/2023
* Enable counting and/or parsing of incremental updates in PDFs. This
diff --git a/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java b/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
index 483181b0a..e8f029770 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
@@ -372,6 +372,7 @@ public class RecursiveParserWrapper extends ParserDecorator {
}
int availableLength = Math.min(totalWriteLimit - totalChars, length);
super.characters(ch, start, availableLength);
+ totalChars += availableLength;
if (availableLength < length) {
handleWriteLimitReached();
}
@@ -389,6 +390,7 @@ public class RecursiveParserWrapper extends ParserDecorator {
}
int availableLength = Math.min(totalWriteLimit - totalChars, length);
super.ignorableWhitespace(ch, start, availableLength);
+ totalChars += availableLength;
if (availableLength < length) {
handleWriteLimitReached();
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
index 24800926a..61eeab14d 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
@@ -93,14 +93,15 @@ public class RecursiveParserWrapperTest extends TikaTest {
RecursiveParserWrapper wrapper = new RecursiveParserWrapper(AUTO_DETECT_PARSER);
RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(
- new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, 70));
+ new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT,
+ 70));
try (InputStream stream =
getResourceAsStream("/test-documents/test_recursive_embedded.docx")) {
wrapper.parse(stream, handler, metadata, context);
}
List<Metadata> list = handler.getMetadataList();
- assertEquals(5, list.size());
+ assertEquals(2, list.size());
int wlr = 0;
for (Metadata m : list) {
@@ -112,15 +113,31 @@ public class RecursiveParserWrapperTest extends TikaTest {
assertEquals(2, wlr);
}
+ @Test
+ public void testOne() throws Exception {
+ ParseContext context = new ParseContext();
+ Metadata metadata = new Metadata();
+ int writeLimit = 100;
+ RecursiveParserWrapper wrapper = new RecursiveParserWrapper(AUTO_DETECT_PARSER);
+ RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(
+ new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT,
+ writeLimit, false, context));
+ try (InputStream stream = getResourceAsStream(
+ "/test-documents/test_recursive_embedded" + ".docx")) {
+ wrapper.parse(stream, handler, metadata, context);
+ }
+ List<Metadata> list = handler.getMetadataList();
+ assertEquals(12, list.size());
+ }
@Test
public void testCharLimitNoThrowOnWriteLimit() throws Exception {
ParseContext context = new ParseContext();
Metadata metadata = new Metadata();
-
+ int writeLimit = 500;
RecursiveParserWrapper wrapper = new RecursiveParserWrapper(AUTO_DETECT_PARSER);
RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(
- new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, 500,
- false, context));
+ new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT,
+ writeLimit, false, context));
try (InputStream stream = getResourceAsStream("/test-documents/test_recursive_embedded" +
".docx")) {
wrapper.parse(stream, handler, metadata, context);
@@ -131,11 +148,41 @@ public class RecursiveParserWrapperTest extends TikaTest {
assertEquals("true", list.get(0).get(TikaCoreProperties.WRITE_LIMIT_REACHED));
- assertContains("them to the separation", list.get(6).get(TikaCoreProperties.TIKA_CONTENT));
- assertNotContained("unalienable Rights",
+ assertContains("dissolve the political", list.get(6).get(TikaCoreProperties.TIKA_CONTENT));
+ assertNotContained("them to the separation",
list.get(6).get(TikaCoreProperties.TIKA_CONTENT));
}
+ @Test
+ public void testSpecificLimit() throws Exception {
+ int writeLimit = 60;
+
+ ParseContext context = new ParseContext();
+ Metadata metadata = new Metadata();
+
+ RecursiveParserWrapper wrapper = new RecursiveParserWrapper(AUTO_DETECT_PARSER);
+ RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(
+ new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT,
+ writeLimit, false, context));
+ try (InputStream stream = getResourceAsStream("/test-documents/testRTFEmbeddedFiles.rtf")) {
+ wrapper.parse(stream, handler, metadata, context);
+ }
+ List<Metadata> list = handler.getMetadataList();
+ assertTrue(writeLimit >= getContentLength(list),
+ "writeLimit=" + writeLimit + " contentLength=" + getContentLength(list));
+ }
+
+ private int getContentLength(List<Metadata> metadataList) {
+ int sz = 0;
+ for (Metadata metadata : metadataList) {
+ String content = metadata.get(TikaCoreProperties.TIKA_CONTENT);
+ if (content != null) {
+ sz += content.length();
+ }
+ }
+ return sz;
+ }
+
@Test
public void testMaxEmbedded() throws Exception {
int maxEmbedded = 4;
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/log4j.properties b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/log4j.properties
index 8c106427a..bd6faa7a8 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/log4j.properties
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/log4j.properties
@@ -15,7 +15,7 @@
# limitations under the License.
#info,debug, error,fatal ...
-log4j.rootLogger=info,stdout
+log4j.rootLogger=error,stdout
#console
log4j.appender.stdout=org.apache.log4j.ConsoleAppender
diff --git a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/RecursiveMetadataResource.java b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/RecursiveMetadataResource.java
index 76e24b926..ac4837110 100644
--- a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/RecursiveMetadataResource.java
+++ b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/RecursiveMetadataResource.java
@@ -69,9 +69,11 @@ public class RecursiveMetadataResource {
fillMetadata(parser, metadata, httpHeaders);
fillParseContext(httpHeaders, metadata, context);
TikaResource.logRequest(LOG, "/rmeta", metadata);
+
BasicContentHandlerFactory.HANDLER_TYPE type = handlerConfig.getType();
RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(
- new BasicContentHandlerFactory(type, handlerConfig.getWriteLimit()),
+ new BasicContentHandlerFactory(type, handlerConfig.getWriteLimit(),
+ handlerConfig.isThrowOnWriteLimitReached(), context),
handlerConfig.getMaxEmbeddedResources(),
TikaResource.getConfig().getMetadataFilter());
try {
diff --git a/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/RecursiveMetadataResourceTest.java b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/RecursiveMetadataResourceTest.java
index 3de5c0e65..691554edb 100644
--- a/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/RecursiveMetadataResourceTest.java
+++ b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/RecursiveMetadataResourceTest.java
@@ -345,8 +345,8 @@ public class RecursiveMetadataResourceTest extends CXFTestBase {
assertEquals(1, metadataList.size());
assertEquals("true", metadataList.get(0).get(TikaCoreProperties.WRITE_LIMIT_REACHED));
- //now try with a write limit of 200
- writeLimit = 200;
+ //now try with a write limit of 550
+ writeLimit = 550;
response = WebClient.create(endPoint + META_PATH).accept("application/json")
.header("writeLimit", Integer.toString(writeLimit))
.put(ClassLoader.getSystemResourceAsStream(TEST_RECURSIVE_DOC));
@@ -390,11 +390,11 @@ public class RecursiveMetadataResourceTest extends CXFTestBase {
// Check results
Reader reader = new InputStreamReader((InputStream) response.getEntity(), UTF_8);
List<Metadata> metadataList = JsonMetadataList.fromJson(reader);
- assertEquals(10, metadataList.size());
+ assertEquals(12, metadataList.size());
assertEquals("true", metadataList.get(0).get(TikaCoreProperties.WRITE_LIMIT_REACHED));
- //now try with a write limit of 200
- writeLimit = 200;
+ //now try with a write limit of 550
+ writeLimit = 550;
response = WebClient.create(endPoint + META_PATH).accept("application/json")
.header("writeLimit", Integer.toString(writeLimit))
.header("throwOnWriteLimitReached", "false")
@@ -404,8 +404,8 @@ public class RecursiveMetadataResourceTest extends CXFTestBase {
// Check results
reader = new InputStreamReader((InputStream) response.getEntity(), UTF_8);
metadataList = JsonMetadataList.fromJson(reader);
- assertEquals(10, metadataList.size());
- assertEquals("true", metadataList.get(6).get(TikaCoreProperties.WRITE_LIMIT_REACHED));
+ assertEquals(12, metadataList.size());
+ assertEquals("true", metadataList.get(0).get(TikaCoreProperties.WRITE_LIMIT_REACHED));
assertContains("When in the Course of human events it becomes necessary for one people",
metadataList.get(6).get(TikaCoreProperties.TIKA_CONTENT));
TikaTest.assertNotContained("We hold these truths",