You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2021/04/29 17:26:08 UTC
[tika] 02/02: TIKA-3376 -- improve write limit reached handling in new /tika json output
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
commit 9ac7e759b2007f541375ee2dedc736de5a555ccb
Author: tallison <ta...@apache.org>
AuthorDate: Thu Apr 29 13:25:51 2021 -0400
TIKA-3376 -- improve write limit reached handling in new /tika json output
---
.../tika/exception/WriteLimitReachedException.java | 15 +++++++++++++-
.../apache/tika/parser/RecursiveParserWrapper.java | 23 ++--------------------
.../tika/server/core/resource/TikaResource.java | 10 ++++++++--
.../tika/server/core/TikaResourceNoStackTest.java | 19 ++++++++++++++----
.../apache/tika/server/core/TikaResourceTest.java | 1 +
5 files changed, 40 insertions(+), 28 deletions(-)
diff --git a/tika-core/src/main/java/org/apache/tika/exception/WriteLimitReachedException.java b/tika-core/src/main/java/org/apache/tika/exception/WriteLimitReachedException.java
index 15225b4..5bf454f 100644
--- a/tika-core/src/main/java/org/apache/tika/exception/WriteLimitReachedException.java
+++ b/tika-core/src/main/java/org/apache/tika/exception/WriteLimitReachedException.java
@@ -20,6 +20,9 @@ import org.xml.sax.SAXException;
public class WriteLimitReachedException extends SAXException {
+ //in case of (hopefully impossible) cyclic exception
+ private final static int MAX_DEPTH = 100;
+
public WriteLimitReachedException(String msg) {
super(msg);
}
@@ -34,10 +37,20 @@ public class WriteLimitReachedException extends SAXException {
* @since Apache Tika 2.0
*/
public static boolean isWriteLimitReached(Throwable t) {
+ return isWriteLimitReached(t, 0);
+ }
+
+ private static boolean isWriteLimitReached(Throwable t, int depth) {
+ if (t == null) {
+ return false;
+ }
+ if (depth > MAX_DEPTH) {
+ return false;
+ }
if (t instanceof WriteLimitReachedException) {
return true;
} else {
- return t.getCause() != null && isWriteLimitReached(t.getCause());
+ return t.getCause() != null && isWriteLimitReached(t.getCause(), depth + 1);
}
}
}
diff --git a/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java b/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
index 09309d1..c98c8fb 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
@@ -156,7 +156,7 @@ public class RecursiveParserWrapper extends ParserDecorator {
context.set(RecursivelySecureContentHandler.class, secureContentHandler);
getWrappedParser().parse(tis, secureContentHandler, metadata, context);
} catch (SAXException e) {
- boolean wlr = isWriteLimitReached(e);
+ boolean wlr = WriteLimitReachedException.isWriteLimitReached(e);
if (wlr == false) {
throw e;
}
@@ -176,25 +176,6 @@ public class RecursiveParserWrapper extends ParserDecorator {
}
}
- /**
- * Copied/modified from WriteOutContentHandler. Couldn't make that
- * static, and we need to have something that will work
- * with exceptions thrown from both BodyContentHandler and WriteOutContentHandler
- *
- * @param t
- * @return
- */
- private boolean isWriteLimitReached(Throwable t) {
- if (t instanceof WriteLimitReachedException) {
- return true;
- } else if (t.getMessage() != null &&
- t.getMessage().indexOf("Your document contained more than") == 0) {
- return true;
- } else {
- return t.getCause() != null && isWriteLimitReached(t.getCause());
- }
- }
-
private String getResourceName(Metadata metadata, ParserState state) {
String objectName = "";
if (metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY) != null) {
@@ -259,7 +240,7 @@ public class RecursiveParserWrapper extends ParserDecorator {
try {
super.parse(stream, secureContentHandler, metadata, context);
} catch (SAXException e) {
- boolean wlr = isWriteLimitReached(e);
+ boolean wlr = WriteLimitReachedException.isWriteLimitReached(e);
if (wlr == true) {
metadata.add(TikaCoreProperties.WRITE_LIMIT_REACHED, "true");
} else {
diff --git a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java
index e7dcf83..1d52857 100644
--- a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java
+++ b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java
@@ -66,6 +66,7 @@ import org.apache.tika.Tika;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.exception.EncryptedDocumentException;
import org.apache.tika.exception.TikaException;
+import org.apache.tika.exception.WriteLimitReachedException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.AutoDetectParser;
@@ -574,8 +575,13 @@ public class TikaResource {
try {
parse(parser, LOG, info.getPath(), inputStream, contentHandler, metadata, context);
} catch (TikaServerParseException e) {
+ Throwable cause = e.getCause();
+ boolean writeLimitReached = false;
+ if (WriteLimitReachedException.isWriteLimitReached(cause)) {
+ metadata.set(TikaCoreProperties.WRITE_LIMIT_REACHED, "true");
+ writeLimitReached = true;
+ }
if (tikaServerConfig.isReturnStackTrace()) {
- Throwable cause = e.getCause();
if (cause != null) {
metadata.add(TikaCoreProperties.CONTAINER_EXCEPTION,
ExceptionUtils.getStackTrace(cause));
@@ -583,7 +589,7 @@ public class TikaResource {
metadata.add(TikaCoreProperties.CONTAINER_EXCEPTION,
ExceptionUtils.getStackTrace(e));
}
- } else {
+ } else if (! writeLimitReached) {
throw e;
}
} catch (OutOfMemoryError e) {
diff --git a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaResourceNoStackTest.java b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaResourceNoStackTest.java
index 43a3620..0588697 100644
--- a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaResourceNoStackTest.java
+++ b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaResourceNoStackTest.java
@@ -17,9 +17,12 @@
package org.apache.tika.server.core;
+import static org.apache.tika.TikaTest.assertNotContained;
import static org.junit.Assert.assertEquals;
import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import javax.ws.rs.core.Response;
@@ -29,6 +32,9 @@ import org.apache.cxf.jaxrs.client.WebClient;
import org.apache.cxf.jaxrs.lifecycle.SingletonResourceProvider;
import org.junit.Test;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.metadata.serialization.JsonMetadata;
import org.apache.tika.server.core.resource.TikaResource;
import org.apache.tika.server.core.writer.JSONMessageBodyWriter;
@@ -71,7 +77,7 @@ public class TikaResourceNoStackTest extends CXFTestBase {
Response response = WebClient.create(endPoint + TIKA_PATH).accept(
"application/json")
.put(ClassLoader.getSystemResourceAsStream(TEST_NULL_POINTER));
- assertEquals(422, response.getStatus());
+ assertEquals(UNPROCESSEABLE, response.getStatus());
String content = getStringFromInputStream((InputStream) response.getEntity());
assertEquals(0, content.length());
}
@@ -82,9 +88,14 @@ public class TikaResourceNoStackTest extends CXFTestBase {
.header("writeLimit", "100")
.accept("application/json")
.put(ClassLoader.getSystemResourceAsStream(TEST_HELLO_WORLD_LONG));
- assertEquals(500, response.getStatus());
- String content = getStringFromInputStream((InputStream) response.getEntity());
- assertEquals(0, content.length());
+ assertEquals(200, response.getStatus());
+ Metadata metadata =
+ JsonMetadata.fromJson(new InputStreamReader((InputStream) response.getEntity(),
+ StandardCharsets.UTF_8));
+ assertEquals("true", metadata.get(TikaCoreProperties.WRITE_LIMIT_REACHED));
+ assertContains("When in the Course of human events",
+ metadata.get(TikaCoreProperties.TIKA_CONTENT));
+ assertNotContained("political bands", metadata.get(TikaCoreProperties.TIKA_CONTENT));
}
}
diff --git a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaResourceTest.java b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaResourceTest.java
index fc392cb..e818dc7 100644
--- a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaResourceTest.java
+++ b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaResourceTest.java
@@ -171,6 +171,7 @@ public class TikaResourceTest extends CXFTestBase {
assertTrue(metadata.get(TikaCoreProperties.CONTAINER_EXCEPTION).startsWith(
"org.apache.tika.exception.WriteLimitReachedException"
));
+ assertEquals("true", metadata.get(TikaCoreProperties.WRITE_LIMIT_REACHED));
}
@Test