You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2021/04/29 17:26:08 UTC

[tika] 02/02: TIKA-3376 -- improve write limit reached handling in new /tika json output

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 9ac7e759b2007f541375ee2dedc736de5a555ccb
Author: tallison <ta...@apache.org>
AuthorDate: Thu Apr 29 13:25:51 2021 -0400

    TIKA-3376 -- improve write limit reached handling in new /tika json output
---
 .../tika/exception/WriteLimitReachedException.java | 15 +++++++++++++-
 .../apache/tika/parser/RecursiveParserWrapper.java | 23 ++--------------------
 .../tika/server/core/resource/TikaResource.java    | 10 ++++++++--
 .../tika/server/core/TikaResourceNoStackTest.java  | 19 ++++++++++++++----
 .../apache/tika/server/core/TikaResourceTest.java  |  1 +
 5 files changed, 40 insertions(+), 28 deletions(-)

diff --git a/tika-core/src/main/java/org/apache/tika/exception/WriteLimitReachedException.java b/tika-core/src/main/java/org/apache/tika/exception/WriteLimitReachedException.java
index 15225b4..5bf454f 100644
--- a/tika-core/src/main/java/org/apache/tika/exception/WriteLimitReachedException.java
+++ b/tika-core/src/main/java/org/apache/tika/exception/WriteLimitReachedException.java
@@ -20,6 +20,9 @@ import org.xml.sax.SAXException;
 
 public class WriteLimitReachedException extends SAXException {
 
+    //in case of (hopefully impossible) cyclic exception
+    private final static int MAX_DEPTH = 100;
+
     public WriteLimitReachedException(String msg) {
         super(msg);
     }
@@ -34,10 +37,20 @@ public class WriteLimitReachedException extends SAXException {
      * @since Apache Tika 2.0
      */
     public static boolean isWriteLimitReached(Throwable t) {
+        return isWriteLimitReached(t, 0);
+    }
+
+    private static boolean isWriteLimitReached(Throwable t, int depth) {
+        if (t == null) {
+            return false;
+        }
+        if (depth > MAX_DEPTH) {
+            return false;
+        }
         if (t instanceof WriteLimitReachedException) {
             return true;
         } else {
-            return t.getCause() != null && isWriteLimitReached(t.getCause());
+            return t.getCause() != null && isWriteLimitReached(t.getCause(), depth + 1);
         }
     }
 }
diff --git a/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java b/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
index 09309d1..c98c8fb 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
@@ -156,7 +156,7 @@ public class RecursiveParserWrapper extends ParserDecorator {
             context.set(RecursivelySecureContentHandler.class, secureContentHandler);
             getWrappedParser().parse(tis, secureContentHandler, metadata, context);
         } catch (SAXException e) {
-            boolean wlr = isWriteLimitReached(e);
+            boolean wlr = WriteLimitReachedException.isWriteLimitReached(e);
             if (wlr == false) {
                 throw e;
             }
@@ -176,25 +176,6 @@ public class RecursiveParserWrapper extends ParserDecorator {
         }
     }
 
-    /**
-     * Copied/modified from WriteOutContentHandler.  Couldn't make that
-     * static, and we need to have something that will work
-     * with exceptions thrown from both BodyContentHandler and WriteOutContentHandler
-     *
-     * @param t
-     * @return
-     */
-    private boolean isWriteLimitReached(Throwable t) {
-        if (t instanceof WriteLimitReachedException) {
-            return true;
-        } else if (t.getMessage() != null &&
-                t.getMessage().indexOf("Your document contained more than") == 0) {
-            return true;
-        } else {
-            return t.getCause() != null && isWriteLimitReached(t.getCause());
-        }
-    }
-
     private String getResourceName(Metadata metadata, ParserState state) {
         String objectName = "";
         if (metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY) != null) {
@@ -259,7 +240,7 @@ public class RecursiveParserWrapper extends ParserDecorator {
             try {
                 super.parse(stream, secureContentHandler, metadata, context);
             } catch (SAXException e) {
-                boolean wlr = isWriteLimitReached(e);
+                boolean wlr = WriteLimitReachedException.isWriteLimitReached(e);
                 if (wlr == true) {
                     metadata.add(TikaCoreProperties.WRITE_LIMIT_REACHED, "true");
                 } else {
diff --git a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java
index e7dcf83..1d52857 100644
--- a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java
+++ b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java
@@ -66,6 +66,7 @@ import org.apache.tika.Tika;
 import org.apache.tika.config.TikaConfig;
 import org.apache.tika.exception.EncryptedDocumentException;
 import org.apache.tika.exception.TikaException;
+import org.apache.tika.exception.WriteLimitReachedException;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.parser.AutoDetectParser;
@@ -574,8 +575,13 @@ public class TikaResource {
         try {
             parse(parser, LOG, info.getPath(), inputStream, contentHandler, metadata, context);
         } catch (TikaServerParseException e) {
+            Throwable cause = e.getCause();
+            boolean writeLimitReached = false;
+            if (WriteLimitReachedException.isWriteLimitReached(cause)) {
+                metadata.set(TikaCoreProperties.WRITE_LIMIT_REACHED, "true");
+                writeLimitReached = true;
+            }
             if (tikaServerConfig.isReturnStackTrace()) {
-                Throwable cause = e.getCause();
                 if (cause != null) {
                     metadata.add(TikaCoreProperties.CONTAINER_EXCEPTION,
                             ExceptionUtils.getStackTrace(cause));
@@ -583,7 +589,7 @@ public class TikaResource {
                     metadata.add(TikaCoreProperties.CONTAINER_EXCEPTION,
                             ExceptionUtils.getStackTrace(e));
                 }
-            } else {
+            } else if (! writeLimitReached) {
                 throw e;
             }
         } catch (OutOfMemoryError e) {
diff --git a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaResourceNoStackTest.java b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaResourceNoStackTest.java
index 43a3620..0588697 100644
--- a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaResourceNoStackTest.java
+++ b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaResourceNoStackTest.java
@@ -17,9 +17,12 @@
 
 package org.apache.tika.server.core;
 
+import static org.apache.tika.TikaTest.assertNotContained;
 import static org.junit.Assert.assertEquals;
 
 import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.nio.charset.StandardCharsets;
 import java.util.ArrayList;
 import java.util.List;
 import javax.ws.rs.core.Response;
@@ -29,6 +32,9 @@ import org.apache.cxf.jaxrs.client.WebClient;
 import org.apache.cxf.jaxrs.lifecycle.SingletonResourceProvider;
 import org.junit.Test;
 
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.metadata.serialization.JsonMetadata;
 import org.apache.tika.server.core.resource.TikaResource;
 import org.apache.tika.server.core.writer.JSONMessageBodyWriter;
 
@@ -71,7 +77,7 @@ public class TikaResourceNoStackTest extends CXFTestBase {
         Response response = WebClient.create(endPoint + TIKA_PATH).accept(
                 "application/json")
                 .put(ClassLoader.getSystemResourceAsStream(TEST_NULL_POINTER));
-        assertEquals(422, response.getStatus());
+        assertEquals(UNPROCESSEABLE, response.getStatus());
         String content = getStringFromInputStream((InputStream) response.getEntity());
         assertEquals(0, content.length());
     }
@@ -82,9 +88,14 @@ public class TikaResourceNoStackTest extends CXFTestBase {
                 .header("writeLimit", "100")
                 .accept("application/json")
                 .put(ClassLoader.getSystemResourceAsStream(TEST_HELLO_WORLD_LONG));
-        assertEquals(500, response.getStatus());
-        String content = getStringFromInputStream((InputStream) response.getEntity());
-        assertEquals(0, content.length());
+        assertEquals(200, response.getStatus());
+        Metadata metadata =
+                JsonMetadata.fromJson(new InputStreamReader((InputStream) response.getEntity(),
+                        StandardCharsets.UTF_8));
+        assertEquals("true", metadata.get(TikaCoreProperties.WRITE_LIMIT_REACHED));
+        assertContains("When in the Course of human events",
+                metadata.get(TikaCoreProperties.TIKA_CONTENT));
+        assertNotContained("political bands", metadata.get(TikaCoreProperties.TIKA_CONTENT));
     }
 
 }
diff --git a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaResourceTest.java b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaResourceTest.java
index fc392cb..e818dc7 100644
--- a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaResourceTest.java
+++ b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaResourceTest.java
@@ -171,6 +171,7 @@ public class TikaResourceTest extends CXFTestBase {
         assertTrue(metadata.get(TikaCoreProperties.CONTAINER_EXCEPTION).startsWith(
                 "org.apache.tika.exception.WriteLimitReachedException"
         ));
+        assertEquals("true", metadata.get(TikaCoreProperties.WRITE_LIMIT_REACHED));
     }
 
     @Test