You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2021/04/30 18:51:16 UTC

[tika] branch branch_1x updated: TIKA-3372 -- fix write limit handling in the PDFParser

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/branch_1x by this push:
     new 2f80958  TIKA-3372 -- fix write limit handling in the PDFParser
2f80958 is described below

commit 2f80958c5e2264dd4b9007f331d50d1012cc8fa0
Author: tballison <ta...@apache.org>
AuthorDate: Fri Apr 30 14:49:06 2021 -0400

    TIKA-3372 -- fix write limit handling in the PDFParser
---
 .../apache/tika/parser/RecursiveParserWrapper.java | 17 +++++++++++++---
 .../apache/tika/parser/pdf/AbstractPDF2XHTML.java  | 23 ++++++++++++++++++----
 .../java/org/apache/tika/parser/pdf/PDF2XHTML.java | 13 ++++++++++++
 .../tika/server/RecursiveMetadataResourceTest.java | 15 ++++++++++++++
 .../org/apache/tika/server/TikaResourceTest.java   | 19 ++++++++++++++++++
 5 files changed, 80 insertions(+), 7 deletions(-)

diff --git a/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java b/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
index 95899a6..6992231 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
@@ -493,7 +493,7 @@ public class RecursiveParserWrapper extends ParserDecorator {
             super.characters(ch, start, availableLength);
             totalChars += availableLength;
             if (availableLength < length) {
-                throw new WriteLimitReached();
+                throw new WriteLimitReached(totalWriteLimit);
             }
         }
 
@@ -507,13 +507,24 @@ public class RecursiveParserWrapper extends ParserDecorator {
             super.ignorableWhitespace(ch, start, availableLength);
 
             if (availableLength < length) {
-                throw new WriteLimitReached();
+                throw new WriteLimitReached(totalWriteLimit);
             }
             totalChars += availableLength;
         }
     }
 
     private static class WriteLimitReached extends SAXException {
-
+        final int writeLimit;
+        WriteLimitReached(int writeLimit) {
+            this.writeLimit = writeLimit;
+        }
+        @Override
+        public String getMessage() {
+            return "Your document contained more than " + writeLimit
+                    + " characters, and so your requested limit has been"
+                    + " reached. To receive the full text of the document,"
+                    + " increase your limit. (Text up to the limit is"
+                    + " however available).";
+        }
     }
 }
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
index f930c61..2cc610e 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
@@ -99,6 +99,7 @@ import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.RecursiveParserWrapper;
 import org.apache.tika.parser.ocr.TesseractOCRConfig;
 import org.apache.tika.parser.ocr.TesseractOCRParser;
 import org.apache.tika.sax.EmbeddedContentHandler;
@@ -425,10 +426,8 @@ class AbstractPDF2XHTML extends PDFTextStripper {
 
     void handleCatchableIOE(IOException e) throws IOException {
         if (config.getCatchIntermediateIOExceptions()) {
-            if (e.getCause() instanceof SAXException && e.getCause().getMessage() != null &&
-                    e.getCause().getMessage().contains("Your document contained more than")) {
-                //TODO -- is there a cleaner way of checking for:
-                // WriteOutContentHandler.WriteLimitReachedException?
+
+            if (isWriteLimitReached(e, 0)) {
                 throw e;
             }
 
@@ -443,6 +442,22 @@ class AbstractPDF2XHTML extends PDFTextStripper {
         }
     }
 
+    boolean isWriteLimitReached(Throwable t, int depth) {
+        if (depth > MAX_RECURSION_DEPTH) {
+            return false;
+        }
+        if (t == null) {
+            return false;
+        }
+        if (t instanceof SAXException) {
+
+            String msg = t.getMessage();
+            if (msg != null && msg.contains("Your document contained more than")) {
+                return true;
+            }
+        }
+        return isWriteLimitReached(t.getCause(), depth + 1);
+    }
     void doOCROnCurrentPage() throws IOException, TikaException, SAXException {
         if (config.getOcrStrategy().equals(NO_OCR)) {
             return;
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
index 572087d..f92fdfd 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
@@ -37,6 +37,8 @@ import org.apache.pdfbox.util.Matrix;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.WriteOutContentHandler;
+
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 
@@ -114,11 +116,22 @@ class PDF2XHTML extends AbstractPDF2XHTML {
             }
         }
         if (pdf2XHTML.exceptions.size() > 0) {
+            tryWriteLimitReached(pdf2XHTML.exceptions);
             //throw the first
             throw new TikaException("Unable to extract PDF content", pdf2XHTML.exceptions.get(0));
         }
     }
 
+    private static void tryWriteLimitReached(List<IOException> exceptions) {
+        WriteOutContentHandler tmp = new WriteOutContentHandler();
+        for (IOException e : exceptions) {
+            if (tmp.isWriteLimitReached(e)) {
+
+            }
+
+        }
+    }
+
     @Override
     public void processPage(PDPage page) throws IOException {
         try {
diff --git a/tika-server/src/test/java/org/apache/tika/server/RecursiveMetadataResourceTest.java b/tika-server/src/test/java/org/apache/tika/server/RecursiveMetadataResourceTest.java
index 6a70d34..83f30fe 100644
--- a/tika-server/src/test/java/org/apache/tika/server/RecursiveMetadataResourceTest.java
+++ b/tika-server/src/test/java/org/apache/tika/server/RecursiveMetadataResourceTest.java
@@ -406,4 +406,19 @@ public class RecursiveMetadataResourceTest extends CXFTestBase {
                 metadata.get(AbstractRecursiveParserWrapperHandler.CONTAINER_EXCEPTION));
 
     }
+
+    @Test
+    public void testWriteLimitInPDF() throws Exception {
+        int writeLimit = 10;
+        Response response = WebClient.create(endPoint + META_PATH).accept("application/json")
+                .header("writeLimit", Integer.toString(writeLimit))
+                .put(ClassLoader.getSystemResourceAsStream("testPDFTwoTextBoxes.pdf"));
+
+        assertEquals(200, response.getStatus());
+        Reader reader = new InputStreamReader((InputStream) response.getEntity(), UTF_8);
+        List<Metadata> metadataList = JsonMetadataList.fromJson(reader);
+        Metadata metadata = metadataList.get(0);
+        assertEquals("true",
+                metadata.get(AbstractRecursiveParserWrapperHandler.WRITE_LIMIT_REACHED));
+    }
 }
diff --git a/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java b/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java
index 7511821..9e29892 100644
--- a/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java
+++ b/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java
@@ -17,6 +17,7 @@
 
 package org.apache.tika.server;
 
+import static java.nio.charset.StandardCharsets.UTF_8;
 import static org.apache.cxf.helpers.HttpHeaderHelper.CONTENT_ENCODING;
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertFalse;
@@ -25,6 +26,7 @@ import static org.junit.Assert.assertTrue;
 import java.io.FileNotFoundException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
+import java.io.Reader;
 import java.net.URISyntaxException;
 import java.nio.charset.StandardCharsets;
 import java.util.ArrayList;
@@ -694,9 +696,26 @@ public class TikaResourceTest extends CXFTestBase {
         assertTrue(metadata.get(AbstractRecursiveParserWrapperHandler.CONTAINER_EXCEPTION).startsWith(
                 "org.apache.tika.sax.WriteOutContentHandler$WriteLimitReachedException"
         ));
+        assertEquals("true",
+                metadata.get(AbstractRecursiveParserWrapperHandler.WRITE_LIMIT_REACHED));
     }
 
     @Test
+    public void testWriteLimitInPDF() throws Exception {
+        int writeLimit = 10;
+        Response response = WebClient.create(endPoint + TIKA_PATH).accept("application/json")
+                .header("writeLimit", Integer.toString(writeLimit))
+                .put(ClassLoader.getSystemResourceAsStream("testPDFTwoTextBoxes.pdf"));
+
+        assertEquals(200, response.getStatus());
+        Reader reader = new InputStreamReader((InputStream) response.getEntity(), UTF_8);
+        Metadata metadata = JsonMetadata.fromJson(reader);
+        assertEquals("true",
+                metadata.get(AbstractRecursiveParserWrapperHandler.WRITE_LIMIT_REACHED));
+
+    }
+
+        @Test
     public void testJsonHandlerType() throws Exception {
         Response response = WebClient.create(endPoint + TIKA_PATH)
                 .accept("application/json")