You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2021/04/30 18:51:16 UTC
[tika] branch branch_1x updated: TIKA-3372 -- fix write limit handling in the PDFParser
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/branch_1x by this push:
new 2f80958 TIKA-3372 -- fix write limit handling in the PDFParser
2f80958 is described below
commit 2f80958c5e2264dd4b9007f331d50d1012cc8fa0
Author: tballison <ta...@apache.org>
AuthorDate: Fri Apr 30 14:49:06 2021 -0400
TIKA-3372 -- fix write limit handling in the PDFParser
---
.../apache/tika/parser/RecursiveParserWrapper.java | 17 +++++++++++++---
.../apache/tika/parser/pdf/AbstractPDF2XHTML.java | 23 ++++++++++++++++++----
.../java/org/apache/tika/parser/pdf/PDF2XHTML.java | 13 ++++++++++++
.../tika/server/RecursiveMetadataResourceTest.java | 15 ++++++++++++++
.../org/apache/tika/server/TikaResourceTest.java | 19 ++++++++++++++++++
5 files changed, 80 insertions(+), 7 deletions(-)
diff --git a/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java b/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
index 95899a6..6992231 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
@@ -493,7 +493,7 @@ public class RecursiveParserWrapper extends ParserDecorator {
super.characters(ch, start, availableLength);
totalChars += availableLength;
if (availableLength < length) {
- throw new WriteLimitReached();
+ throw new WriteLimitReached(totalWriteLimit);
}
}
@@ -507,13 +507,24 @@ public class RecursiveParserWrapper extends ParserDecorator {
super.ignorableWhitespace(ch, start, availableLength);
if (availableLength < length) {
- throw new WriteLimitReached();
+ throw new WriteLimitReached(totalWriteLimit);
}
totalChars += availableLength;
}
}
private static class WriteLimitReached extends SAXException {
-
+ final int writeLimit;
+ WriteLimitReached(int writeLimit) {
+ this.writeLimit = writeLimit;
+ }
+ @Override
+ public String getMessage() {
+ return "Your document contained more than " + writeLimit
+ + " characters, and so your requested limit has been"
+ + " reached. To receive the full text of the document,"
+ + " increase your limit. (Text up to the limit is"
+ + " however available).";
+ }
}
}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
index f930c61..2cc610e 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
@@ -99,6 +99,7 @@ import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.RecursiveParserWrapper;
import org.apache.tika.parser.ocr.TesseractOCRConfig;
import org.apache.tika.parser.ocr.TesseractOCRParser;
import org.apache.tika.sax.EmbeddedContentHandler;
@@ -425,10 +426,8 @@ class AbstractPDF2XHTML extends PDFTextStripper {
void handleCatchableIOE(IOException e) throws IOException {
if (config.getCatchIntermediateIOExceptions()) {
- if (e.getCause() instanceof SAXException && e.getCause().getMessage() != null &&
- e.getCause().getMessage().contains("Your document contained more than")) {
- //TODO -- is there a cleaner way of checking for:
- // WriteOutContentHandler.WriteLimitReachedException?
+
+ if (isWriteLimitReached(e, 0)) {
throw e;
}
@@ -443,6 +442,22 @@ class AbstractPDF2XHTML extends PDFTextStripper {
}
}
+ boolean isWriteLimitReached(Throwable t, int depth) {
+ if (depth > MAX_RECURSION_DEPTH) {
+ return false;
+ }
+ if (t == null) {
+ return false;
+ }
+ if (t instanceof SAXException) {
+
+ String msg = t.getMessage();
+ if (msg != null && msg.contains("Your document contained more than")) {
+ return true;
+ }
+ }
+ return isWriteLimitReached(t.getCause(), depth + 1);
+ }
void doOCROnCurrentPage() throws IOException, TikaException, SAXException {
if (config.getOcrStrategy().equals(NO_OCR)) {
return;
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
index 572087d..f92fdfd 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
@@ -37,6 +37,8 @@ import org.apache.pdfbox.util.Matrix;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.WriteOutContentHandler;
+
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
@@ -114,11 +116,22 @@ class PDF2XHTML extends AbstractPDF2XHTML {
}
}
if (pdf2XHTML.exceptions.size() > 0) {
+ tryWriteLimitReached(pdf2XHTML.exceptions);
//throw the first
throw new TikaException("Unable to extract PDF content", pdf2XHTML.exceptions.get(0));
}
}
+ private static void tryWriteLimitReached(List<IOException> exceptions) {
+ WriteOutContentHandler tmp = new WriteOutContentHandler();
+ for (IOException e : exceptions) {
+ if (tmp.isWriteLimitReached(e)) {
+
+ }
+
+ }
+ }
+
@Override
public void processPage(PDPage page) throws IOException {
try {
diff --git a/tika-server/src/test/java/org/apache/tika/server/RecursiveMetadataResourceTest.java b/tika-server/src/test/java/org/apache/tika/server/RecursiveMetadataResourceTest.java
index 6a70d34..83f30fe 100644
--- a/tika-server/src/test/java/org/apache/tika/server/RecursiveMetadataResourceTest.java
+++ b/tika-server/src/test/java/org/apache/tika/server/RecursiveMetadataResourceTest.java
@@ -406,4 +406,19 @@ public class RecursiveMetadataResourceTest extends CXFTestBase {
metadata.get(AbstractRecursiveParserWrapperHandler.CONTAINER_EXCEPTION));
}
+
+ @Test
+ public void testWriteLimitInPDF() throws Exception {
+ int writeLimit = 10;
+ Response response = WebClient.create(endPoint + META_PATH).accept("application/json")
+ .header("writeLimit", Integer.toString(writeLimit))
+ .put(ClassLoader.getSystemResourceAsStream("testPDFTwoTextBoxes.pdf"));
+
+ assertEquals(200, response.getStatus());
+ Reader reader = new InputStreamReader((InputStream) response.getEntity(), UTF_8);
+ List<Metadata> metadataList = JsonMetadataList.fromJson(reader);
+ Metadata metadata = metadataList.get(0);
+ assertEquals("true",
+ metadata.get(AbstractRecursiveParserWrapperHandler.WRITE_LIMIT_REACHED));
+ }
}
diff --git a/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java b/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java
index 7511821..9e29892 100644
--- a/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java
+++ b/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java
@@ -17,6 +17,7 @@
package org.apache.tika.server;
+import static java.nio.charset.StandardCharsets.UTF_8;
import static org.apache.cxf.helpers.HttpHeaderHelper.CONTENT_ENCODING;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
@@ -25,6 +26,7 @@ import static org.junit.Assert.assertTrue;
import java.io.FileNotFoundException;
import java.io.InputStream;
import java.io.InputStreamReader;
+import java.io.Reader;
import java.net.URISyntaxException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
@@ -694,9 +696,26 @@ public class TikaResourceTest extends CXFTestBase {
assertTrue(metadata.get(AbstractRecursiveParserWrapperHandler.CONTAINER_EXCEPTION).startsWith(
"org.apache.tika.sax.WriteOutContentHandler$WriteLimitReachedException"
));
+ assertEquals("true",
+ metadata.get(AbstractRecursiveParserWrapperHandler.WRITE_LIMIT_REACHED));
}
@Test
+ public void testWriteLimitInPDF() throws Exception {
+ int writeLimit = 10;
+ Response response = WebClient.create(endPoint + TIKA_PATH).accept("application/json")
+ .header("writeLimit", Integer.toString(writeLimit))
+ .put(ClassLoader.getSystemResourceAsStream("testPDFTwoTextBoxes.pdf"));
+
+ assertEquals(200, response.getStatus());
+ Reader reader = new InputStreamReader((InputStream) response.getEntity(), UTF_8);
+ Metadata metadata = JsonMetadata.fromJson(reader);
+ assertEquals("true",
+ metadata.get(AbstractRecursiveParserWrapperHandler.WRITE_LIMIT_REACHED));
+
+ }
+
+ @Test
public void testJsonHandlerType() throws Exception {
Response response = WebClient.create(endPoint + TIKA_PATH)
.accept("application/json")