You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2021/04/30 21:39:50 UTC
[tika] branch main updated: TIKA-3372 -- fix writelimit in PDFs
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 7ea14be TIKA-3372 -- fix writelimit in PDFs
7ea14be is described below
commit 7ea14beb129e5a2ddfeff5ae14c7ef88de9f3875
Author: tballison <ta...@apache.org>
AuthorDate: Fri Apr 30 17:39:34 2021 -0400
TIKA-3372 -- fix writelimit in PDFs
---
.../apache/tika/parser/pdf/AbstractPDF2XHTML.java | 13 ++++++-----
.../org/apache/tika/parser/pdf/PDFParserTest.java | 27 ++++++++++++++++++++++
.../classic/RecursiveMetadataResourceTest.java | 18 +++++++++++++++
.../tika/server/classic/TikaResourceTest.java | 20 ++++++++++++++++
4 files changed, 72 insertions(+), 6 deletions(-)
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
index 5a6d9ab..75d43d2 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
@@ -95,6 +95,7 @@ import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;
import org.apache.tika.exception.TikaException;
+import org.apache.tika.exception.WriteLimitReachedException;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.io.TemporaryResources;
@@ -426,13 +427,13 @@ class AbstractPDF2XHTML extends PDFTextStripper {
}
void handleCatchableIOE(IOException e) throws IOException {
+
+ if (WriteLimitReachedException.isWriteLimitReached(e)) {
+ metadata.set(TikaCoreProperties.WRITE_LIMIT_REACHED, "true");
+ throw e;
+ }
+
if (config.isCatchIntermediateIOExceptions()) {
- if (e.getCause() instanceof SAXException && e.getCause().getMessage() != null &&
- e.getCause().getMessage().contains("Your document contained more than")) {
- //TODO -- is there a cleaner way of checking for:
- // WriteOutContentHandler.WriteLimitReachedException?
- throw e;
- }
String msg = e.getMessage();
if (msg == null) {
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index 7a9ddf5..57659d5 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -1354,4 +1354,31 @@ public class PDFParserTest extends TikaTest {
assertEquals("15036", metadata.get(1).get(Metadata.CONTENT_LENGTH));
}
+ /**
+ @Test
+ public void testWriteLimit() throws Exception {
+ for (int i = 0; i < 10000; i += 13) {
+ Metadata metadata = testWriteLimit("testPDF_childAttachments.pdf", i);
+ assertEquals("true", metadata.get(TikaCoreProperties.WRITE_LIMIT_REACHED));
+ int len = metadata.get(TikaCoreProperties.TIKA_CONTENT).length();
+ System.out.println(len + " : " + i);
+ assertTrue(len <= i);
+ }
+ }
+
+ private Metadata testWriteLimit(String fileName, int limit) throws Exception {
+ BasicContentHandlerFactory factory = new BasicContentHandlerFactory(
+ BasicContentHandlerFactory.HANDLER_TYPE.TEXT, limit
+ );
+ ContentHandler contentHandler = factory.getNewContentHandler();
+ Metadata metadata = new Metadata();
+ ParseContext parseContext = new ParseContext();
+ try (InputStream is = getResourceAsStream("/test-documents/" + fileName)) {
+ AUTO_DETECT_PARSER.parse(is, contentHandler, metadata, parseContext);
+ } catch (WriteLimitReachedException e) {
+ //e.printStackTrace();
+ }
+ metadata.set(TikaCoreProperties.TIKA_CONTENT, contentHandler.toString());
+ return metadata;
+ }*/
}
diff --git a/tika-server/tika-server-classic/src/test/java/org/apache/tika/server/classic/RecursiveMetadataResourceTest.java b/tika-server/tika-server-classic/src/test/java/org/apache/tika/server/classic/RecursiveMetadataResourceTest.java
index fc09f4d..626e0ef 100644
--- a/tika-server/tika-server-classic/src/test/java/org/apache/tika/server/classic/RecursiveMetadataResourceTest.java
+++ b/tika-server/tika-server-classic/src/test/java/org/apache/tika/server/classic/RecursiveMetadataResourceTest.java
@@ -39,11 +39,13 @@ import org.apache.cxf.jaxrs.ext.multipart.Attachment;
import org.apache.cxf.jaxrs.lifecycle.SingletonResourceProvider;
import org.junit.Test;
+import org.apache.tika.Tika;
import org.apache.tika.TikaTest;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.OfficeOpenXMLExtended;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.metadata.serialization.JsonMetadataList;
+import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler;
import org.apache.tika.server.core.CXFTestBase;
import org.apache.tika.server.core.resource.RecursiveMetadataResource;
import org.apache.tika.server.core.writer.MetadataListMessageBodyWriter;
@@ -364,4 +366,20 @@ public class RecursiveMetadataResourceTest extends CXFTestBase {
}
+ @Test
+ public void testWriteLimitInPDF() throws Exception {
+ int writeLimit = 10;
+ Response response = WebClient.create(endPoint + META_PATH).accept("application/json")
+ .header("writeLimit", Integer.toString(writeLimit))
+ .put(ClassLoader.getSystemResourceAsStream("test-documents/testPDFTwoTextBoxes" +
+ ".pdf"));
+
+ assertEquals(200, response.getStatus());
+ Reader reader = new InputStreamReader((InputStream) response.getEntity(), UTF_8);
+ List<Metadata> metadataList = JsonMetadataList.fromJson(reader);
+ Metadata metadata = metadataList.get(0);
+ assertEquals("true",
+ metadata.get(TikaCoreProperties.WRITE_LIMIT_REACHED));
+ }
+
}
diff --git a/tika-server/tika-server-classic/src/test/java/org/apache/tika/server/classic/TikaResourceTest.java b/tika-server/tika-server-classic/src/test/java/org/apache/tika/server/classic/TikaResourceTest.java
index a8bb8d3..53c801e 100644
--- a/tika-server/tika-server-classic/src/test/java/org/apache/tika/server/classic/TikaResourceTest.java
+++ b/tika-server/tika-server-classic/src/test/java/org/apache/tika/server/classic/TikaResourceTest.java
@@ -17,13 +17,16 @@
package org.apache.tika.server.classic;
+import static java.nio.charset.StandardCharsets.UTF_8;
import static org.apache.cxf.helpers.HttpHeaderHelper.CONTENT_ENCODING;
+import static org.apache.tika.TikaTest.debug;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
import java.io.InputStream;
import java.io.InputStreamReader;
+import java.io.Reader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
@@ -48,6 +51,7 @@ import org.apache.tika.metadata.OfficeOpenXMLExtended;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.metadata.serialization.JsonMetadata;
import org.apache.tika.parser.ocr.TesseractOCRParser;
+import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler;
import org.apache.tika.server.classic.config.PDFServerConfig;
import org.apache.tika.server.classic.config.TesseractServerConfig;
import org.apache.tika.server.core.CXFTestBase;
@@ -602,4 +606,20 @@ public class TikaResourceTest extends CXFTestBase {
assertNotFound("embed4.txt", metadata.get(TikaCoreProperties.TIKA_CONTENT));
}
+
+ @Test
+ public void testWriteLimitInPDF() throws Exception {
+ int writeLimit = 10;
+ Response response = WebClient.create(endPoint + TIKA_PATH).accept("application/json")
+ .header("writeLimit", Integer.toString(writeLimit))
+ .put(ClassLoader.getSystemResourceAsStream(
+ "test-documents/testPDFTwoTextBoxes.pdf"));
+
+ assertEquals(200, response.getStatus());
+ Reader reader = new InputStreamReader((InputStream) response.getEntity(), UTF_8);
+ Metadata metadata = JsonMetadata.fromJson(reader);
+ assertEquals("true",
+ metadata.get(TikaCoreProperties.WRITE_LIMIT_REACHED));
+
+ }
}