You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2021/04/30 21:39:50 UTC

[tika] branch main updated: TIKA-3372 -- fix writelimit in PDFs

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new 7ea14be  TIKA-3372 -- fix writelimit in PDFs
7ea14be is described below

commit 7ea14beb129e5a2ddfeff5ae14c7ef88de9f3875
Author: tballison <ta...@apache.org>
AuthorDate: Fri Apr 30 17:39:34 2021 -0400

    TIKA-3372 -- fix writelimit in PDFs
---
 .../apache/tika/parser/pdf/AbstractPDF2XHTML.java  | 13 ++++++-----
 .../org/apache/tika/parser/pdf/PDFParserTest.java  | 27 ++++++++++++++++++++++
 .../classic/RecursiveMetadataResourceTest.java     | 18 +++++++++++++++
 .../tika/server/classic/TikaResourceTest.java      | 20 ++++++++++++++++
 4 files changed, 72 insertions(+), 6 deletions(-)

diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
index 5a6d9ab..75d43d2 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
@@ -95,6 +95,7 @@ import org.xml.sax.SAXException;
 import org.xml.sax.helpers.AttributesImpl;
 
 import org.apache.tika.exception.TikaException;
+import org.apache.tika.exception.WriteLimitReachedException;
 import org.apache.tika.extractor.EmbeddedDocumentExtractor;
 import org.apache.tika.extractor.EmbeddedDocumentUtil;
 import org.apache.tika.io.TemporaryResources;
@@ -426,13 +427,13 @@ class AbstractPDF2XHTML extends PDFTextStripper {
     }
 
     void handleCatchableIOE(IOException e) throws IOException {
+
+        if (WriteLimitReachedException.isWriteLimitReached(e)) {
+            metadata.set(TikaCoreProperties.WRITE_LIMIT_REACHED, "true");
+            throw e;
+        }
+
         if (config.isCatchIntermediateIOExceptions()) {
-            if (e.getCause() instanceof SAXException && e.getCause().getMessage() != null &&
-                    e.getCause().getMessage().contains("Your document contained more than")) {
-                //TODO -- is there a cleaner way of checking for:
-                // WriteOutContentHandler.WriteLimitReachedException?
-                throw e;
-            }
 
             String msg = e.getMessage();
             if (msg == null) {
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index 7a9ddf5..57659d5 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -1354,4 +1354,31 @@ public class PDFParserTest extends TikaTest {
         assertEquals("15036", metadata.get(1).get(Metadata.CONTENT_LENGTH));
     }
 
+    /* NOTE(review): disabled test left commented out below; re-enable it or delete it.
+    @Test
+    public void testWriteLimit() throws Exception {
+        for (int i = 0; i < 10000; i += 13) {
+            Metadata metadata = testWriteLimit("testPDF_childAttachments.pdf", i);
+            assertEquals("true", metadata.get(TikaCoreProperties.WRITE_LIMIT_REACHED));
+            int len = metadata.get(TikaCoreProperties.TIKA_CONTENT).length();
+            System.out.println(len + " : " + i);
+            assertTrue(len <= i);
+        }
+    }
+
+    private Metadata testWriteLimit(String fileName, int limit) throws Exception {
+        BasicContentHandlerFactory factory = new BasicContentHandlerFactory(
+                BasicContentHandlerFactory.HANDLER_TYPE.TEXT, limit
+        );
+        ContentHandler contentHandler = factory.getNewContentHandler();
+        Metadata metadata = new Metadata();
+        ParseContext parseContext = new ParseContext();
+        try (InputStream is = getResourceAsStream("/test-documents/" + fileName)) {
+            AUTO_DETECT_PARSER.parse(is, contentHandler, metadata, parseContext);
+        } catch (WriteLimitReachedException e) {
+            //e.printStackTrace();
+        }
+        metadata.set(TikaCoreProperties.TIKA_CONTENT, contentHandler.toString());
+        return metadata;
+    }*/
 }
diff --git a/tika-server/tika-server-classic/src/test/java/org/apache/tika/server/classic/RecursiveMetadataResourceTest.java b/tika-server/tika-server-classic/src/test/java/org/apache/tika/server/classic/RecursiveMetadataResourceTest.java
index fc09f4d..626e0ef 100644
--- a/tika-server/tika-server-classic/src/test/java/org/apache/tika/server/classic/RecursiveMetadataResourceTest.java
+++ b/tika-server/tika-server-classic/src/test/java/org/apache/tika/server/classic/RecursiveMetadataResourceTest.java
@@ -39,11 +39,13 @@ import org.apache.cxf.jaxrs.ext.multipart.Attachment;
 import org.apache.cxf.jaxrs.lifecycle.SingletonResourceProvider;
 import org.junit.Test;
 
+import org.apache.tika.Tika;
 import org.apache.tika.TikaTest;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.OfficeOpenXMLExtended;
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.metadata.serialization.JsonMetadataList;
+import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler;
 import org.apache.tika.server.core.CXFTestBase;
 import org.apache.tika.server.core.resource.RecursiveMetadataResource;
 import org.apache.tika.server.core.writer.MetadataListMessageBodyWriter;
@@ -364,4 +366,20 @@ public class RecursiveMetadataResourceTest extends CXFTestBase {
 
     }
 
+    @Test
+    public void testWriteLimitInPDF() throws Exception {
+        // Send writeLimit=10 as a request header along with the PDF body.
+        String limitHeader = Integer.toString(10);
+        InputStream pdf =
+                ClassLoader.getSystemResourceAsStream("test-documents/testPDFTwoTextBoxes.pdf");
+        Response response = WebClient.create(endPoint + META_PATH).accept("application/json")
+                .header("writeLimit", limitHeader).put(pdf);
+        assertEquals(200, response.getStatus());
+
+        Reader json = new InputStreamReader((InputStream) response.getEntity(), UTF_8);
+        List<Metadata> parsed = JsonMetadataList.fromJson(json);
+        // The first returned metadata object must carry the write-limit flag.
+        assertEquals("true", parsed.get(0).get(TikaCoreProperties.WRITE_LIMIT_REACHED));
+    }
+
 }
diff --git a/tika-server/tika-server-classic/src/test/java/org/apache/tika/server/classic/TikaResourceTest.java b/tika-server/tika-server-classic/src/test/java/org/apache/tika/server/classic/TikaResourceTest.java
index a8bb8d3..53c801e 100644
--- a/tika-server/tika-server-classic/src/test/java/org/apache/tika/server/classic/TikaResourceTest.java
+++ b/tika-server/tika-server-classic/src/test/java/org/apache/tika/server/classic/TikaResourceTest.java
@@ -17,13 +17,16 @@
 
 package org.apache.tika.server.classic;
 
+import static java.nio.charset.StandardCharsets.UTF_8;
 import static org.apache.cxf.helpers.HttpHeaderHelper.CONTENT_ENCODING;
+import static org.apache.tika.TikaTest.debug;
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertFalse;
 import static org.junit.Assert.assertTrue;
 
 import java.io.InputStream;
 import java.io.InputStreamReader;
+import java.io.Reader;
 import java.nio.charset.StandardCharsets;
 import java.util.ArrayList;
 import java.util.List;
@@ -48,6 +51,7 @@ import org.apache.tika.metadata.OfficeOpenXMLExtended;
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.metadata.serialization.JsonMetadata;
 import org.apache.tika.parser.ocr.TesseractOCRParser;
+import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler;
 import org.apache.tika.server.classic.config.PDFServerConfig;
 import org.apache.tika.server.classic.config.TesseractServerConfig;
 import org.apache.tika.server.core.CXFTestBase;
@@ -602,4 +606,20 @@ public class TikaResourceTest extends CXFTestBase {
         assertNotFound("embed4.txt", metadata.get(TikaCoreProperties.TIKA_CONTENT));
 
     }
+
+    @Test
+    public void testWriteLimitInPDF() throws Exception {
+        final String pdfResource = "test-documents/testPDFTwoTextBoxes.pdf";
+        final int maxChars = 10;
+
+        WebClient client = WebClient.create(endPoint + TIKA_PATH).accept("application/json")
+                .header("writeLimit", Integer.toString(maxChars));
+        Response response = client.put(ClassLoader.getSystemResourceAsStream(pdfResource));
+        assertEquals(200, response.getStatus());
+
+        // Parse the JSON body and confirm the write-limit flag is set.
+        Reader body = new InputStreamReader((InputStream) response.getEntity(), UTF_8);
+        Metadata parsed = JsonMetadata.fromJson(body);
+        assertEquals("true", parsed.get(TikaCoreProperties.WRITE_LIMIT_REACHED));
+    }
 }