You are viewing a plain text version of this content. The canonical link for it was not preserved in this plain-text copy.
Posted to commits@tika.apache.org by ta...@apache.org on 2023/06/08 14:42:38 UTC
[tika] branch main updated: TIKA-4039 (#1181)
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 2d9daef85 TIKA-4039 (#1181)
2d9daef85 is described below
commit 2d9daef859296cad877caf29ad7765c0709472d0
Author: Tim Allison <ta...@apache.org>
AuthorDate: Thu Jun 8 10:42:31 2023 -0400
TIKA-4039 (#1181)
* TIKA-4039 -- allow users to bump attachment limit in UnpackerResource
---
CHANGES.txt | 2 +
.../server/core/resource/UnpackerResource.java | 52 +++++++++++++++-------
.../tika/server/standard/UnpackerResourceTest.java | 9 ++++
3 files changed, 48 insertions(+), 15 deletions(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index 732de46c4..abb7d94c9 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -3,6 +3,8 @@ Release 2.8.1 - ???
* Changed default decompressConcatenated to true in CompressorParser.
Users may revert to legacy behavior via tika-config.xml (TIKA-4048).
+ * Allow users to modify the attachment limit size in the /unpack resource (TIKA-4039)
+
* Fixed write limit bug in RecursiveParserWrapper (TIKA-4055).
* Add mime detection for many files (TIKA-3992).
diff --git a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/UnpackerResource.java b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/UnpackerResource.java
index 837dcdebd..6f4bf93c0 100644
--- a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/UnpackerResource.java
+++ b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/UnpackerResource.java
@@ -21,8 +21,6 @@ import static java.nio.charset.StandardCharsets.UTF_8;
import static org.apache.tika.server.core.resource.TikaResource.fillMetadata;
import static org.apache.tika.server.core.resource.TikaResource.fillParseContext;
-import java.io.ByteArrayInputStream;
-import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
@@ -45,6 +43,9 @@ import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVPrinter;
import org.apache.commons.io.FilenameUtils;
import org.apache.commons.io.IOUtils;
+import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream;
+import org.apache.commons.io.output.UnsynchronizedByteArrayOutputStream;
+import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.mutable.MutableInt;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -71,7 +72,9 @@ public class UnpackerResource {
public static final String TEXT_FILENAME = "__TEXT__";
public static final String META_FILENAME = "__METADATA__";
- private static final long MAX_ATTACHMENT_BYTES = 100 * 1024 * 1024;
+ public static final String UNPACK_MAX_BYTES_KEY = "unpackMaxBytes";
+
+ private static final long DEFAULT_MAX_ATTACHMENT_BYTES = 100 * 1024 * 1024;
private static final Logger LOG = LoggerFactory.getLogger(UnpackerResource.class);
@@ -115,7 +118,17 @@ public class UnpackerResource {
@Context UriInfo info, boolean saveAll) throws Exception {
Metadata metadata = new Metadata();
ParseContext pc = new ParseContext();
-
+ long unpackMaxBytes = DEFAULT_MAX_ATTACHMENT_BYTES;
+ String unpackMaxBytesString = httpHeaders.getRequestHeaders().getFirst(UNPACK_MAX_BYTES_KEY);
+ if (!StringUtils.isBlank(unpackMaxBytesString)) {
+ unpackMaxBytes = Long.parseLong(unpackMaxBytesString);
+ if (unpackMaxBytes > Integer.MAX_VALUE) {
+ throw new IllegalArgumentException("Can't request value > than Integer" +
+ ".MAX_VALUE : " + unpackMaxBytes);
+ } else if (unpackMaxBytes < 0) {
+ throw new IllegalArgumentException("Can't request value < 0: " + unpackMaxBytes);
+ }
+ }
Parser parser = TikaResource.createParser();
if (parser instanceof DigestingParser) {
//no need to digest for unwrapping
@@ -129,7 +142,7 @@ public class UnpackerResource {
//we need to add this to allow for "inline" use of other parsers.
pc.set(Parser.class, parser);
ContentHandler ch;
- ByteArrayOutputStream text = new ByteArrayOutputStream();
+ UnsynchronizedByteArrayOutputStream text = new UnsynchronizedByteArrayOutputStream();
if (saveAll) {
ch = new BodyContentHandler(
@@ -141,7 +154,8 @@ public class UnpackerResource {
Map<String, byte[]> files = new HashMap<>();
MutableInt count = new MutableInt();
- pc.set(EmbeddedDocumentExtractor.class, new MyEmbeddedDocumentExtractor(count, files));
+ pc.set(EmbeddedDocumentExtractor.class, new MyEmbeddedDocumentExtractor(count, files, unpackMaxBytes));
+
TikaResource.parse(parser, LOG, info.getPath(), is, ch, metadata, pc);
if (count.intValue() == 0 && !saveAll) {
@@ -151,7 +165,7 @@ public class UnpackerResource {
if (saveAll) {
files.put(TEXT_FILENAME, text.toByteArray());
- ByteArrayOutputStream metaStream = new ByteArrayOutputStream();
+ UnsynchronizedByteArrayOutputStream metaStream = new UnsynchronizedByteArrayOutputStream();
metadataToCsv(metadata, metaStream);
files.put(META_FILENAME, metaStream.toByteArray());
@@ -163,12 +177,15 @@ public class UnpackerResource {
private static class MyEmbeddedDocumentExtractor implements EmbeddedDocumentExtractor {
private final MutableInt count;
private final Map<String, byte[]> zout;
+
+ private final long unpackMaxBytes;
private final EmbeddedStreamTranslator embeddedStreamTranslator =
new DefaultEmbeddedStreamTranslator();
- MyEmbeddedDocumentExtractor(MutableInt count, Map<String, byte[]> zout) {
+ MyEmbeddedDocumentExtractor(MutableInt count, Map<String, byte[]> zout, long unpackMaxBytes) {
this.count = count;
this.zout = zout;
+ this.unpackMaxBytes = unpackMaxBytes;
}
public boolean shouldParseEmbedded(Metadata metadata) {
@@ -177,12 +194,17 @@ public class UnpackerResource {
public void parseEmbedded(InputStream inputStream, ContentHandler contentHandler,
Metadata metadata, boolean b) throws SAXException, IOException {
- ByteArrayOutputStream bos = new ByteArrayOutputStream();
- BoundedInputStream bis = new BoundedInputStream(MAX_ATTACHMENT_BYTES, inputStream);
+ UnsynchronizedByteArrayOutputStream bos = new UnsynchronizedByteArrayOutputStream();
+
+ BoundedInputStream bis = new BoundedInputStream(unpackMaxBytes, inputStream);
IOUtils.copy(bis, bos);
if (bis.hasHitBound()) {
- throw new IOException(new TikaMemoryLimitException(MAX_ATTACHMENT_BYTES + 1,
- MAX_ATTACHMENT_BYTES));
+ throw new IOException(
+ new TikaMemoryLimitException("An attachment is longer than " +
+ "'unpackMaxBytes' (default=100MB, actual=" + unpackMaxBytes + "). " +
+ "If you need to increase this " +
+ "limit, add a header to your request, such as: unpackMaxBytes: " +
+ "1073741824. There is a hard limit of 2GB."));
}
byte[] data = bos.toByteArray();
@@ -205,11 +227,11 @@ public class UnpackerResource {
LOG.warn("Unexpected MimeTypeException", e);
}
}
- try (InputStream is = new ByteArrayInputStream(data)) {
+ try (InputStream is = new UnsynchronizedByteArrayInputStream(data)) {
if (embeddedStreamTranslator.shouldTranslate(is, metadata)) {
InputStream translated = embeddedStreamTranslator
- .translate(new ByteArrayInputStream(data), metadata);
- ByteArrayOutputStream bos2 = new ByteArrayOutputStream();
+ .translate(new UnsynchronizedByteArrayInputStream(data), metadata);
+ UnsynchronizedByteArrayOutputStream bos2 = new UnsynchronizedByteArrayOutputStream();
IOUtils.copy(translated, bos2);
data = bos2.toByteArray();
}
diff --git a/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/UnpackerResourceTest.java b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/UnpackerResourceTest.java
index cea91fb9b..82e3ba482 100644
--- a/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/UnpackerResourceTest.java
+++ b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/UnpackerResourceTest.java
@@ -211,6 +211,15 @@ public class UnpackerResourceTest extends CXFTestBase {
assertContains("dc:creator,Maxim Valyanskiy", responseMsg);
}
+ @Test
+ public void testMaxBytes() throws Exception {
+ Response response = WebClient.create(CXFTestBase.endPoint + ALL_PATH)
+ .header(CONTENT_TYPE, APPLICATION_XML)
+ .header(UnpackerResource.UNPACK_MAX_BYTES_KEY, 100).accept("application/zip")
+ .put(ClassLoader.getSystemResourceAsStream("test-documents/pic.xls"));
+ assertEquals(422, response.getStatus());
+ }
+
@Test
public void testPDFImages() throws Exception {
Response response = WebClient.create(CXFTestBase.endPoint + UNPACKER_PATH)