You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2022/11/07 20:42:42 UTC
[tika] branch main updated: TIKA-3919 -- Use BoundedInputStream instead of LookaheadInputStream in TikaInputStream and in StreamingZipContainerDetector to improve memory usage.
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new b2e5df17c TIKA-3919 -- Use BoundedInputStream instead of LookaheadInputStream in TikaInputStream and in StreamingZipContainerDetector to improve memory usage.
b2e5df17c is described below
commit b2e5df17cbf02e73733c0985ae47c6fc63af0a68
Author: tballison <ta...@apache.org>
AuthorDate: Mon Nov 7 15:42:31 2022 -0500
TIKA-3919 -- Use BoundedInputStream instead of LookaheadInputStream in TikaInputStream and in StreamingZipContainerDetector to improve memory usage.
---
.../src/main/java/org/apache/tika/io/TikaInputStream.java | 10 +++++++---
.../apache/tika/detect/zip/StreamingZipContainerDetector.java | 8 +++++---
2 files changed, 12 insertions(+), 6 deletions(-)
diff --git a/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java b/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java
index 76db4a2a3..62daabcc0 100644
--- a/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java
+++ b/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java
@@ -710,12 +710,16 @@ public class TikaInputStream extends TaggedInputStream {
} else {
Path tmpFile = tmp.createTempFile(suffix);
if (maxBytes > -1) {
- try (InputStream lookAhead = new LookaheadInputStream(this, maxBytes)) {
- Files.copy(lookAhead, tmpFile, REPLACE_EXISTING);
- if (Files.size(tmpFile) >= maxBytes) {
+ this.mark(maxBytes);
+ try (BoundedInputStream boundedInputStream =
+ new BoundedInputStream(maxBytes, this)) {
+ Files.copy(boundedInputStream, tmpFile, REPLACE_EXISTING);
+ if (boundedInputStream.hasHitBound()) {
//tmpFile will be cleaned up when this TikaInputStream is closed
return null;
}
+ } finally {
+ this.reset();
}
} else {
// Spool the entire stream into a temporary file
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/StreamingZipContainerDetector.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/StreamingZipContainerDetector.java
index c96e01323..b4a8c6771 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/StreamingZipContainerDetector.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/StreamingZipContainerDetector.java
@@ -25,7 +25,7 @@ import org.apache.commons.io.IOUtils;
import org.apache.tika.config.LoadErrorHandler;
import org.apache.tika.config.ServiceLoader;
-import org.apache.tika.io.LookaheadInputStream;
+import org.apache.tika.io.BoundedInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
@@ -80,9 +80,11 @@ public class StreamingZipContainerDetector extends DefaultZipContainerDetector {
if (type == TIFF) {
return TIFF;
} else if (isZipArchive(type)) {
-
- try (LookaheadInputStream lookahead = new LookaheadInputStream(input, markLimit)) {
+ input.mark(markLimit);
+ try (BoundedInputStream lookahead = new BoundedInputStream(markLimit, input)) {
return detectStreaming(lookahead, metadata);
+ } finally {
+ input.reset();
}
} else if (!type.equals(MediaType.OCTET_STREAM)) {
return type;