You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2022/11/07 20:42:42 UTC

[tika] branch main updated: TIKA-3919 -- Use BoundedInputStream instead of LookaheadInputStream in TikaInputStream and in StreamingZipContainerDetector to improve memory usage.

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new b2e5df17c TIKA-3919 -- Use BoundedInputStream instead of LookaheadInputStream in TikaInputStream and in StreamingZipContainerDetector to improve memory usage.
b2e5df17c is described below

commit b2e5df17cbf02e73733c0985ae47c6fc63af0a68
Author: tballison <ta...@apache.org>
AuthorDate: Mon Nov 7 15:42:31 2022 -0500

    TIKA-3919 -- Use BoundedInputStream instead of LookaheadInputStream in TikaInputStream and in StreamingZipContainerDetector to improve memory usage.
---
 .../src/main/java/org/apache/tika/io/TikaInputStream.java      | 10 +++++++---
 .../apache/tika/detect/zip/StreamingZipContainerDetector.java  |  8 +++++---
 2 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java b/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java
index 76db4a2a3..62daabcc0 100644
--- a/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java
+++ b/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java
@@ -710,12 +710,16 @@ public class TikaInputStream extends TaggedInputStream {
             } else {
                 Path tmpFile = tmp.createTempFile(suffix);
                 if (maxBytes > -1) {
-                    try (InputStream lookAhead = new LookaheadInputStream(this, maxBytes)) {
-                        Files.copy(lookAhead, tmpFile, REPLACE_EXISTING);
-                        if (Files.size(tmpFile) >= maxBytes) {
+                    this.mark(maxBytes);
+                    try (BoundedInputStream boundedInputStream =
+                                 new BoundedInputStream(maxBytes, this)) {
+                        Files.copy(boundedInputStream, tmpFile, REPLACE_EXISTING);
+                        if (boundedInputStream.hasHitBound()) {
                             //tmpFile will be cleaned up when this TikaInputStream is closed
                             return null;
                         }
+                    } finally {
+                        this.reset();
                     }
                 } else {
                     // Spool the entire stream into a temporary file
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/StreamingZipContainerDetector.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/StreamingZipContainerDetector.java
index c96e01323..b4a8c6771 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/StreamingZipContainerDetector.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-zip-commons/src/main/java/org/apache/tika/detect/zip/StreamingZipContainerDetector.java
@@ -25,7 +25,7 @@ import org.apache.commons.io.IOUtils;
 
 import org.apache.tika.config.LoadErrorHandler;
 import org.apache.tika.config.ServiceLoader;
-import org.apache.tika.io.LookaheadInputStream;
+import org.apache.tika.io.BoundedInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
 
@@ -80,9 +80,11 @@ public class StreamingZipContainerDetector extends DefaultZipContainerDetector {
         if (type == TIFF) {
             return TIFF;
         } else if (isZipArchive(type)) {
-
-            try (LookaheadInputStream lookahead = new LookaheadInputStream(input, markLimit)) {
+            input.mark(markLimit);
+            try (BoundedInputStream lookahead = new BoundedInputStream(markLimit, input)) {
                 return detectStreaming(lookahead, metadata);
+            } finally {
+                input.reset();
             }
         } else if (!type.equals(MediaType.OCTET_STREAM)) {
             return type;