You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2019/04/19 17:45:58 UTC
[tika] branch TIKA-2849 updated: TIKA-2849 -- add similar treatment for POIFSContainerDetector thanks to Jukka!

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-2849
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/TIKA-2849 by this push:
     new d7d69f6  TIKA-2849 -- add similar treatment for POIFSContainerDetector thanks to Jukka!
d7d69f6 is described below

commit d7d69f66258a861affe7150cc42b874961fc986a
Author: TALLISON <ta...@apache.org>
AuthorDate: Fri Apr 19 13:45:44 2019 -0400

    TIKA-2849 -- add similar treatment for POIFSContainerDetector thanks
    to Jukka!
---
 CHANGES.txt                                        |  4 ++-
 .../java/org/apache/tika/io/TikaInputStream.java   | 35 ++++++++++++++++++++--
 .../parser/microsoft/POIFSContainerDetector.java   | 22 +++++++++++---
 .../tika/parser/pkg/ZipContainerDetector.java      | 32 ++++++--------------
 .../tika/parser/pkg/ZipContainerDetectorTest.java  | 15 ++++------
 5 files changed, 68 insertions(+), 40 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index 61d4c7c..af8fcd0 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -11,7 +11,9 @@ Release 1.21 - ????
    * The ZipContainerDetector's default behavior was changed to run
      streaming detection up to its markLimit.  Users can get the
      legacy behavior (spool-to-file/rely-on-underlying-file-in-TikaInputStream)
-     by setting "spoolToFile" to true and using a TikaInputStream (TIKA-2849).
+     by setting markLimit=-1. The POIFSContainerDetector requires an underlying file,
+     so it will try to spool the stream to disk. If the stream's length is >= markLimit,
+     it will not attempt detection; set markLimit to -1 for legacy behavior (TIKA-2849).
 
    * Upgrade PDFBox to 2.0.14 (TIKA-2834).
 
diff --git a/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java b/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java
index 96f922f..855ab28 100644
--- a/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java
+++ b/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java
@@ -623,14 +623,45 @@ public class TikaInputStream extends TaggedInputStream {
         return path != null;
     }
 
+
+    /**
+     * If the user created this TikaInputStream with a file,
+     * the original file will be returned.  If not, the entire stream
+     * will be spooled to a temporary file which will be deleted
+     * upon the close of this TikaInputStream
+     * @return
+     * @throws IOException
+     */
     public Path getPath() throws IOException {
+        return getPath(-1);
+    }
+
+    /**
+     *
+     * @param maxBytes if this is less than 0 and if an underlying file doesn't already exist,
+     *                 the full file will be spooled to disk
+     * @return the original path used in the initialization of this TikaInputStream,
+     * a temporary file if the stream was shorter than <code>maxBytes</code>, or <code>null</code>
+     * if the underlying stream was <code>maxBytes</code> or longer.
+     * @throws IOException
+     */
+    public Path getPath(int maxBytes) throws IOException {
         if (path == null) {
             if (position > 0) {
                 throw new IOException("Stream is already being read");
             } else {
-                // Spool the entire stream into a temporary file
                 path = tmp.createTempFile();
-                Files.copy(in, path, REPLACE_EXISTING);
+                if (maxBytes > -1) {
+                    try (InputStream lookAhead = new LookaheadInputStream(in, maxBytes)) {
+                        Files.copy(lookAhead, path, REPLACE_EXISTING);
+                        if (Files.size(path) >= maxBytes) {
+                            return null;
+                        }
+                    }
+                } else {
+                    // Spool the entire stream into a temporary file
+                    Files.copy(in, path, REPLACE_EXISTING);
+                }
 
                 // Create a new input stream and make sure it'll get closed
                 InputStream newStream = Files.newInputStream(path);
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java
index 6f32984..576cf52 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java
@@ -16,13 +16,13 @@
  */
 package org.apache.tika.parser.microsoft;
 
-import static org.apache.tika.mime.MediaType.OCTET_STREAM;
 import static org.apache.tika.mime.MediaType.application;
 
 import java.io.File;
 import java.io.IOException;
 import java.io.InputStream;
 import java.nio.charset.StandardCharsets;
+import java.nio.file.Path;
 import java.util.Collections;
 import java.util.HashSet;
 import java.util.Set;
@@ -35,7 +35,9 @@ import org.apache.poi.poifs.filesystem.DocumentInputStream;
 import org.apache.poi.poifs.filesystem.DocumentNode;
 import org.apache.poi.poifs.filesystem.Entry;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+import org.apache.tika.config.Field;
 import org.apache.tika.detect.Detector;
+import org.apache.tika.io.LookaheadInputStream;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
@@ -175,6 +177,13 @@ public class POIFSContainerDetector implements Detector {
      */
     private static final Pattern mppDataMatch = Pattern.compile("\\s\\s\\s\\d+");
 
+    @Field
+    private int markLimit = 16 * 1024 * 1024;
+
+    public void setMarkLimit(int markLimit) {
+        this.markLimit = markLimit;
+    }
+
     /**
      * Internal detection of the specific kind of OLE2 document, based on the
      * names of the top level streams within the file.
@@ -379,14 +388,19 @@ public class POIFSContainerDetector implements Detector {
         return false;
     }
 
-    private static Set<String> getTopLevelNames(TikaInputStream stream)
+    private Set<String> getTopLevelNames(TikaInputStream stream)
             throws IOException {
         // Force the document stream to a (possibly temporary) file
         // so we don't modify the current position of the stream
-        File file = stream.getFile();
+        Path file = stream.getPath(markLimit);
+
+        //if the stream was longer than markLimit, don't detect
+        if (file == null) {
+            return Collections.emptySet();
+        }
 
         try {
-            POIFSFileSystem fs = new POIFSFileSystem(file, true);
+            POIFSFileSystem fs = new POIFSFileSystem(file.toFile(), true);
 
             // Optimize a possible later parsing process by keeping
             // a reference to the already opened POI file system
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
index d9770f7..7007b7c6 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
@@ -46,6 +46,7 @@ import org.apache.poi.openxml4j.util.ZipFileZipEntrySource;
 import org.apache.tika.config.Field;
 import org.apache.tika.detect.Detector;
 import org.apache.tika.io.BoundedInputStream;
+import org.apache.tika.io.LookaheadInputStream;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
@@ -90,10 +91,7 @@ public class ZipContainerDetector implements Detector {
     //this has to be > 100,000 to handle some of the iworks files
     //in our unit tests
     @Field
-    int markLimit = 500_000;
-
-    @Field
-    boolean spoolToFile = false;
+    int markLimit = 16 * 1024 * 1024;
 
     public MediaType detect(InputStream input, Metadata metadata)
             throws IOException {
@@ -119,7 +117,7 @@ public class ZipContainerDetector implements Detector {
 
             if (TikaInputStream.isTikaInputStream(input)) {
                 TikaInputStream tis = TikaInputStream.cast(input);
-                if (spoolToFile) {
+                if (markLimit < 0) {
                     tis.getFile();
                 }
                 if (tis.hasFile()) {
@@ -127,12 +125,8 @@ public class ZipContainerDetector implements Detector {
                 }
             }
 
-            input.mark(markLimit);
-            try {
-                return StreamingZipContainerDetector.detect(
-                        new BoundedInputStream(markLimit, input));
-            } finally {
-                input.reset();
+            try (LookaheadInputStream lookahead = new LookaheadInputStream(input, markLimit)) {
+                return StreamingZipContainerDetector.detect(lookahead);
             }
         } else if (!type.equals(MediaType.OCTET_STREAM)) {
             return type;
@@ -142,6 +136,10 @@ public class ZipContainerDetector implements Detector {
     }
 
     /**
+     * If this is less than 0, the file will be spooled to disk,
+     * and detection will run on the full file.
+     * If this is 0 or greater, the {@link StreamingZipContainerDetector}
+     * will be called only up to the markLimit.
      *
      * @param markLimit mark limit for streaming detection
      */
@@ -149,18 +147,6 @@ public class ZipContainerDetector implements Detector {
         this.markLimit = markLimit;
     }
 
-    /**
-     * Before version 1.21, if a user passed in a {@link TikaInputStream},
-     * the entire stream would be spooled to a file for Zip detection.
-     * With Tika 1.21, the default is now <code>false</code>, which means
-     * that the {@link ZipContainerDetector} will try streaming detection
-     * up to the {@link ZipContainerDetector#markLimit} on all streams.
-     * To revert to the legacy (pre 1.21) behavior, set this to <code>true</code>
-     * @param spoolToFile
-     */
-    public void setSpoolToFile(boolean spoolToFile) {
-        this.spoolToFile = spoolToFile;
-    }
 
     private static MediaType detectCompressorFormat(byte[] prefix, int length) {
         try {
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZipContainerDetectorTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZipContainerDetectorTest.java
index f4f5059..8c92986 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZipContainerDetectorTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZipContainerDetectorTest.java
@@ -110,6 +110,7 @@ public class ZipContainerDetectorTest extends TikaTest {
         }
     }
 
+    @Ignore("for offline testing")
     @Test
     public void timeDetection() throws Exception {
         TikaConfig config = TikaConfig.getDefaultConfig();
@@ -129,19 +130,16 @@ public class ZipContainerDetectorTest extends TikaTest {
                     mediaTypeSet.add(mt);
                 }
                 nonTikaStream += System.currentTimeMillis()-start;
-            }
-            for (File z : zips) {
-                long start = System.currentTimeMillis();
+
+                start = System.currentTimeMillis();
                 try (InputStream is = TikaInputStream.get(
                         new BufferedInputStream(new FileInputStream(z)))) {
                     MediaType mt = detector.detect(is, new Metadata());
                     mediaTypeSet.add(mt);
                 }
                 tikaStream += System.currentTimeMillis()-start;
-            }
 
-            for (File z : zips) {
-                long start = System.currentTimeMillis();
+                start = System.currentTimeMillis();
                 try (InputStream is = TikaInputStream.get(z)) {
                     MediaType mt = detector.detect(is, new Metadata());
                     mediaTypeSet.add(mt);
@@ -171,10 +169,7 @@ public class ZipContainerDetectorTest extends TikaTest {
                     getRecursiveMetadata(is, true);
                 }
                 stream += System.currentTimeMillis()-start;
-            }
-
-            for (File z : zips) {
-                long start = System.currentTimeMillis();
+                start = System.currentTimeMillis();
                 try (InputStream is = TikaInputStream.get(z)) {
                     getRecursiveMetadata(is, true);
                 }