You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2019/04/19 17:45:58 UTC
[tika] branch TIKA-2849 updated: TIKA-2849 -- add similar treatment
for POIFSContainerDetector thanks to Jukka!
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch TIKA-2849
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/TIKA-2849 by this push:
new d7d69f6 TIKA-2849 -- add similar treatment for POIFSContainerDetector thanks to Jukka!
d7d69f6 is described below
commit d7d69f66258a861affe7150cc42b874961fc986a
Author: TALLISON <ta...@apache.org>
AuthorDate: Fri Apr 19 13:45:44 2019 -0400
TIKA-2849 -- add similar treatment for POIFSContainerDetector thanks
to Jukka!
---
CHANGES.txt | 4 ++-
.../java/org/apache/tika/io/TikaInputStream.java | 35 ++++++++++++++++++++--
.../parser/microsoft/POIFSContainerDetector.java | 22 +++++++++++---
.../tika/parser/pkg/ZipContainerDetector.java | 32 ++++++--------------
.../tika/parser/pkg/ZipContainerDetectorTest.java | 15 ++++------
5 files changed, 68 insertions(+), 40 deletions(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index 61d4c7c..af8fcd0 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -11,7 +11,9 @@ Release 1.21 - ????
* The ZipContainerDetector's default behavior was changed to run
streaming detection up to its markLimit. Users can get the
legacy behavior (spool-to-file/rely-on-underlying-file-in-TikaInputStream)
- by setting "spoolToFile" to true and using a TikaInputStream (TIKA-2849).
+ by setting markLimit=-1. The POIFSContainerDetector requires an underlying file,
+ so it will try to spool the stream to disk; if the stream's length is >= markLimit,
+ it will not attempt detection. Set markLimit to -1 for legacy behavior (TIKA-2849).
* Upgrade PDFBox to 2.0.14 (TIKA-2834).
diff --git a/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java b/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java
index 96f922f..855ab28 100644
--- a/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java
+++ b/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java
@@ -623,14 +623,45 @@ public class TikaInputStream extends TaggedInputStream {
return path != null;
}
+
+ /**
+ * If the user created this TikaInputStream with a file,
+ * the original file will be returned. If not, the entire stream
+ * will be spooled to a temporary file which will be deleted
+ * upon the close of this TikaInputStream
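+ * @return the original file, or the temporary file that the stream was spooled to
+ * @throws IOException if the stream has already been read from, or if spooling fails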
+ */
public Path getPath() throws IOException {
+ return getPath(-1);
+ }
+
+ /**
+ *
+ * @param maxBytes if this is less than 0 and if an underlying file doesn't already exist,
+ * the full file will be spooled to disk
+ * @return the original path used in the initialization of this TikaInputStream,
+ * a temporary file if the stream was shorter than <code>maxBytes</code>, or <code>null</code>
+ * if the underlying stream was <code>maxBytes</code> long or longer.
+ * @throws IOException
+ */
+ public Path getPath(int maxBytes) throws IOException {
if (path == null) {
if (position > 0) {
throw new IOException("Stream is already being read");
} else {
- // Spool the entire stream into a temporary file
path = tmp.createTempFile();
- Files.copy(in, path, REPLACE_EXISTING);
+ if (maxBytes > -1) {
+ try (InputStream lookAhead = new LookaheadInputStream(in, maxBytes)) {
+ Files.copy(lookAhead, path, REPLACE_EXISTING);
+ if (Files.size(path) >= maxBytes) {
+ return null;
+ }
+ }
+ } else {
+ // Spool the entire stream into a temporary file
+ Files.copy(in, path, REPLACE_EXISTING);
+ }
// Create a new input stream and make sure it'll get closed
InputStream newStream = Files.newInputStream(path);
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java
index 6f32984..576cf52 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java
@@ -16,13 +16,13 @@
*/
package org.apache.tika.parser.microsoft;
-import static org.apache.tika.mime.MediaType.OCTET_STREAM;
import static org.apache.tika.mime.MediaType.application;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
+import java.nio.file.Path;
import java.util.Collections;
import java.util.HashSet;
import java.util.Set;
@@ -35,7 +35,9 @@ import org.apache.poi.poifs.filesystem.DocumentInputStream;
import org.apache.poi.poifs.filesystem.DocumentNode;
import org.apache.poi.poifs.filesystem.Entry;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+import org.apache.tika.config.Field;
import org.apache.tika.detect.Detector;
+import org.apache.tika.io.LookaheadInputStream;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
@@ -175,6 +177,13 @@ public class POIFSContainerDetector implements Detector {
*/
private static final Pattern mppDataMatch = Pattern.compile("\\s\\s\\s\\d+");
+ @Field
+ private int markLimit = 16 * 1024 * 1024;
+
+ public void setMarkLimit(int markLimit) {
+ this.markLimit = markLimit;
+ }
+
/**
* Internal detection of the specific kind of OLE2 document, based on the
* names of the top level streams within the file.
@@ -379,14 +388,19 @@ public class POIFSContainerDetector implements Detector {
return false;
}
- private static Set<String> getTopLevelNames(TikaInputStream stream)
+ private Set<String> getTopLevelNames(TikaInputStream stream)
throws IOException {
// Force the document stream to a (possibly temporary) file
// so we don't modify the current position of the stream
- File file = stream.getFile();
+ Path file = stream.getPath(markLimit);
+
+ //if the stream was longer than markLimit, don't detect
+ if (file == null) {
+ return Collections.emptySet();
+ }
try {
- POIFSFileSystem fs = new POIFSFileSystem(file, true);
+ POIFSFileSystem fs = new POIFSFileSystem(file.toFile(), true);
// Optimize a possible later parsing process by keeping
// a reference to the already opened POI file system
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
index d9770f7..7007b7c6 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
@@ -46,6 +46,7 @@ import org.apache.poi.openxml4j.util.ZipFileZipEntrySource;
import org.apache.tika.config.Field;
import org.apache.tika.detect.Detector;
import org.apache.tika.io.BoundedInputStream;
+import org.apache.tika.io.LookaheadInputStream;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
@@ -90,10 +91,7 @@ public class ZipContainerDetector implements Detector {
//this has to be > 100,000 to handle some of the iworks files
//in our unit tests
@Field
- int markLimit = 500_000;
-
- @Field
- boolean spoolToFile = false;
+ int markLimit = 16 * 1024 * 1024;
public MediaType detect(InputStream input, Metadata metadata)
throws IOException {
@@ -119,7 +117,7 @@ public class ZipContainerDetector implements Detector {
if (TikaInputStream.isTikaInputStream(input)) {
TikaInputStream tis = TikaInputStream.cast(input);
- if (spoolToFile) {
+ if (markLimit < 0) {
tis.getFile();
}
if (tis.hasFile()) {
@@ -127,12 +125,8 @@ public class ZipContainerDetector implements Detector {
}
}
- input.mark(markLimit);
- try {
- return StreamingZipContainerDetector.detect(
- new BoundedInputStream(markLimit, input));
- } finally {
- input.reset();
+ try (LookaheadInputStream lookahead = new LookaheadInputStream(input, markLimit)) {
+ return StreamingZipContainerDetector.detect(lookahead);
}
} else if (!type.equals(MediaType.OCTET_STREAM)) {
return type;
@@ -142,6 +136,10 @@ public class ZipContainerDetector implements Detector {
}
/**
+ * If this is less than 0, the file will be spooled to disk,
+ * and detection will run on the full file.
+ * If this is greater than 0, the {@link StreamingZipContainerDetector}
+ * will be called only up to the markLimit.
*
* @param markLimit mark limit for streaming detection
*/
@@ -149,18 +147,6 @@ public class ZipContainerDetector implements Detector {
this.markLimit = markLimit;
}
- /**
- * Before version 1.21, if a user passed in a {@link TikaInputStream},
- * the entire stream would be spooled to a file for Zip detection.
- * With Tika 1.21, the default is now <code>false</code>, which means
- * that the {@link ZipContainerDetector} will try streaming detection
- * up to the {@link ZipContainerDetector#markLimit} on all streams.
- * To revert to the legacy (pre 1.21) behavior, set this to <code>true</code>
- * @param spoolToFile
- */
- public void setSpoolToFile(boolean spoolToFile) {
- this.spoolToFile = spoolToFile;
- }
private static MediaType detectCompressorFormat(byte[] prefix, int length) {
try {
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZipContainerDetectorTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZipContainerDetectorTest.java
index f4f5059..8c92986 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZipContainerDetectorTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZipContainerDetectorTest.java
@@ -110,6 +110,7 @@ public class ZipContainerDetectorTest extends TikaTest {
}
}
+ @Ignore("for offline testing")
@Test
public void timeDetection() throws Exception {
TikaConfig config = TikaConfig.getDefaultConfig();
@@ -129,19 +130,16 @@ public class ZipContainerDetectorTest extends TikaTest {
mediaTypeSet.add(mt);
}
nonTikaStream += System.currentTimeMillis()-start;
- }
- for (File z : zips) {
- long start = System.currentTimeMillis();
+
+ start = System.currentTimeMillis();
try (InputStream is = TikaInputStream.get(
new BufferedInputStream(new FileInputStream(z)))) {
MediaType mt = detector.detect(is, new Metadata());
mediaTypeSet.add(mt);
}
tikaStream += System.currentTimeMillis()-start;
- }
- for (File z : zips) {
- long start = System.currentTimeMillis();
+ start = System.currentTimeMillis();
try (InputStream is = TikaInputStream.get(z)) {
MediaType mt = detector.detect(is, new Metadata());
mediaTypeSet.add(mt);
@@ -171,10 +169,7 @@ public class ZipContainerDetectorTest extends TikaTest {
getRecursiveMetadata(is, true);
}
stream += System.currentTimeMillis()-start;
- }
-
- for (File z : zips) {
- long start = System.currentTimeMillis();
+ start = System.currentTimeMillis();
try (InputStream is = TikaInputStream.get(z)) {
getRecursiveMetadata(is, true);
}