You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2020/03/06 21:07:55 UTC
[tika] 02/04: TIKA-3061 -- need to read inputstream completely before processing in StreamingZipContainerDetector
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git
commit 8a2cf1ec3e8f39ba40759db0d30343b178ebfede
Author: tallison <ta...@apache.org>
AuthorDate: Fri Mar 6 11:35:51 2020 -0500
TIKA-3061 -- need to read inputstream completely before processing in StreamingZipContainerDetector
---
tika-core/src/test/java/org/apache/tika/TikaTest.java | 5 +++++
.../tika/parser/pkg/StreamingZipContainerDetector.java | 10 +++++++++-
.../apache/tika/detect/TestContainerAwareDetector.java | 9 +++++++++
.../resources/test-documents/testOpenOfficeInAZip.zip | Bin 0 -> 10706 bytes
4 files changed, 23 insertions(+), 1 deletion(-)
diff --git a/tika-core/src/test/java/org/apache/tika/TikaTest.java b/tika-core/src/test/java/org/apache/tika/TikaTest.java
index cbd8f1c..ac1ef3c 100644
--- a/tika-core/src/test/java/org/apache/tika/TikaTest.java
+++ b/tika-core/src/test/java/org/apache/tika/TikaTest.java
@@ -255,6 +255,11 @@ public abstract class TikaTest {
return getRecursiveMetadata(tis, new ParseContext(), new Metadata(), suppressException);
}
}
+ protected List<Metadata> getRecursiveMetadata(Path filePath) throws Exception {
+ try (TikaInputStream tis = TikaInputStream.get(filePath)) {
+ return getRecursiveMetadata(tis, true);
+ }
+ }
protected List<Metadata> getRecursiveMetadata(InputStream is, boolean suppressException) throws Exception {
return getRecursiveMetadata(is, new ParseContext(), new Metadata(), suppressException);
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/StreamingZipContainerDetector.java b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/StreamingZipContainerDetector.java
index b55ed1a..a39f9f4 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/StreamingZipContainerDetector.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/StreamingZipContainerDetector.java
@@ -18,6 +18,7 @@ package org.apache.tika.parser.pkg;
import static java.nio.charset.StandardCharsets.UTF_8;
+import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.InputStream;
import java.util.HashSet;
@@ -139,7 +140,13 @@ public class StreamingZipContainerDetector extends ZipContainerDetectorBase impl
return MediaType.parse(new String(bos.toByteArray(), UTF_8));
}
} else if (name.equals("META-INF/manifest.xml")) {
- MediaType mt = detectStarOfficeX(zipArchiveInputStream);
+ //for an unknown reason, passing in the zipArchiveInputStream
+ //"as is" can cause the iteration of the entries to stop early
+ //without exception or warning. So, copy the full stream, then
+ //process. TIKA-3061
+ ByteArrayOutputStream bos = new ByteArrayOutputStream();
+ IOUtils.copy(zipArchiveInputStream, bos);
+ MediaType mt = detectStarOfficeX(new ByteArrayInputStream(bos.toByteArray()));
if (mt != null) {
return mt;
}
@@ -157,6 +164,7 @@ public class StreamingZipContainerDetector extends ZipContainerDetectorBase impl
} catch (SecurityException e) {
throw e;
} catch (Exception e) {
+ e.printStackTrace();
//swallow
}
//entrynames is the union of directory names and file names
diff --git a/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java b/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
index ed27456..2fa274a 100644
--- a/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
+++ b/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
@@ -25,6 +25,7 @@ import java.io.FileFilter;
import java.io.FilenameFilter;
import java.io.IOException;
import java.io.InputStream;
+import java.util.List;
import java.util.Random;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
@@ -554,4 +555,12 @@ public class TestContainerAwareDetector extends MultiThreadedTikaTest {
testDetector(detector, numThreads, 50, filter, numThreads*3);
}
+ @Test
+ public void testOpenOfficeInAZip() throws Exception {
+ List<Metadata> metadataList = getRecursiveMetadata(
+ "testOpenOfficeInAZip.zip");
+ assertEquals(3, metadataList.size());
+ assertEquals("application/vnd.oasis.opendocument.presentation",
+ metadataList.get(2).get(Metadata.CONTENT_TYPE));
+ }
}
diff --git a/tika-parsers/src/test/resources/test-documents/testOpenOfficeInAZip.zip b/tika-parsers/src/test/resources/test-documents/testOpenOfficeInAZip.zip
new file mode 100644
index 0000000..2546048
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testOpenOfficeInAZip.zip differ