You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2020/03/06 21:07:55 UTC

[tika] 02/04: TIKA-3061 -- need to read inputstream completely before processing in StreamingZipContainerDetector

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 8a2cf1ec3e8f39ba40759db0d30343b178ebfede
Author: tallison <ta...@apache.org>
AuthorDate: Fri Mar 6 11:35:51 2020 -0500

    TIKA-3061 -- need to read inputstream completely before processing in StreamingZipContainerDetector
---
 tika-core/src/test/java/org/apache/tika/TikaTest.java   |   5 +++++
 .../tika/parser/pkg/StreamingZipContainerDetector.java  |  10 +++++++++-
 .../apache/tika/detect/TestContainerAwareDetector.java  |   9 +++++++++
 .../resources/test-documents/testOpenOfficeInAZip.zip   | Bin 0 -> 10706 bytes
 4 files changed, 23 insertions(+), 1 deletion(-)

diff --git a/tika-core/src/test/java/org/apache/tika/TikaTest.java b/tika-core/src/test/java/org/apache/tika/TikaTest.java
index cbd8f1c..ac1ef3c 100644
--- a/tika-core/src/test/java/org/apache/tika/TikaTest.java
+++ b/tika-core/src/test/java/org/apache/tika/TikaTest.java
@@ -255,6 +255,11 @@ public abstract class TikaTest {
             return getRecursiveMetadata(tis, new ParseContext(), new Metadata(), suppressException);
         }
     }
+    protected List<Metadata> getRecursiveMetadata(Path filePath) throws Exception {
+        try (TikaInputStream tis = TikaInputStream.get(filePath)) {
+            return getRecursiveMetadata(tis, true);
+        }
+    }
 
     protected List<Metadata> getRecursiveMetadata(InputStream is, boolean suppressException) throws Exception {
         return getRecursiveMetadata(is, new ParseContext(), new Metadata(), suppressException);
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/StreamingZipContainerDetector.java b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/StreamingZipContainerDetector.java
index b55ed1a..a39f9f4 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/StreamingZipContainerDetector.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/StreamingZipContainerDetector.java
@@ -18,6 +18,7 @@ package org.apache.tika.parser.pkg;
 
 import static java.nio.charset.StandardCharsets.UTF_8;
 
+import java.io.ByteArrayInputStream;
 import java.io.ByteArrayOutputStream;
 import java.io.InputStream;
 import java.util.HashSet;
@@ -139,7 +140,13 @@ public class StreamingZipContainerDetector extends ZipContainerDetectorBase impl
                         return MediaType.parse(new String(bos.toByteArray(), UTF_8));
                     }
                 } else if (name.equals("META-INF/manifest.xml")) {
-                    MediaType mt = detectStarOfficeX(zipArchiveInputStream);
+                    //for an unknown reason, passing in the zipArchiveInputStream
+                    //"as is" can cause the iteration of the entries to stop early
+                    //without exception or warning.  So, copy the full stream, then
+                    //process.  TIKA-3061
+                    ByteArrayOutputStream bos = new ByteArrayOutputStream();
+                    IOUtils.copy(zipArchiveInputStream, bos);
+                    MediaType mt = detectStarOfficeX(new ByteArrayInputStream(bos.toByteArray()));
                     if (mt != null) {
                         return mt;
                     }
@@ -157,6 +164,7 @@ public class StreamingZipContainerDetector extends ZipContainerDetectorBase impl
         } catch (SecurityException e) {
             throw e;
         } catch (Exception e) {
+            e.printStackTrace();
             //swallow
         }
         //entrynames is the union of directory names and file names
diff --git a/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java b/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
index ed27456..2fa274a 100644
--- a/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
+++ b/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
@@ -25,6 +25,7 @@ import java.io.FileFilter;
 import java.io.FilenameFilter;
 import java.io.IOException;
 import java.io.InputStream;
+import java.util.List;
 import java.util.Random;
 
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
@@ -554,4 +555,12 @@ public class TestContainerAwareDetector extends MultiThreadedTikaTest {
         testDetector(detector, numThreads, 50, filter, numThreads*3);
     }
 
+    @Test
+    public void testOpenOfficeInAZip() throws Exception {
+        List<Metadata> metadataList = getRecursiveMetadata(
+                "testOpenOfficeInAZip.zip");
+        assertEquals(3, metadataList.size());
+        assertEquals("application/vnd.oasis.opendocument.presentation",
+                metadataList.get(2).get(Metadata.CONTENT_TYPE));
+    }
 }
diff --git a/tika-parsers/src/test/resources/test-documents/testOpenOfficeInAZip.zip b/tika-parsers/src/test/resources/test-documents/testOpenOfficeInAZip.zip
new file mode 100644
index 0000000..2546048
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testOpenOfficeInAZip.zip differ