You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2020/09/25 15:30:23 UTC

[tika] branch branch_1x updated: Fix TIKA-3196 (#364)

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/branch_1x by this push:
     new 9736af8  Fix TIKA-3196 (#364)
9736af8 is described below

commit 9736af8d8df86cd974eaaa7b27e566af83cfb6c4
Author: Lee <55...@users.noreply.github.com>
AuthorDate: Fri Sep 25 22:04:54 2020 +0800

    Fix TIKA-3196 (#364)
    
    When reading a zip archive entry with STORED and Data Descriptor, an
    UnsupportedZipFeatureException is thrown. We can save the number of
    entries we have already read, reset the stream, and open the
    ZipArchiveInputStream again with Data Descriptor allowed. Then we can
    finish reading the rest of the entries.
    # Conflicts:
    #	tika-parsers/src/test/resources/test-documents/testZip_with_DataDescriptor.zip
---
 .../org/apache/tika/parser/pkg/PackageParser.java  |  87 +++++++++++++++++++--
 .../org/apache/tika/parser/pkg/ZipParserTest.java  |  39 +++++++++
 .../test-documents/testZip_with_DataDescriptor.zip | Bin 0 -> 484 bytes
 .../testZip_with_DataDescriptor2.zip               | Bin 0 -> 1987 bytes
 4 files changed, 120 insertions(+), 6 deletions(-)

diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
index ef112c9..6f6db03 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
@@ -24,6 +24,7 @@ import java.util.Collections;
 import java.util.Date;
 import java.util.HashSet;
 import java.util.Set;
+import java.util.zip.ZipEntry;
 
 import org.apache.commons.compress.PasswordRequiredException;
 import org.apache.commons.compress.archivers.ArchiveEntry;
@@ -105,6 +106,12 @@ public class PackageParser extends AbstractParser {
     static final Set<MediaType> PACKAGE_SPECIALIZATIONS =
             loadPackageSpecializations();
 
+    // the mark limit used for stream
+    private static final int MARK_LIMIT = 100 * 1024 * 1024; // 100M
+
+    // count of the entries in the archive, this is used for zip requires Data Descriptor
+    private int entryCnt = 0;
+
     static final Set<MediaType> loadPackageSpecializations() {
         Set<MediaType> zipSpecializations = new HashSet<>();
         for (String mediaTypeString : new String[]{
@@ -238,8 +245,10 @@ public class PackageParser extends AbstractParser {
         
         TemporaryResources tmp = new TemporaryResources();
         ArchiveInputStream ais = null;
+        String encoding = null;
         try {
             ArchiveStreamFactory factory = context.get(ArchiveStreamFactory.class, new ArchiveStreamFactory());
+            encoding = factory.getEntryEncoding();
             // At the end we want to close the archive stream to release
             // any associated resources, but the underlying document stream
             // should not be closed
@@ -289,29 +298,87 @@ public class PackageParser extends AbstractParser {
         XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
         xhtml.startDocument();
 
+        // mark before we start parsing entries for potential reset
+        stream.mark(MARK_LIMIT);
+        entryCnt = 0;
+        try {
+            parseEntries(false, ais, metadata, extractor, xhtml);
+        } catch (UnsupportedZipFeatureException zfe) {
+            // If this is a zip archive which requires a data descriptor, parse it again
+            if (zfe.getFeature() == Feature.DATA_DESCRIPTOR) {
+                // Close archive input stream and create a new one that could handle data descriptor
+                ais.close();
+                // An exception would be thrown if MARK_LIMIT is not big enough
+                stream.reset();
+                ais = new ZipArchiveInputStream(new CloseShieldInputStream(stream), encoding, true, true);
+                parseEntries(true, ais, metadata, extractor, xhtml);
+            }
+        } finally {
+            ais.close();
+            tmp.close();
+            // reset the entryCnt
+            entryCnt = 0;
+        }
+
+        xhtml.endDocument();
+    }
+
+    /**
+     * Parse the entries of the zip archive
+     *
+     * @param shouldUseDataDescriptor indicates if a data descriptor is required or not
+     * @param ais archive input stream
+     * @param metadata document metadata (input and output)
+     * @param extractor the delegate parser
+     * @param xhtml the xhtml handler
+     * @throws TikaException if the document could not be parsed
+     * @throws IOException if a UnsupportedZipFeatureException is met
+     * @throws SAXException if the SAX events could not be processed
+     */
+    private void parseEntries(boolean shouldUseDataDescriptor, ArchiveInputStream ais, Metadata metadata,
+                              EmbeddedDocumentExtractor extractor, XHTMLContentHandler xhtml)
+            throws TikaException, IOException, SAXException {
         try {
             ArchiveEntry entry = ais.getNextEntry();
             while (entry != null) {
+                if (shouldUseDataDescriptor && entryCnt > 0) {
+                    // With shouldUseDataDescriptor being true, we are reading
+                    // the zip once again. The number of entryCnt entries have
+                    // already been parsed in the last time, so we can just
+                    // skip these entries.
+                    entryCnt--;
+                    entry = ais.getNextEntry();
+                    continue;
+                }
+
                 if (!entry.isDirectory()) {
                     parseEntry(ais, entry, extractor, metadata, xhtml);
                 }
+
+                if (!shouldUseDataDescriptor) {
+                    // Record the number of entries we have read, this is used
+                    // for zip archives using Data Descriptor. It's used for
+                    // skipping the entries we have already read
+                    entryCnt++;
+                }
+
                 entry = ais.getNextEntry();
             }
         } catch (UnsupportedZipFeatureException zfe) {
+
             // If it's an encrypted document of unknown password, report as such
             if (zfe.getFeature() == Feature.ENCRYPTION) {
                 throw new EncryptedDocumentException(zfe);
             }
+
+            if (zfe.getFeature() == Feature.DATA_DESCRIPTOR) {
+                throw zfe;
+            }
             // Otherwise throw the exception
             throw new TikaException("UnsupportedZipFeature", zfe);
         } catch (PasswordRequiredException pre) {
             throw new EncryptedDocumentException(pre);
-        } finally {
-            ais.close();
-            tmp.close();
         }
-
-        xhtml.endDocument();
     }
 
     private void updateMediaType(ArchiveInputStream ais, Metadata metadata) {
@@ -364,11 +431,19 @@ public class PackageParser extends AbstractParser {
         } else {
             name = (name == null) ? "" : name;
             if (entry instanceof ZipArchiveEntry) {
-                boolean usesEncryption = ((ZipArchiveEntry) entry).getGeneralPurposeBit().usesEncryption();
+                ZipArchiveEntry zipArchiveEntry = (ZipArchiveEntry) entry;
+                boolean usesEncryption = zipArchiveEntry.getGeneralPurposeBit().usesEncryption();
                 if (usesEncryption) {
                     EmbeddedDocumentUtil.recordEmbeddedStreamException(
                             new EncryptedDocumentException("stream ("+name+") is encrypted"), parentMetadata);
                 }
+
+                // do not write to the handler if UnsupportedZipFeatureException.Feature.DATA_DESCRIPTOR
+                // is met, we will catch this exception and read the zip archive once again
+                boolean usesDataDescriptor = zipArchiveEntry.getGeneralPurposeBit().usesDataDescriptor();
+                if (usesDataDescriptor && zipArchiveEntry.getMethod() == ZipEntry.STORED) {
+                    throw new UnsupportedZipFeatureException(UnsupportedZipFeatureException.Feature.DATA_DESCRIPTOR, zipArchiveEntry);
+                }
             } else {
                 EmbeddedDocumentUtil.recordEmbeddedStreamException(
                         new TikaException("Can't read archive stream ("+name+")"), parentMetadata);
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java
index 93f0109..a843491 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java
@@ -237,4 +237,43 @@ public class ZipParserTest extends AbstractPkgTest {
         getXML("droste.zip");
     }
 
+    @Test
+    public void testZipUsingStoredWithDataDescriptor() throws Exception {
+        ContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+
+        try (InputStream stream = ZipParserTest.class.getResourceAsStream(
+                "/test-documents/testZip_with_DataDescriptor.zip")) {
+            AUTO_DETECT_PARSER.parse(stream, handler, metadata, trackingContext);
+
+            assertEquals(5, tracker.filenames.size());
+            assertEquals("en0", tracker.filenames.get(0));
+            assertEquals("en1", tracker.filenames.get(1));
+            assertEquals("en2", tracker.filenames.get(2));
+            assertEquals("en3", tracker.filenames.get(3));
+            assertEquals("en4", tracker.filenames.get(4));
+            assertEquals(1, tracker.lastSeenStart[0]);
+            assertEquals(2, tracker.lastSeenStart[1]);
+            assertEquals(3, tracker.lastSeenStart[2]);
+            assertEquals(4, tracker.lastSeenStart[3]);
+        }
+    }
+
+    @Test
+    public void testDataDescriptorWithEmptyEntry() throws Exception {
+
+        //test that an empty first entry does not cause problems
+        List<Metadata> results = getRecursiveMetadata("testZip_with_DataDescriptor2.zip");
+        assertEquals(5, results.size());
+
+        //mime is 0 bytes
+        assertContains("InputStream must have > 0 bytes",
+                results.get(1).get("X-TIKA:EXCEPTION:embedded_exception"));
+        //source.xml is binary, not xml
+        assertContains("TikaException: XML parse error",
+                results.get(2).get("X-TIKA:EXCEPTION:embedded_exception"));
+        //manifest.xml has malformed xml
+        assertContains("TikaException: XML parse error",
+                results.get(4).get("X-TIKA:EXCEPTION:embedded_exception"));
+    }
 }
diff --git a/tika-parsers/src/test/resources/test-documents/testZip_with_DataDescriptor.zip b/tika-parsers/src/test/resources/test-documents/testZip_with_DataDescriptor.zip
new file mode 100644
index 0000000..1f06197
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testZip_with_DataDescriptor.zip differ
diff --git a/tika-parsers/src/test/resources/test-documents/testZip_with_DataDescriptor2.zip b/tika-parsers/src/test/resources/test-documents/testZip_with_DataDescriptor2.zip
new file mode 100644
index 0000000..c698637
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testZip_with_DataDescriptor2.zip differ