Posted to commits@tika.apache.org by ta...@apache.org on 2020/09/25 15:15:19 UTC

[tika] branch main updated: TIKA-3196 -- ensure that entryCnt is thread-safe across parses; add integration test; clean up existing unused imports.

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new 6591b32  TIKA-3196 -- ensure that entryCnt is thread-safe across parses; add integration test; clean up existing unused imports.
6591b32 is described below

commit 6591b32fd6e60e29b505e78e8fb8529be337ef75
Author: tallison <ta...@apache.org>
AuthorDate: Fri Sep 25 11:15:00 2020 -0400

    TIKA-3196 -- ensure that entryCnt is thread-safe across parses; add integration test; clean up existing unused imports.
---
 .../org/apache/tika/parser/pkg/ZipParserTest.java  |  19 ++++++++++++-
 .../org/apache/tika/parser/pkg/PackageParser.java  |  30 +++++++++++----------
 .../org/apache/tika/parser/pkg/ZipParserTest.java  |   8 ------
 .../testZip_with_DataDescriptor2.zip               | Bin 0 -> 1987 bytes
 4 files changed, 34 insertions(+), 23 deletions(-)
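
Background for the change below: Tika parser instances are commonly shared across threads, so keeping entryCnt as an instance field on PackageParser meant two concurrent parses could overwrite each other's counter. The commit moves the counter into the parse() call as a method-local AtomicInteger, which is used only as a mutable int holder that can be passed by reference into parseEntries(). A minimal sketch of the before/after pattern (class and method names here are illustrative, not Tika's):

    import java.util.concurrent.atomic.AtomicInteger;

    public class CounterScopeSketch {

        // BEFORE (not thread-safe): a field shared by every parse() call
        // on the same instance, so concurrent parses interfere.
        private int sharedEntryCnt = 0;

        // AFTER (thread-safe): each parse() owns its own counter and hands
        // it to the helper; AtomicInteger is used as a mutable int holder,
        // not for its atomicity.
        public void parse(Iterable<String> entries) {
            AtomicInteger entryCnt = new AtomicInteger();
            parseEntries(entries, entryCnt);
            System.out.println("processed " + entryCnt.get() + " entries");
        }

        private void parseEntries(Iterable<String> entries, AtomicInteger entryCnt) {
            for (String entry : entries) {
                // ... handle the entry ...
                entryCnt.incrementAndGet();
            }
        }

        public static void main(String[] args) {
            new CounterScopeSketch().parse(java.util.List.of("a", "b", "c"));
        }
    }
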

diff --git a/tika-parser-modules/tika-parser-integration-tests/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java b/tika-parser-modules/tika-parser-integration-tests/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java
index 3566f49..ba383fd 100644
--- a/tika-parser-modules/tika-parser-integration-tests/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java
+++ b/tika-parser-modules/tika-parser-integration-tests/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java
@@ -32,7 +32,6 @@ import java.util.Set;
 
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertNotNull;
-import static org.junit.Assert.assertNull;
 import static org.junit.Assert.assertTrue;
 
 /**
@@ -133,4 +132,22 @@ public class ZipParserTest extends AbstractPkgTest {
         //the original file name
         getRecursiveMetadata("droste.zip");
     }
+
+    @Test
+    public void testDataDescriptorWithEmptyEntry() throws Exception {
+
+        //test that an empty first entry does not cause problems
+        List<Metadata> results = getRecursiveMetadata("testZip_with_DataDescriptor2.zip");
+        assertEquals(5, results.size());
+
+        //mime is 0 bytes
+        assertContains("InputStream must have > 0 bytes",
+                results.get(1).get("X-TIKA:EXCEPTION:embedded_exception"));
+        //source.xml is binary, not xml
+        assertContains("TikaException: XML parse error",
+                results.get(2).get("X-TIKA:EXCEPTION:embedded_exception"));
+        //manifest.xml has malformed xml
+        assertContains("TikaException: XML parse error",
+                results.get(4).get("X-TIKA:EXCEPTION:embedded_exception"));
+    }
 }
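
For readers outside the test harness: getRecursiveMetadata(...) is a helper from Tika's shared TikaTest base class. A rough, hedged equivalent built on Tika's RecursiveParserWrapper might look like the sketch below (the classpath resource location is an assumption for illustration; exact APIs can differ between Tika versions):

    import java.io.InputStream;
    import java.util.List;

    import org.apache.tika.metadata.Metadata;
    import org.apache.tika.parser.AutoDetectParser;
    import org.apache.tika.parser.ParseContext;
    import org.apache.tika.parser.RecursiveParserWrapper;
    import org.apache.tika.sax.BasicContentHandlerFactory;
    import org.apache.tika.sax.RecursiveParserWrapperHandler;

    public class RecursiveMetadataSketch {
        public static void main(String[] args) throws Exception {
            RecursiveParserWrapper wrapper = new RecursiveParserWrapper(new AutoDetectParser());
            RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(
                    new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1));
            // assumption: the test zip is available as a classpath resource
            try (InputStream is = RecursiveMetadataSketch.class
                    .getResourceAsStream("/test-documents/testZip_with_DataDescriptor2.zip")) {
                wrapper.parse(is, handler, new Metadata(), new ParseContext());
            }
            // one Metadata object per document: the container plus each embedded entry
            List<Metadata> results = handler.getMetadataList();
            for (Metadata m : results) {
                System.out.println(m.get("X-TIKA:embedded_resource_path") + " -> "
                        + m.get("X-TIKA:EXCEPTION:embedded_exception"));
            }
        }
    }
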
diff --git a/tika-parser-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java b/tika-parser-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
index b099aff..0ad3c6b 100644
--- a/tika-parser-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
+++ b/tika-parser-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
@@ -24,6 +24,7 @@ import java.util.Collections;
 import java.util.Date;
 import java.util.HashSet;
 import java.util.Set;
+import java.util.concurrent.atomic.AtomicInteger;
 import java.util.zip.ZipEntry;
 
 import org.apache.commons.compress.PasswordRequiredException;
@@ -105,8 +106,6 @@ public class PackageParser extends AbstractParser {
     // the mark limit used for stream
     private static final int MARK_LIMIT = 100 * 1024 * 1024; // 100M
 
-    // count of the entries in the archive, this is used for zip requires Data Descriptor
-    private int entryCnt = 0;
 
     static final Set<MediaType> loadPackageSpecializations() {
         Set<MediaType> zipSpecializations = new HashSet<>();
@@ -273,9 +272,11 @@ public class PackageParser extends AbstractParser {
 
         // mark before we start parsing entries for potential reset
         stream.mark(MARK_LIMIT);
-        entryCnt = 0;
+        //needed for mutable int by ref, not for thread safety.
+        //this keeps track of how many entries were processed.
+        AtomicInteger entryCnt = new AtomicInteger();
         try {
-            parseEntries(false, ais, metadata, extractor, xhtml);
+            parseEntries(ais, metadata, extractor, xhtml, false, entryCnt);
         } catch (UnsupportedZipFeatureException zfe) {
             // If this is a zip archive which requires a data descriptor, parse it again
             if (zfe.getFeature() == Feature.DATA_DESCRIPTOR) {
@@ -283,14 +284,13 @@ public class PackageParser extends AbstractParser {
                 ais.close();
                 // An exception would be thrown if MARK_LIMIT is not big enough
                 stream.reset();
-                ais = new ZipArchiveInputStream(new CloseShieldInputStream(stream), encoding, true, true);
-                parseEntries(true, ais, metadata, extractor, xhtml);
+                ais = new ZipArchiveInputStream(new CloseShieldInputStream(stream), encoding,
+                        true, true);
+                parseEntries(ais, metadata, extractor, xhtml, true, entryCnt);
             }
         } finally {
             ais.close();
             tmp.close();
-            // reset the entryCnt
-            entryCnt = 0;
         }
 
         xhtml.endDocument();
@@ -299,27 +299,29 @@ public class PackageParser extends AbstractParser {
     /**
      * Parse the entries of the zip archive
      *
-     * @param shouldUseDataDescriptor indicates if a data descriptor is required or not
      * @param ais archive input stream
      * @param metadata document metadata (input and output)
      * @param extractor the delegate parser
      * @param xhtml the xhtml handler
+     * @param shouldUseDataDescriptor indicates if a data descriptor is required or not
+     * @param entryCnt index of the entry
      * @throws TikaException if the document could not be parsed
      * @throws IOException if a UnsupportedZipFeatureException is met
      * @throws SAXException if the SAX events could not be processed
      */
-    private void parseEntries(boolean shouldUseDataDescriptor, ArchiveInputStream ais, Metadata metadata,
-                              EmbeddedDocumentExtractor extractor, XHTMLContentHandler xhtml)
+    private void parseEntries(ArchiveInputStream ais, Metadata metadata,
+                              EmbeddedDocumentExtractor extractor, XHTMLContentHandler xhtml,
+                              boolean shouldUseDataDescriptor, AtomicInteger entryCnt)
             throws TikaException, IOException, SAXException {
         try {
             ArchiveEntry entry = ais.getNextEntry();
             while (entry != null) {
-                if (shouldUseDataDescriptor && entryCnt > 0) {
+                if (shouldUseDataDescriptor && entryCnt.get() > 0) {
                     // With shouldUseDataDescriptor being true, we are reading
                     // the zip once again. The number of entryCnt entries have
                     // already been parsed in the last time, so we can just
                     // skip these entries.
-                    entryCnt--;
+                    entryCnt.decrementAndGet();
                     entry = ais.getNextEntry();
                     continue;
                 }
@@ -332,7 +334,7 @@ public class PackageParser extends AbstractParser {
                     // Record the number of entries we have read, this is used
                     // for zip archives using Data Descriptor. It's used for
                     // skipping the entries we have already read
-                    entryCnt++;
+                    entryCnt.incrementAndGet();
                 }
 
                 entry = ais.getNextEntry();
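
The retry logic above works in two passes: the first pass parses entries until the zip stream reports that a data descriptor is required, with entryCnt recording how many entries were already handled; the outer stream is then reset and re-read with data-descriptor support enabled, and the already-counted entries are skipped. A stripped-down sketch of that control flow, with archive entries stood in by a list of strings and a hypothetical NeedsSecondPassException in place of UnsupportedZipFeatureException:

    import java.util.List;
    import java.util.concurrent.atomic.AtomicInteger;

    public class TwoPassSkipSketch {

        static class NeedsSecondPassException extends RuntimeException {}

        public static void main(String[] args) {
            List<String> entries = List.of("mime", "content.xml", "source.xml", "manifest.xml");
            AtomicInteger entryCnt = new AtomicInteger();
            try {
                parseEntries(entries, false, entryCnt);   // first pass, may bail out partway
            } catch (NeedsSecondPassException e) {
                // "reset the stream" and read it again, skipping what was already parsed
                parseEntries(entries, true, entryCnt);
            }
        }

        static void parseEntries(List<String> entries, boolean secondPass, AtomicInteger entryCnt) {
            for (String entry : entries) {
                if (secondPass && entryCnt.get() > 0) {
                    // already parsed in the first pass; skip it
                    entryCnt.decrementAndGet();
                    continue;
                }
                if (!secondPass && entry.equals("source.xml")) {
                    // stand-in for UnsupportedZipFeatureException(DATA_DESCRIPTOR)
                    throw new NeedsSecondPassException();
                }
                System.out.println("parsed " + entry);
                entryCnt.incrementAndGet();
            }
        }
    }
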
diff --git a/tika-parser-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java b/tika-parser-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java
index 29ff990..0990ecd 100644
--- a/tika-parser-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java
+++ b/tika-parser-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java
@@ -21,18 +21,10 @@ import static org.junit.Assert.assertNotNull;
 import static org.junit.Assert.assertNull;
 import static org.junit.Assert.assertTrue;
 
-import java.io.ByteArrayInputStream;
-import java.io.ByteArrayOutputStream;
-import java.io.FileOutputStream;
-import java.io.IOException;
 import java.io.InputStream;
-import java.util.Arrays;
-import java.util.zip.ZipEntry;
 
 import org.apache.commons.codec.binary.Base64;
 import org.apache.commons.compress.archivers.ArchiveStreamFactory;
-import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
-import org.apache.commons.compress.archivers.zip.ZipArchiveOutputStream;
 import org.apache.tika.Tika;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.io.TikaInputStream;
diff --git a/tika-parser-modules/tika-parser-pkg-module/src/test/resources/test-documents/testZip_with_DataDescriptor2.zip b/tika-parser-modules/tika-parser-pkg-module/src/test/resources/test-documents/testZip_with_DataDescriptor2.zip
new file mode 100644
index 0000000..c698637
Binary files /dev/null and b/tika-parser-modules/tika-parser-pkg-module/src/test/resources/test-documents/testZip_with_DataDescriptor2.zip differ