You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2020/09/25 15:15:19 UTC
[tika] branch main updated: TIKA-3196 -- ensure that entryCnt is
thread-safe across parses; add integration test;
clean up existing unused imports.
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 6591b32 TIKA-3196 -- ensure that entryCnt is thread-safe across parses; add integration test; clean up existing unused imports.
6591b32 is described below
commit 6591b32fd6e60e29b505e78e8fb8529be337ef75
Author: tallison <ta...@apache.org>
AuthorDate: Fri Sep 25 11:15:00 2020 -0400
TIKA-3196 -- ensure that entryCnt is thread-safe across parses; add integration test; clean up existing unused imports.
---
.../org/apache/tika/parser/pkg/ZipParserTest.java | 19 ++++++++++++-
.../org/apache/tika/parser/pkg/PackageParser.java | 30 +++++++++++----------
.../org/apache/tika/parser/pkg/ZipParserTest.java | 8 ------
.../testZip_with_DataDescriptor2.zip | Bin 0 -> 1987 bytes
4 files changed, 34 insertions(+), 23 deletions(-)
diff --git a/tika-parser-modules/tika-parser-integration-tests/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java b/tika-parser-modules/tika-parser-integration-tests/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java
index 3566f49..ba383fd 100644
--- a/tika-parser-modules/tika-parser-integration-tests/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java
+++ b/tika-parser-modules/tika-parser-integration-tests/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java
@@ -32,7 +32,6 @@ import java.util.Set;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
-import static org.junit.Assert.assertNull;
import static org.junit.Assert.assertTrue;
/**
@@ -133,4 +132,22 @@ public class ZipParserTest extends AbstractPkgTest {
//the original file name
getRecursiveMetadata("droste.zip");
}
+
+ @Test
+ public void testDataDescriptorWithEmptyEntry() throws Exception {
+
+ //test that an empty first entry does not cause problems
+ List<Metadata> results = getRecursiveMetadata("testZip_with_DataDescriptor2.zip");
+ assertEquals(5, results.size());
+
+ //mime is 0 bytes
+ assertContains("InputStream must have > 0 bytes",
+ results.get(1).get("X-TIKA:EXCEPTION:embedded_exception"));
+ //source.xml is binary, not xml
+ assertContains("TikaException: XML parse error",
+ results.get(2).get("X-TIKA:EXCEPTION:embedded_exception"));
+ //manifest.xml has malformed xml
+ assertContains("TikaException: XML parse error",
+ results.get(4).get("X-TIKA:EXCEPTION:embedded_exception"));
+ }
}
diff --git a/tika-parser-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java b/tika-parser-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
index b099aff..0ad3c6b 100644
--- a/tika-parser-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
+++ b/tika-parser-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
@@ -24,6 +24,7 @@ import java.util.Collections;
import java.util.Date;
import java.util.HashSet;
import java.util.Set;
+import java.util.concurrent.atomic.AtomicInteger;
import java.util.zip.ZipEntry;
import org.apache.commons.compress.PasswordRequiredException;
@@ -105,8 +106,6 @@ public class PackageParser extends AbstractParser {
// the mark limit used for stream
private static final int MARK_LIMIT = 100 * 1024 * 1024; // 100M
- // count of the entries in the archive, this is used for zip requires Data Descriptor
- private int entryCnt = 0;
static final Set<MediaType> loadPackageSpecializations() {
Set<MediaType> zipSpecializations = new HashSet<>();
@@ -273,9 +272,11 @@ public class PackageParser extends AbstractParser {
// mark before we start parsing entries for potential reset
stream.mark(MARK_LIMIT);
- entryCnt = 0;
+ //AtomicInteger serves as a mutable int passed by reference; it is not needed for thread safety.
+ //It keeps track of how many entries were processed.
+ AtomicInteger entryCnt = new AtomicInteger();
try {
- parseEntries(false, ais, metadata, extractor, xhtml);
+ parseEntries(ais, metadata, extractor, xhtml, false, entryCnt);
} catch (UnsupportedZipFeatureException zfe) {
// If this is a zip archive which requires a data descriptor, parse it again
if (zfe.getFeature() == Feature.DATA_DESCRIPTOR) {
@@ -283,14 +284,13 @@ public class PackageParser extends AbstractParser {
ais.close();
// An exception would be thrown if MARK_LIMIT is not big enough
stream.reset();
- ais = new ZipArchiveInputStream(new CloseShieldInputStream(stream), encoding, true, true);
- parseEntries(true, ais, metadata, extractor, xhtml);
+ ais = new ZipArchiveInputStream(new CloseShieldInputStream(stream), encoding,
+ true, true);
+ parseEntries(ais, metadata, extractor, xhtml, true, entryCnt);
}
} finally {
ais.close();
tmp.close();
- // reset the entryCnt
- entryCnt = 0;
}
xhtml.endDocument();
@@ -299,27 +299,29 @@ public class PackageParser extends AbstractParser {
/**
* Parse the entries of the zip archive
*
- * @param shouldUseDataDescriptor indicates if a data descriptor is required or not
* @param ais archive input stream
* @param metadata document metadata (input and output)
* @param extractor the delegate parser
* @param xhtml the xhtml handler
+ * @param shouldUseDataDescriptor indicates if a data descriptor is required or not
+ * @param entryCnt running count of entries already processed; these are skipped on a data descriptor re-parse
* @throws TikaException if the document could not be parsed
* @throws IOException if an UnsupportedZipFeatureException is encountered
* @throws SAXException if the SAX events could not be processed
*/
- private void parseEntries(boolean shouldUseDataDescriptor, ArchiveInputStream ais, Metadata metadata,
- EmbeddedDocumentExtractor extractor, XHTMLContentHandler xhtml)
+ private void parseEntries(ArchiveInputStream ais, Metadata metadata,
+ EmbeddedDocumentExtractor extractor, XHTMLContentHandler xhtml,
+ boolean shouldUseDataDescriptor, AtomicInteger entryCnt)
throws TikaException, IOException, SAXException {
try {
ArchiveEntry entry = ais.getNextEntry();
while (entry != null) {
- if (shouldUseDataDescriptor && entryCnt > 0) {
+ if (shouldUseDataDescriptor && entryCnt.get() > 0) {
// With shouldUseDataDescriptor being true, we are reading
// the zip a second time. The first entryCnt entries were
// already parsed during the previous pass, so we can just
// skip them.
- entryCnt--;
+ entryCnt.decrementAndGet();
entry = ais.getNextEntry();
continue;
}
@@ -332,7 +334,7 @@ public class PackageParser extends AbstractParser {
// Record the number of entries we have read. This is used
// for zip archives using Data Descriptor, so that entries
// we have already read can be skipped.
- entryCnt++;
+ entryCnt.incrementAndGet();
}
entry = ais.getNextEntry();
diff --git a/tika-parser-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java b/tika-parser-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java
index 29ff990..0990ecd 100644
--- a/tika-parser-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java
+++ b/tika-parser-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java
@@ -21,18 +21,10 @@ import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertNull;
import static org.junit.Assert.assertTrue;
-import java.io.ByteArrayInputStream;
-import java.io.ByteArrayOutputStream;
-import java.io.FileOutputStream;
-import java.io.IOException;
import java.io.InputStream;
-import java.util.Arrays;
-import java.util.zip.ZipEntry;
import org.apache.commons.codec.binary.Base64;
import org.apache.commons.compress.archivers.ArchiveStreamFactory;
-import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
-import org.apache.commons.compress.archivers.zip.ZipArchiveOutputStream;
import org.apache.tika.Tika;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
diff --git a/tika-parser-modules/tika-parser-pkg-module/src/test/resources/test-documents/testZip_with_DataDescriptor2.zip b/tika-parser-modules/tika-parser-pkg-module/src/test/resources/test-documents/testZip_with_DataDescriptor2.zip
new file mode 100644
index 0000000..c698637
Binary files /dev/null and b/tika-parser-modules/tika-parser-pkg-module/src/test/resources/test-documents/testZip_with_DataDescriptor2.zip differ