You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2023/08/16 17:27:58 UTC
[tika] 01/01: TIKA-4048 -- revert change in default decompressConcatenated and add a gzip subtype detector for warc+gz
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch TIKA-4048
in repository https://gitbox.apache.org/repos/asf/tika.git
commit 46832d718eb1572d3cca4d7d18415afff336d4c6
Author: tallison <ta...@apache.org>
AuthorDate: Wed Aug 16 13:27:43 2023 -0400
TIKA-4048 -- revert change in default decompressConcatenated and add a gzip subtype detector for warc+gz
---
CHANGES.txt | 3 +-
.../org/apache/tika/mime/tika-mimetypes.xml | 6 ++
.../src/test/java/org/apache/tika/TikaTest.java | 63 +++++++++++++++
.../detect/gzip/GZipSpecializationDetector.java | 90 +++++++++++++++++++++
.../apache/tika/parser/pkg/CompressorParser.java | 2 +-
.../services/org.apache.tika.detect.Detector | 16 ++++
.../tika-parser-webarchive-module/pom.xml | 8 +-
.../org/apache/tika/parser/warc/WARCParser.java | 5 +-
.../apache/tika/parser/warc/WARCParserTest.java | 23 +++++-
.../test-documents/testWARC_multiple.warc | Bin 0 -> 6773 bytes
.../test-documents/testWARC_multiple.warc.gz | Bin 0 -> 5907 bytes
.../apache/tika/detect/TestDetectorLoading.java | 8 +-
.../org/apache/tika/parser/pkg/GzipParserTest.java | 12 ++-
.../resources/configs/tika-config-multiple-gz.xml | 29 +++++++
14 files changed, 253 insertions(+), 12 deletions(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index 55bd83671..015a55e43 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -7,8 +7,7 @@ Release 2.8.1 - ???
* Fix bug that led to duplicate extraction of macros from some OLE2 containers (TIKA-4116).
- * Changed default decompressConcatenated to true in CompressorParser.
- Users may revert to legacy behavior via tika-config.xml (TIKA-4048).
+ * Add detection of warc.gz as a specialization of gz and parse as if a standard WARC (TIKA-4048).
* Allow users to modify the attachment limit size in the /unpack resource (TIKA-4039)
diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
index 362ace8c1..47203a163 100644
--- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
+++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
@@ -3212,6 +3212,12 @@
<glob pattern="*.warc"/>
</mime-type>
+ <mime-type type="application/warc+gz">
+ <acronym>WARC</acronym>
+ <_comment>WARC</_comment>
+ <glob pattern="*.warc.gz"/>
+ </mime-type>
+
<mime-type type="application/wasm">
<acronym>Wasm</acronym>
<_comment>Web Assembly</_comment>
diff --git a/tika-core/src/test/java/org/apache/tika/TikaTest.java b/tika-core/src/test/java/org/apache/tika/TikaTest.java
index a00d7b2b0..c20229b59 100644
--- a/tika-core/src/test/java/org/apache/tika/TikaTest.java
+++ b/tika-core/src/test/java/org/apache/tika/TikaTest.java
@@ -16,6 +16,7 @@
*/
package org.apache.tika;
+import static org.junit.jupiter.api.Assertions.assertArrayEquals;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertTrue;
@@ -101,6 +102,33 @@ public abstract class TikaTest {
assertFalse(haystack.contains(needle), needle + " unexpectedly found in:\n" + haystack);
}
+ public static void assertMetadataListEquals(List<Metadata> metadataListA,
+ List<Metadata> metadataListB,
+ Set<String> fieldsToIgnore) {
+ assertEquals(metadataListA.size(), metadataListB.size(), "different sizes");
+ for (int i = 0; i < metadataListA.size(); i++) {
+ Metadata mA = metadataListA.get(i);
+ Metadata mB = metadataListB.get(i);
+ Set<String> mAFields = new HashSet<>();
+ for (String n : mA.names()) {
+ if (fieldsToIgnore.contains(n)) {
+ continue;
+ }
+ mAFields.add(n);
+ assertArrayEquals(mA.getValues(n), mB.getValues(n), "problem with " + n +
+ " in metadata index=" + i);
+ }
+ Set<String> mBFields = new HashSet<>();
+ for (String n : mB.names()) {
+ if (fieldsToIgnore.contains(n)) {
+ continue;
+ }
+ mBFields.add(n);
+ }
+ assertEquals(mAFields, mBFields);
+ }
+ }
+
/**
* Test that in at least one item in metadataList, all keys and values
* in minExpected are contained.
@@ -315,6 +343,14 @@ public abstract class TikaTest {
return getRecursiveMetadata(filePath, new ParseContext());
}
+ protected List<Metadata> getRecursiveMetadata(String filePath,
+ BasicContentHandlerFactory.HANDLER_TYPE handlerType)
+ throws Exception {
+ return getRecursiveMetadata(filePath, TikaTest.AUTO_DETECT_PARSER, new Metadata(),
+ new ParseContext(), true,
+ handlerType);
+ }
+
protected List<Metadata> getRecursiveMetadata(String filePath, Metadata metadata)
throws Exception {
return getRecursiveMetadata(filePath, metadata, new ParseContext());
@@ -340,6 +376,16 @@ public abstract class TikaTest {
}
}
+ protected List<Metadata> getRecursiveMetadata(String filePath, Parser wrapped,
+ Metadata metadata, ParseContext context,
+ boolean suppressException,
+ BasicContentHandlerFactory.HANDLER_TYPE handlerType)
+ throws Exception {
+ try (InputStream is = getResourceAsStream("/test-documents/" + filePath)) {
+ return getRecursiveMetadata(is, wrapped, metadata, context, suppressException, handlerType);
+ }
+ }
+
protected List<Metadata> getRecursiveMetadata(Path path, ParseContext context,
boolean suppressException) throws Exception {
Metadata metadata = new Metadata();
@@ -406,6 +452,23 @@ public abstract class TikaTest {
return handler.getMetadataList();
}
+ protected List<Metadata> getRecursiveMetadata(InputStream is, Parser p, Metadata metadata,
+ ParseContext context, boolean suppressException,
+ BasicContentHandlerFactory.HANDLER_TYPE handlerType)
+ throws Exception {
+ RecursiveParserWrapper wrapper = new RecursiveParserWrapper(p);
+ RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(
+ new BasicContentHandlerFactory(handlerType, -1));
+ try {
+ wrapper.parse(is, handler, metadata, context);
+ } catch (Exception e) {
+ if (!suppressException) {
+ throw e;
+ }
+ }
+ return handler.getMetadataList();
+ }
+
protected List<Metadata> getRecursiveMetadata(String filePath, ParseContext context)
throws Exception {
RecursiveParserWrapper wrapper = new RecursiveParserWrapper(AUTO_DETECT_PARSER);
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/detect/gzip/GZipSpecializationDetector.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/detect/gzip/GZipSpecializationDetector.java
new file mode 100644
index 000000000..e3d743ad3
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/detect/gzip/GZipSpecializationDetector.java
@@ -0,0 +1,90 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.detect.gzip;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.StandardCharsets;
+import java.util.zip.GZIPInputStream;
+
+import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
+import org.apache.commons.compress.utils.IOUtils;
+import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream;
+import org.apache.commons.io.output.UnsynchronizedByteArrayOutputStream;
+
+import org.apache.tika.detect.Detector;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+
+/**
+ * This is designed to detect commonly gzipped file types such as warc.gz.
+ * This is a first step. We still need to implement tar.gz and svg.gz and ???
+ */
+public class GZipSpecializationDetector implements Detector {
+ public static MediaType GZ = MediaType.application("gzip");
+ public static MediaType WARC_GZ = MediaType.application("warc+gz");
+
+ @Override
+ public MediaType detect(InputStream input, Metadata metadata) throws IOException {
+ if (input == null) {
+ return MediaType.OCTET_STREAM;
+ }
+ input.mark(2);
+ byte[] firstTwo = new byte[2];
+ try {
+ IOUtils.readFully(input, firstTwo);
+ } finally {
+ input.reset();
+ }
+ int magic = ((firstTwo[1] & 0xff) << 8) | (firstTwo[0] & 0xff);
+ if (GZIPInputStream.GZIP_MAGIC != magic) {
+ return MediaType.OCTET_STREAM;
+ }
+ return detectSpecialization(input, metadata);
+ }
+
+ private MediaType detectSpecialization(InputStream input, Metadata metadata) throws IOException {
+
+ int buffSize = 1024;
+ UnsynchronizedByteArrayOutputStream gzippedBytes = new UnsynchronizedByteArrayOutputStream();
+ try {
+ IOUtils.copyRange(input, buffSize, gzippedBytes);
+ } catch (IOException e) {
+ //swallow
+ } finally {
+ input.reset();
+ }
+ UnsynchronizedByteArrayOutputStream bytes = new UnsynchronizedByteArrayOutputStream();
+ try (InputStream is = new
+ GzipCompressorInputStream(new UnsynchronizedByteArrayInputStream(gzippedBytes.toByteArray()))) {
+ int c = is.read();
+ //read bytes one at a time to avoid premature EOF from buffering
+ while (c > -1) {
+ bytes.write(c);
+ c = is.read();
+ }
+ } catch (IOException e) {
+ //swallow
+ }
+ //TODO: something better than this
+ String s = new String(bytes.toByteArray(), StandardCharsets.UTF_8);
+ if (s.startsWith("WARC/")) {
+ return WARC_GZ;
+ }
+ return GZ;
+ }
+}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java
index 6b42250b3..77f5b9647 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java
@@ -86,7 +86,7 @@ public class CompressorParser extends AbstractParser {
private static Set<MediaType> SUPPORTED_TYPES;
private static Map<String, String> MIMES_TO_NAME;
- private boolean decompressConcatenated = true;
+ private boolean decompressConcatenated = false;
static {
Set<MediaType> TMP_SET = new HashSet<>(MediaType
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/resources/META-INF/services/org.apache.tika.detect.Detector b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/resources/META-INF/services/org.apache.tika.detect.Detector
new file mode 100644
index 000000000..a5d143217
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/resources/META-INF/services/org.apache.tika.detect.Detector
@@ -0,0 +1,16 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+org.apache.tika.detect.gzip.GZipSpecializationDetector
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/pom.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/pom.xml
index c3743cb6f..d194f190e 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/pom.xml
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/pom.xml
@@ -38,7 +38,7 @@
<groupId>org.apache.commons</groupId>
<artifactId>commons-compress</artifactId>
</dependency>
- <!-- need these for detection/ungzipping and html parsing in tests -->
+ <!-- need these for detection/gunzipping and html+txt parsing in tests -->
<dependency>
<groupId>${project.groupId}</groupId>
<artifactId>tika-parser-html-module</artifactId>
@@ -51,6 +51,12 @@
<version>${project.version}</version>
<scope>test</scope>
</dependency>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-parser-text-module</artifactId>
+ <version>${project.version}</version>
+ <scope>test</scope>
+ </dependency>
</dependencies>
<build>
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/main/java/org/apache/tika/parser/warc/WARCParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/main/java/org/apache/tika/parser/warc/WARCParser.java
index 7e1dcc17a..8025f6643 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/main/java/org/apache/tika/parser/warc/WARCParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/main/java/org/apache/tika/parser/warc/WARCParser.java
@@ -52,7 +52,8 @@ import org.apache.tika.utils.StringUtils;
public class WARCParser extends AbstractParser {
private static final Set<MediaType> SUPPORTED_TYPES = Collections.unmodifiableSet(
- new HashSet<>(Arrays.asList(MediaType.application("warc"))));
+ new HashSet<>(Arrays.asList(MediaType.application("warc"),
+ MediaType.application("warc+gz"))));
public static String WARC_PREFIX = "warc:";
public static String WARC_HTTP_PREFIX = WARC_PREFIX + "http:";
@@ -137,6 +138,8 @@ public class WARCParser extends AbstractParser {
metadata.set(Metadata.CONTENT_LENGTH, Long.toString(payload.body().size()));
if (embeddedDocumentExtractor.shouldParseEmbedded(metadata)) {
+ //TODO check Content-Encoding on the warcResponse.http.headers and wrap the stream.
+ //May need to sniff the first few bytes to confirm accuracy, e.g. gzip compression?
try (InputStream tis = TikaInputStream.get(payload.body().stream())) {
embeddedDocumentExtractor.parseEmbedded(tis, xhtml, metadata, true);
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/java/org/apache/tika/parser/warc/WARCParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/java/org/apache/tika/parser/warc/WARCParserTest.java
index e37203ce6..c92f8ec15 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/java/org/apache/tika/parser/warc/WARCParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/java/org/apache/tika/parser/warc/WARCParserTest.java
@@ -18,13 +18,16 @@ package org.apache.tika.parser.warc;
import static org.junit.jupiter.api.Assertions.assertEquals;
+import java.util.HashSet;
import java.util.List;
+import java.util.Set;
import org.junit.jupiter.api.Test;
import org.apache.tika.TikaTest;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.sax.BasicContentHandlerFactory;
public class WARCParserTest extends TikaTest {
@@ -35,14 +38,30 @@ public class WARCParserTest extends TikaTest {
public void testBasic() throws Exception {
List<Metadata> metadataList = getRecursiveMetadata("cc.warc.gz");
- assertEquals(3, metadataList.size());
+ assertEquals(2, metadataList.size());
+ assertEquals("application/warc+gz", metadataList.get(0).get(Metadata.CONTENT_TYPE));
assertContains("text/html", metadataList.get(1).get(Metadata.CONTENT_TYPE));
assertContains("Common Crawl on Twitter", metadataList.get(1).get(TikaCoreProperties.TIKA_CONTENT));
- assertEquals("application/warc", metadataList.get(2).get(Metadata.CONTENT_TYPE));
assertEquals("<urn:uuid:c3f02271-44d2-4159-9cdb-3e3efeb16ba0>",
metadataList.get(1).get("warc:WARC-Warcinfo-ID"));
assertEquals("http://commoncrawl.org/",
metadataList.get(1).get("warc:WARC-Target-URI"));
+ }
+
+ @Test
+ public void testMultipleRecords() throws Exception {
+ //TIKA-4048
+ List<Metadata> metadataList = getRecursiveMetadata("testWARC_multiple.warc",
+ BasicContentHandlerFactory.HANDLER_TYPE.TEXT);
+ List<Metadata> gzMetadataList = getRecursiveMetadata("testWARC_multiple.warc.gz",
+ BasicContentHandlerFactory.HANDLER_TYPE.TEXT);
+
+ Set<String> fieldsToIgnore = new HashSet<>();
+ fieldsToIgnore.add("X-TIKA:parse_time_millis");
+ fieldsToIgnore.add("Content-Type");
+ assertMetadataListEquals(metadataList, gzMetadataList, fieldsToIgnore);
+ assertEquals("application/warc", metadataList.get(0).get(Metadata.CONTENT_TYPE));
+ assertEquals("application/warc+gz", gzMetadataList.get(0).get(Metadata.CONTENT_TYPE));
}
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/resources/test-documents/testWARC_multiple.warc b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/resources/test-documents/testWARC_multiple.warc
new file mode 100644
index 000000000..e0bdf7e24
Binary files /dev/null and b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/resources/test-documents/testWARC_multiple.warc differ
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/resources/test-documents/testWARC_multiple.warc.gz b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/resources/test-documents/testWARC_multiple.warc.gz
new file mode 100644
index 000000000..4a5dcbf5b
Binary files /dev/null and b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/resources/test-documents/testWARC_multiple.warc.gz differ
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/detect/TestDetectorLoading.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/detect/TestDetectorLoading.java
index a95212944..82a9e7df9 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/detect/TestDetectorLoading.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/detect/TestDetectorLoading.java
@@ -32,11 +32,13 @@ public class TestDetectorLoading {
//integration test
Detector detector = TikaConfig.getDefaultConfig().getDetector();
List<Detector> detectors = ((CompositeDetector) detector).getDetectors();
- assertEquals(6, detectors.size());
+ assertEquals(7, detectors.size());
assertEquals("org.gagravarr.tika.OggDetector", detectors.get(0).getClass().getName());
+ assertEquals("org.apache.tika.detect.gzip.GZipSpecializationDetector",
+ detectors.get(2).getClass().getName());
assertEquals("org.apache.tika.detect.microsoft.POIFSContainerDetector",
- detectors.get(2).getClass().getName());
- assertEquals("org.apache.tika.mime.MimeTypes", detectors.get(5).getClass().getName());
+ detectors.get(3).getClass().getName());
+ assertEquals("org.apache.tika.mime.MimeTypes", detectors.get(6).getClass().getName());
}
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pkg/GzipParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pkg/GzipParserTest.java
index fba465882..0bc80263a 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pkg/GzipParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pkg/GzipParserTest.java
@@ -82,9 +82,17 @@ public class GzipParserTest extends AbstractPkgTest {
}
@Test
- public void testDecompressConcatenatedDefault() throws Exception {
+ public void testDecompressConcatenated() throws Exception {
+ //test default
+ assertEquals(2, getRecursiveMetadata("multiple.gz").size());
+
+ //test config
+ TikaConfig tikaConfig = null;
+ try (InputStream is = getResourceAsStream("/configs/tika-config-multiple-gz.xml")) {
+ tikaConfig = new TikaConfig(is);
+ }
assertContains("<p>ab</p>",
- getRecursiveMetadata("multiple.gz").get(1)
+ getRecursiveMetadata("multiple.gz", new AutoDetectParser(tikaConfig)).get(1)
.get(TikaCoreProperties.TIKA_CONTENT));
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-multiple-gz.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-multiple-gz.xml
new file mode 100644
index 000000000..370532af4
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-multiple-gz.xml
@@ -0,0 +1,29 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<properties>
+ <parsers>
+ <parser class="org.apache.tika.parser.DefaultParser">
+ <parser-exclude class="org.apache.tika.parser.pkg.CompressorParser"/>
+ </parser>
+ <parser class="org.apache.tika.parser.pkg.CompressorParser">
+ <params>
+ <param name="decompressConcatenated" type="bool">true</param>
+ </params>
+ </parser>
+ </parsers>
+</properties>