You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2023/08/16 17:27:58 UTC

[tika] 01/01: TIKA-4048 -- revert change in default decompressConcatenated and add a gzip subtype detector for warc+gz

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-4048
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 46832d718eb1572d3cca4d7d18415afff336d4c6
Author: tallison <ta...@apache.org>
AuthorDate: Wed Aug 16 13:27:43 2023 -0400

    TIKA-4048 -- revert change in default decompressConcatenated and add a gzip subtype detector for warc+gz
---
 CHANGES.txt                                        |   3 +-
 .../org/apache/tika/mime/tika-mimetypes.xml        |   6 ++
 .../src/test/java/org/apache/tika/TikaTest.java    |  63 +++++++++++++++
 .../detect/gzip/GZipSpecializationDetector.java    |  90 +++++++++++++++++++++
 .../apache/tika/parser/pkg/CompressorParser.java   |   2 +-
 .../services/org.apache.tika.detect.Detector       |  16 ++++
 .../tika-parser-webarchive-module/pom.xml          |   8 +-
 .../org/apache/tika/parser/warc/WARCParser.java    |   5 +-
 .../apache/tika/parser/warc/WARCParserTest.java    |  23 +++++-
 .../test-documents/testWARC_multiple.warc          | Bin 0 -> 6773 bytes
 .../test-documents/testWARC_multiple.warc.gz       | Bin 0 -> 5907 bytes
 .../apache/tika/detect/TestDetectorLoading.java    |   8 +-
 .../org/apache/tika/parser/pkg/GzipParserTest.java |  12 ++-
 .../resources/configs/tika-config-multiple-gz.xml  |  29 +++++++
 14 files changed, 253 insertions(+), 12 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index 55bd83671..015a55e43 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -7,8 +7,7 @@ Release 2.8.1 - ???
 
    * Fix bug that led to duplicate extraction of macros from some OLE2 containers (TIKA-4116).
 
-   * Changed default decompressConcatenated to true in CompressorParser.
-     Users may revert to legacy behavior via tika-config.xml (TIKA-4048).
+   * Add detection of warc.gz as a specialization of gz and parse as if a standard WARC (TIKA-4048).
 
    * Allow users to modify the attachment limit size in the /unpack resource (TIKA-4039)
    
diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
index 362ace8c1..47203a163 100644
--- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
+++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
@@ -3212,6 +3212,12 @@
     <glob pattern="*.warc"/>
   </mime-type>
 
+  <mime-type type="application/warc+gz">
+    <acronym>WARC</acronym>
+    <_comment>WARC</_comment>
+    <glob pattern="*.warc.gz"/>
+  </mime-type>
+
   <mime-type type="application/wasm">
     <acronym>Wasm</acronym>
     <_comment>Web Assembly</_comment>
diff --git a/tika-core/src/test/java/org/apache/tika/TikaTest.java b/tika-core/src/test/java/org/apache/tika/TikaTest.java
index a00d7b2b0..c20229b59 100644
--- a/tika-core/src/test/java/org/apache/tika/TikaTest.java
+++ b/tika-core/src/test/java/org/apache/tika/TikaTest.java
@@ -16,6 +16,7 @@
  */
 package org.apache.tika;
 
+import static org.junit.jupiter.api.Assertions.assertArrayEquals;
 import static org.junit.jupiter.api.Assertions.assertEquals;
 import static org.junit.jupiter.api.Assertions.assertFalse;
 import static org.junit.jupiter.api.Assertions.assertTrue;
@@ -101,6 +102,33 @@ public abstract class TikaTest {
         assertFalse(haystack.contains(needle), needle + " unexpectedly found in:\n" + haystack);
     }
 
+    /**
+     * Asserts that both lists match item by item on every field not in fieldsToIgnore.
+     */
+    public static void assertMetadataListEquals(List<Metadata> metadataListA,
+            List<Metadata> metadataListB, Set<String> fieldsToIgnore) {
+        assertEquals(metadataListA.size(), metadataListB.size(), "different sizes");
+        for (int idx = 0; idx < metadataListA.size(); idx++) {
+            Metadata left = metadataListA.get(idx);
+            Metadata right = metadataListB.get(idx);
+            Set<String> leftNames = new HashSet<>();
+            for (String name : left.names()) {
+                if (!fieldsToIgnore.contains(name)) {
+                    leftNames.add(name);
+                    assertArrayEquals(left.getValues(name), right.getValues(name),
+                            "problem with " + name + " in metadata index=" + idx);
+                }
+            }
+            Set<String> rightNames = new HashSet<>();
+            for (String name : right.names()) {
+                if (!fieldsToIgnore.contains(name)) {
+                    rightNames.add(name);
+                }
+            }
+            assertEquals(leftNames, rightNames);
+        }
+    }
+
     /**
      * Test that in at least one item in metadataList, all keys and values
      * in minExpected are contained.
@@ -315,6 +343,14 @@ public abstract class TikaTest {
         return getRecursiveMetadata(filePath, new ParseContext());
     }
 
+    //Convenience overload: auto-detect parser, fresh Metadata and ParseContext, suppressException=true.
+    protected List<Metadata> getRecursiveMetadata(String filePath,
+                                                  BasicContentHandlerFactory.HANDLER_TYPE handlerType)
+            throws Exception {
+        return getRecursiveMetadata(filePath, AUTO_DETECT_PARSER, new Metadata(),
+                new ParseContext(), true, handlerType);
+    }
+
     protected List<Metadata> getRecursiveMetadata(String filePath, Metadata metadata)
             throws Exception {
         return getRecursiveMetadata(filePath, metadata, new ParseContext());
@@ -340,6 +376,16 @@ public abstract class TikaTest {
         }
     }
 
+    //Opens filePath under /test-documents/ and delegates to the InputStream overload.
+    protected List<Metadata> getRecursiveMetadata(String filePath, Parser wrapped,
+            Metadata metadata, ParseContext context, boolean suppressException,
+            BasicContentHandlerFactory.HANDLER_TYPE handlerType) throws Exception {
+        try (InputStream stream = getResourceAsStream("/test-documents/" + filePath)) {
+            return getRecursiveMetadata(stream, wrapped, metadata, context, suppressException,
+                    handlerType);
+        }
+    }
+
     protected List<Metadata> getRecursiveMetadata(Path path, ParseContext context,
                                                   boolean suppressException) throws Exception {
         Metadata metadata = new Metadata();
@@ -406,6 +452,23 @@ public abstract class TikaTest {
         return handler.getMetadataList();
     }
 
+    //Parses is with p wrapped in a RecursiveParserWrapper; rethrows only if !suppressException.
+    protected List<Metadata> getRecursiveMetadata(InputStream is, Parser p, Metadata metadata,
+            ParseContext context, boolean suppressException,
+            BasicContentHandlerFactory.HANDLER_TYPE handlerType) throws Exception {
+        RecursiveParserWrapper recursiveParser = new RecursiveParserWrapper(p);
+        BasicContentHandlerFactory factory = new BasicContentHandlerFactory(handlerType, -1);
+        RecursiveParserWrapperHandler collector = new RecursiveParserWrapperHandler(factory);
+        try {
+            recursiveParser.parse(is, collector, metadata, context);
+        } catch (Exception e) {
+            if (!suppressException) {
+                throw e;
+            }
+        }
+        return collector.getMetadataList();
+    }
+
     protected List<Metadata> getRecursiveMetadata(String filePath, ParseContext context)
             throws Exception {
         RecursiveParserWrapper wrapper = new RecursiveParserWrapper(AUTO_DETECT_PARSER);
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/detect/gzip/GZipSpecializationDetector.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/detect/gzip/GZipSpecializationDetector.java
new file mode 100644
index 000000000..e3d743ad3
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/detect/gzip/GZipSpecializationDetector.java
@@ -0,0 +1,90 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.detect.gzip;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.StandardCharsets;
+import java.util.zip.GZIPInputStream;
+
+import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
+import org.apache.commons.compress.utils.IOUtils;
+import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream;
+import org.apache.commons.io.output.UnsynchronizedByteArrayOutputStream;
+
+import org.apache.tika.detect.Detector;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+
+/**
+ * Detects gzip streams and refines the type for known gzipped formats (currently only
+ * warc.gz).  This is a first step; tar.gz, svg.gz and others still need to be implemented.
+ */
+public class GZipSpecializationDetector implements Detector {
+    public static final MediaType GZ = MediaType.application("gzip");
+    public static final MediaType WARC_GZ = MediaType.application("warc+gz");
+
+    @Override
+    public MediaType detect(InputStream input, Metadata metadata) throws IOException {
+        if (input == null) {
+            return MediaType.OCTET_STREAM;
+        }
+        input.mark(2);
+        byte[] firstTwo = new byte[2];
+        try {
+            IOUtils.readFully(input, firstTwo);
+        } finally {
+            input.reset();
+        }
+        //assemble the first two bytes little-endian to compare against GZIP_MAGIC
+        int magic = ((firstTwo[1] & 0xff) << 8) | (firstTwo[0] & 0xff);
+        if (GZIPInputStream.GZIP_MAGIC != magic) {
+            return MediaType.OCTET_STREAM;
+        }
+        return detectSpecialization(input, metadata);
+    }
+
+    private MediaType detectSpecialization(InputStream input, Metadata metadata) throws IOException {
+        //re-mark for the full sniff: the caller's mark(2) need not survive a 1024 byte read
+        int buffSize = 1024;
+        UnsynchronizedByteArrayOutputStream gzippedBytes = new UnsynchronizedByteArrayOutputStream();
+        input.mark(buffSize);
+        try {
+            IOUtils.copyRange(input, buffSize, gzippedBytes);
+        } catch (IOException e) {
+            //best effort: sniff whatever was read before the failure
+        } finally {
+            input.reset();
+        }
+        UnsynchronizedByteArrayOutputStream bytes = new UnsynchronizedByteArrayOutputStream();
+        try (InputStream is = new
+                     GzipCompressorInputStream(new UnsynchronizedByteArrayInputStream(gzippedBytes.toByteArray()))) {
+            //read bytes one at a time to avoid premature EOF from buffering
+            int c;
+            while ((c = is.read()) > -1) {
+                bytes.write(c);
+            }
+        } catch (IOException e) {
+            //the sniffed gzip data may be cut off at buffSize; keep what was decompressed
+        }
+        //TODO: something better than this
+        String s = new String(bytes.toByteArray(), StandardCharsets.UTF_8);
+        if (s.startsWith("WARC/")) {
+            return WARC_GZ;
+        }
+        return GZ;
+    }
+}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java
index 6b42250b3..77f5b9647 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java
@@ -86,7 +86,7 @@ public class CompressorParser extends AbstractParser {
     private static Set<MediaType> SUPPORTED_TYPES;
     private static Map<String, String> MIMES_TO_NAME;
 
-    private boolean decompressConcatenated = true;
+    private boolean decompressConcatenated = false;
 
     static {
         Set<MediaType> TMP_SET = new HashSet<>(MediaType
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/resources/META-INF/services/org.apache.tika.detect.Detector b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/resources/META-INF/services/org.apache.tika.detect.Detector
new file mode 100644
index 000000000..a5d143217
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/resources/META-INF/services/org.apache.tika.detect.Detector
@@ -0,0 +1,16 @@
+#  Licensed to the Apache Software Foundation (ASF) under one or more
+#  contributor license agreements.  See the NOTICE file distributed with
+#  this work for additional information regarding copyright ownership.
+#  The ASF licenses this file to You under the Apache License, Version 2.0
+#  (the "License"); you may not use this file except in compliance with
+#  the License.  You may obtain a copy of the License at
+#
+#       http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+org.apache.tika.detect.gzip.GZipSpecializationDetector
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/pom.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/pom.xml
index c3743cb6f..d194f190e 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/pom.xml
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/pom.xml
@@ -38,7 +38,7 @@
       <groupId>org.apache.commons</groupId>
       <artifactId>commons-compress</artifactId>
     </dependency>
-    <!-- need these for detection/ungzipping and html parsing in tests -->
+    <!-- need these for detection/gunzipping and html+txt parsing in tests -->
     <dependency>
       <groupId>${project.groupId}</groupId>
       <artifactId>tika-parser-html-module</artifactId>
@@ -51,6 +51,12 @@
       <version>${project.version}</version>
       <scope>test</scope>
     </dependency>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-parser-text-module</artifactId>
+      <version>${project.version}</version>
+      <scope>test</scope>
+    </dependency>
   </dependencies>
 
   <build>
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/main/java/org/apache/tika/parser/warc/WARCParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/main/java/org/apache/tika/parser/warc/WARCParser.java
index 7e1dcc17a..8025f6643 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/main/java/org/apache/tika/parser/warc/WARCParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/main/java/org/apache/tika/parser/warc/WARCParser.java
@@ -52,7 +52,8 @@ import org.apache.tika.utils.StringUtils;
 public class WARCParser extends AbstractParser {
 
     private static final Set<MediaType> SUPPORTED_TYPES = Collections.unmodifiableSet(
-            new HashSet<>(Arrays.asList(MediaType.application("warc"))));
+            new HashSet<>(Arrays.asList(MediaType.application("warc"),
+                    MediaType.application("warc+gz"))));
 
     public static String WARC_PREFIX = "warc:";
     public static String WARC_HTTP_PREFIX = WARC_PREFIX + "http:";
@@ -137,6 +138,8 @@ public class WARCParser extends AbstractParser {
         metadata.set(Metadata.CONTENT_LENGTH, Long.toString(payload.body().size()));
 
         if (embeddedDocumentExtractor.shouldParseEmbedded(metadata)) {
+            //TODO check Content-Encoding on the warcResponse.http.headers and wrap the stream.
+            //May need to sniff first few bytes to confirm accuracy, e.g. gzip compression ?
             try (InputStream tis = TikaInputStream.get(payload.body().stream())) {
                 embeddedDocumentExtractor.parseEmbedded(tis, xhtml, metadata, true);
             }
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/java/org/apache/tika/parser/warc/WARCParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/java/org/apache/tika/parser/warc/WARCParserTest.java
index e37203ce6..c92f8ec15 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/java/org/apache/tika/parser/warc/WARCParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/java/org/apache/tika/parser/warc/WARCParserTest.java
@@ -18,13 +18,16 @@ package org.apache.tika.parser.warc;
 
 import static org.junit.jupiter.api.Assertions.assertEquals;
 
+import java.util.HashSet;
 import java.util.List;
+import java.util.Set;
 
 import org.junit.jupiter.api.Test;
 
 import org.apache.tika.TikaTest;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.sax.BasicContentHandlerFactory;
 
 public class WARCParserTest extends TikaTest {
 
@@ -35,14 +38,30 @@ public class WARCParserTest extends TikaTest {
     public void testBasic() throws Exception {
 
         List<Metadata> metadataList = getRecursiveMetadata("cc.warc.gz");
-        assertEquals(3, metadataList.size());
+        assertEquals(2, metadataList.size());
+        assertEquals("application/warc+gz", metadataList.get(0).get(Metadata.CONTENT_TYPE));
         assertContains("text/html", metadataList.get(1).get(Metadata.CONTENT_TYPE));
         assertContains("Common Crawl on Twitter", metadataList.get(1).get(TikaCoreProperties.TIKA_CONTENT));
-        assertEquals("application/warc", metadataList.get(2).get(Metadata.CONTENT_TYPE));
         assertEquals("<urn:uuid:c3f02271-44d2-4159-9cdb-3e3efeb16ba0>",
                 metadataList.get(1).get("warc:WARC-Warcinfo-ID"));
         assertEquals("http://commoncrawl.org/",
                 metadataList.get(1).get("warc:WARC-Target-URI"));
+    }
+
+    @Test
+    public void testMultipleRecords() throws Exception {
+        //TIKA-4048: a gzipped WARC should yield the same metadata as the plain WARC
+        List<Metadata> metadataList = getRecursiveMetadata("testWARC_multiple.warc",
+                BasicContentHandlerFactory.HANDLER_TYPE.TEXT);
+        List<Metadata> gzMetadataList = getRecursiveMetadata("testWARC_multiple.warc.gz",
+                BasicContentHandlerFactory.HANDLER_TYPE.TEXT);
+
+        Set<String> fieldsToIgnore = new HashSet<>();
+        fieldsToIgnore.add("X-TIKA:parse_time_millis");
+        fieldsToIgnore.add("Content-Type");
+        assertMetadataListEquals(metadataList, gzMetadataList, fieldsToIgnore);
 
+        assertEquals("application/warc", metadataList.get(0).get(Metadata.CONTENT_TYPE));
+        assertEquals("application/warc+gz", gzMetadataList.get(0).get(Metadata.CONTENT_TYPE));
     }
 }
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/resources/test-documents/testWARC_multiple.warc b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/resources/test-documents/testWARC_multiple.warc
new file mode 100644
index 000000000..e0bdf7e24
Binary files /dev/null and b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/resources/test-documents/testWARC_multiple.warc differ
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/resources/test-documents/testWARC_multiple.warc.gz b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/resources/test-documents/testWARC_multiple.warc.gz
new file mode 100644
index 000000000..4a5dcbf5b
Binary files /dev/null and b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/test/resources/test-documents/testWARC_multiple.warc.gz differ
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/detect/TestDetectorLoading.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/detect/TestDetectorLoading.java
index a95212944..82a9e7df9 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/detect/TestDetectorLoading.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/detect/TestDetectorLoading.java
@@ -32,11 +32,13 @@ public class TestDetectorLoading {
         //integration test
         Detector detector = TikaConfig.getDefaultConfig().getDetector();
         List<Detector> detectors = ((CompositeDetector) detector).getDetectors();
-        assertEquals(6, detectors.size());
+        assertEquals(7, detectors.size());
         assertEquals("org.gagravarr.tika.OggDetector", detectors.get(0).getClass().getName());
+        assertEquals("org.apache.tika.detect.gzip.GZipSpecializationDetector",
+                detectors.get(2).getClass().getName());
 
         assertEquals("org.apache.tika.detect.microsoft.POIFSContainerDetector",
-                detectors.get(2).getClass().getName());
-        assertEquals("org.apache.tika.mime.MimeTypes", detectors.get(5).getClass().getName());
+                detectors.get(3).getClass().getName());
+        assertEquals("org.apache.tika.mime.MimeTypes", detectors.get(6).getClass().getName());
     }
 }
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pkg/GzipParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pkg/GzipParserTest.java
index fba465882..0bc80263a 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pkg/GzipParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pkg/GzipParserTest.java
@@ -82,9 +82,17 @@ public class GzipParserTest extends AbstractPkgTest {
     }
 
     @Test
-    public void testDecompressConcatenatedDefault() throws Exception {
+    public void testDecompressConcatenated() throws Exception {
+        //test default
+        assertEquals(2, getRecursiveMetadata("multiple.gz").size());
+
+        //test config
+        TikaConfig tikaConfig = null;
+        try (InputStream is = getResourceAsStream("/configs/tika-config-multiple-gz.xml")) {
+            tikaConfig = new TikaConfig(is);
+        }
         assertContains("<p>ab</p>",
-                getRecursiveMetadata("multiple.gz").get(1)
+                getRecursiveMetadata("multiple.gz", new AutoDetectParser(tikaConfig)).get(1)
                         .get(TikaCoreProperties.TIKA_CONTENT));
     }
 
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-multiple-gz.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-multiple-gz.xml
new file mode 100644
index 000000000..370532af4
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-multiple-gz.xml
@@ -0,0 +1,29 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<properties>
+  <parsers>
+    <parser class="org.apache.tika.parser.DefaultParser">
+      <parser-exclude class="org.apache.tika.parser.pkg.CompressorParser"/>
+    </parser>
+    <parser class="org.apache.tika.parser.pkg.CompressorParser">
+      <params>
+        <param name="decompressConcatenated" type="bool">true</param>
+      </params>
+    </parser>
+  </parsers>
+</properties>