You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2023/05/31 14:17:27 UTC

[tika] 01/01: TIKA-4048 -- change default decompressConcatenated to true in CompressorParser

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-4048
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 738018ec60829881250de251f870dffd4380665e
Author: tballison <ta...@apache.org>
AuthorDate: Wed May 31 10:17:15 2023 -0400

    TIKA-4048 -- change default decompressConcatenated to true in CompressorParser
---
 CHANGES.txt                                        |   3 ++
 .../apache/tika/parser/pkg/CompressorParser.java   |  13 ++++++-
 .../apache/tika/parser/pkg/tika-gzip-config.xml    |  29 ++++++++++++++++
 .../src/test/resources/test-documents/multiple.gz  | Bin 0 -> 46 bytes
 .../org/apache/tika/parser/pkg/GzipParserTest.java |  38 +++++++++++++++++++++
 5 files changed, 82 insertions(+), 1 deletion(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index 5526b5f86..732de46c4 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,8 @@
 Release 2.8.1 - ???
 
+   * Changed default decompressConcatenated to true in CompressorParser.
+     Users may revert to legacy behavior via tika-config.xml (TIKA-4048).
+
    * Fixed write limit bug in RecursiveParserWrapper (TIKA-4055).
 
    * Add mime detection for many files (TIKA-3992).
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java
index d01959bfc..d6df3c5ff 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java
@@ -86,6 +86,8 @@ public class CompressorParser extends AbstractParser {
     private static Set<MediaType> SUPPORTED_TYPES;
     private static Map<String, String> MIMES_TO_NAME;
 
+    private boolean decompressConcatenated = true;
+
     static {
         Set<MediaType> TMP_SET = new HashSet<>(MediaType
                 .set(BZIP, BZIP2, DEFLATE64, GZIP, GZIP_ALT, LZ4_FRAMED, COMPRESS, XZ, PACK,
@@ -177,7 +179,7 @@ public class CompressorParser extends AbstractParser {
         CompressorInputStream cis;
         try {
             CompressorParserOptions options =
-                    context.get(CompressorParserOptions.class, metadata1 -> false);
+                    context.get(CompressorParserOptions.class, metadata1 -> decompressConcatenated);
             CompressorStreamFactory factory =
                     new CompressorStreamFactory(options.decompressConcatenated(metadata),
                             memoryLimitInKb);
@@ -256,4 +258,13 @@ public class CompressorParser extends AbstractParser {
         return this.memoryLimitInKb;
     }
 
+    @Field // lets tika-config.xml set this through the decompressConcatenated param
+    public void setDecompressConcatenated(boolean decompressConcatenated) {
+        this.decompressConcatenated = decompressConcatenated; // field defaults to true
+    }
+
+    public boolean isDecompressConcatenated() { // parser-level setting (default true)
+        return this.decompressConcatenated;
+    }
+
 }
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/resources/org/apache/tika/parser/pkg/tika-gzip-config.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/resources/org/apache/tika/parser/pkg/tika-gzip-config.xml
new file mode 100644
index 000000000..bbf3b6458
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/resources/org/apache/tika/parser/pkg/tika-gzip-config.xml
@@ -0,0 +1,29 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<properties>
+  <parsers> <!-- CompressorParser is excluded from DefaultParser and declared separately below -->
+    <parser class="org.apache.tika.parser.DefaultParser">
+      <parser-exclude class="org.apache.tika.parser.pkg.CompressorParser"/>
+    </parser>
+    <parser class="org.apache.tika.parser.pkg.CompressorParser">
+      <params> <!-- turns decompressConcatenated off; the parser's own default is true (TIKA-4048) -->
+        <param name="decompressConcatenated" type="bool">false</param>
+      </params>
+    </parser>
+  </parsers>
+</properties>
\ No newline at end of file
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/resources/test-documents/multiple.gz b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/resources/test-documents/multiple.gz
new file mode 100644
index 000000000..f5fd0675e
Binary files /dev/null and b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/resources/test-documents/multiple.gz differ
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pkg/GzipParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pkg/GzipParserTest.java
index fa2421b1f..fba465882 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pkg/GzipParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pkg/GzipParserTest.java
@@ -23,7 +23,12 @@ import java.io.InputStream;
 import org.junit.jupiter.api.Test;
 import org.xml.sax.ContentHandler;
 
+import org.apache.tika.config.TikaConfig;
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
 import org.apache.tika.sax.BodyContentHandler;
 
 /**
@@ -76,4 +81,37 @@ public class GzipParserTest extends AbstractPkgTest {
         assertContains("Test SVG image", content);
     }
 
+    @Test // no custom config and no ParseContext override: exercises the parser default
+    public void testDecompressConcatenatedDefault() throws Exception {
+        assertContains("<p>ab</p>", // presumably a and b are separate gzip members; confirm
+                getRecursiveMetadata("multiple.gz").get(1) // presumably the gunzipped child
+                        .get(TikaCoreProperties.TIKA_CONTENT));
+    }
+
+    @Test // options supplied through the ParseContext switch concatenated handling off
+    public void testDecompressConcatenatedOffInParseContext() throws Exception {
+        ParseContext parseContext = new ParseContext();
+        parseContext.set(CompressorParserOptions.class, new CompressorParserOptions() {
+            @Override
+            public boolean decompressConcatenated(Metadata metadata) {
+                return false; // same answer regardless of the metadata passed in
+            }
+        });
+        assertContains("<p>a</p>", // just a here, whereas the default test expects ab
+                getRecursiveMetadata("multiple.gz", parseContext).get(1)
+                        .get(TikaCoreProperties.TIKA_CONTENT));
+    }
+
+    @Test // tika-gzip-config.xml sets decompressConcatenated to false on CompressorParser
+    public void testDecompressConcatenatedOffInTikaConfig() throws Exception {
+
+        TikaConfig tikaConfig = null;
+        try (InputStream is = getResourceAsStream("tika-gzip-config.xml")) {
+            tikaConfig = new TikaConfig(is);
+        }
+        Parser p = new AutoDetectParser(tikaConfig); // parser built from the custom config
+        assertContains("<p>a</p>", // just a here, whereas the default test expects ab
+                getRecursiveMetadata("multiple.gz", p).get(1)
+                        .get(TikaCoreProperties.TIKA_CONTENT));
+    }
 }