You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2023/05/31 14:17:26 UTC

[tika] branch TIKA-4048 created (now 738018ec6)

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a change to branch TIKA-4048
in repository https://gitbox.apache.org/repos/asf/tika.git


      at 738018ec6 TIKA-4048 -- change default decompressConcatenated to true in CompressorParser

This branch includes the following new commits:

     new 738018ec6 TIKA-4048 -- change default decompressConcatenated to true in CompressorParser

The 1 revision listed above as "new" is entirely new to this
repository and will be described in a separate email.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.



[tika] 01/01: TIKA-4048 -- change default decompressConcatenated to true in CompressorParser

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-4048
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 738018ec60829881250de251f870dffd4380665e
Author: tballison <ta...@apache.org>
AuthorDate: Wed May 31 10:17:15 2023 -0400

    TIKA-4048 -- change default decompressConcatenated to true in CompressorParser
---
 CHANGES.txt                                        |   3 ++
 .../apache/tika/parser/pkg/CompressorParser.java   |  13 ++++++-
 .../apache/tika/parser/pkg/tika-gzip-config.xml    |  29 ++++++++++++++++
 .../src/test/resources/test-documents/multiple.gz  | Bin 0 -> 46 bytes
 .../org/apache/tika/parser/pkg/GzipParserTest.java |  38 +++++++++++++++++++++
 5 files changed, 82 insertions(+), 1 deletion(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index 5526b5f86..732de46c4 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,8 @@
 Release 2.8.1 - ???
 
+   * Changed default decompressConcatenated to true in CompressorParser.
+     Users may revert to legacy behavior via tika-config.xml (TIKA-4048).
+
    * Fixed write limit bug in RecursiveParserWrapper (TIKA-4055).
 
    * Add mime detection for many files (TIKA-3992).
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java
index d01959bfc..d6df3c5ff 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java
@@ -86,6 +86,8 @@ public class CompressorParser extends AbstractParser {
     private static Set<MediaType> SUPPORTED_TYPES;
     private static Map<String, String> MIMES_TO_NAME;
 
+    private boolean decompressConcatenated = true;
+
     static {
         Set<MediaType> TMP_SET = new HashSet<>(MediaType
                 .set(BZIP, BZIP2, DEFLATE64, GZIP, GZIP_ALT, LZ4_FRAMED, COMPRESS, XZ, PACK,
@@ -177,7 +179,7 @@ public class CompressorParser extends AbstractParser {
         CompressorInputStream cis;
         try {
             CompressorParserOptions options =
-                    context.get(CompressorParserOptions.class, metadata1 -> false);
+                    context.get(CompressorParserOptions.class, metadata1 -> decompressConcatenated);
             CompressorStreamFactory factory =
                     new CompressorStreamFactory(options.decompressConcatenated(metadata),
                             memoryLimitInKb);
@@ -256,4 +258,13 @@ public class CompressorParser extends AbstractParser {
         return this.memoryLimitInKb;
     }
 
+    @Field
+    public void setDecompressConcatenated(boolean decompressConcatenated) {
+        this.decompressConcatenated = decompressConcatenated;
+    }
+
+    public boolean isDecompressConcatenated() {
+        return this.decompressConcatenated;
+    }
+
 }
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/resources/org/apache/tika/parser/pkg/tika-gzip-config.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/resources/org/apache/tika/parser/pkg/tika-gzip-config.xml
new file mode 100644
index 000000000..bbf3b6458
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/resources/org/apache/tika/parser/pkg/tika-gzip-config.xml
@@ -0,0 +1,29 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<properties>
+  <parsers>
+    <parser class="org.apache.tika.parser.DefaultParser">
+      <parser-exclude class="org.apache.tika.parser.pkg.CompressorParser"/>
+    </parser>
+    <parser class="org.apache.tika.parser.pkg.CompressorParser">
+      <params>
+        <param name="decompressConcatenated" type="bool">false</param>
+      </params>
+    </parser>
+  </parsers>
+</properties>
\ No newline at end of file
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/resources/test-documents/multiple.gz b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/resources/test-documents/multiple.gz
new file mode 100644
index 000000000..f5fd0675e
Binary files /dev/null and b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/resources/test-documents/multiple.gz differ
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pkg/GzipParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pkg/GzipParserTest.java
index fa2421b1f..fba465882 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pkg/GzipParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pkg/GzipParserTest.java
@@ -23,7 +23,12 @@ import java.io.InputStream;
 import org.junit.jupiter.api.Test;
 import org.xml.sax.ContentHandler;
 
+import org.apache.tika.config.TikaConfig;
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
 import org.apache.tika.sax.BodyContentHandler;
 
 /**
@@ -76,4 +81,37 @@ public class GzipParserTest extends AbstractPkgTest {
         assertContains("Test SVG image", content);
     }
 
+    @Test
+    public void testDecompressConcatenatedDefault() throws Exception {
+        assertContains("<p>ab</p>",
+                getRecursiveMetadata("multiple.gz").get(1)
+                        .get(TikaCoreProperties.TIKA_CONTENT));
+    }
+
+    @Test
+    public void testDecompressConcatenatedOffInParseContext() throws Exception {
+        ParseContext parseContext = new ParseContext();
+        parseContext.set(CompressorParserOptions.class, new CompressorParserOptions() {
+            @Override
+            public boolean decompressConcatenated(Metadata metadata) {
+                return false;
+            }
+        });
+        assertContains("<p>a</p>",
+                getRecursiveMetadata("multiple.gz", parseContext).get(1)
+                        .get(TikaCoreProperties.TIKA_CONTENT));
+    }
+
+    @Test
+    public void testDecompressConcatenatedOffInTikaConfig() throws Exception {
+
+        TikaConfig tikaConfig = null;
+        try (InputStream is = getResourceAsStream("tika-gzip-config.xml")) {
+            tikaConfig = new TikaConfig(is);
+        }
+        Parser p = new AutoDetectParser(tikaConfig);
+        assertContains("<p>a</p>",
+                getRecursiveMetadata("multiple.gz", p).get(1)
+                        .get(TikaCoreProperties.TIKA_CONTENT));
+    }
 }