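You are viewing a plain text version of this content. The hyperlink to the canonical version was not preserved in this plain-text copy; see the commits@tika.apache.org mailing list archive for the original message.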
Posted to commits@tika.apache.org by ta...@apache.org on 2023/05/31 14:17:27 UTC
[tika] 01/01: TIKA-4048 -- change default decompressConcatenated to true in CompressorParser
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch TIKA-4048
in repository https://gitbox.apache.org/repos/asf/tika.git
commit 738018ec60829881250de251f870dffd4380665e
Author: tballison <ta...@apache.org>
AuthorDate: Wed May 31 10:17:15 2023 -0400
TIKA-4048 -- change default decompressConcatenated to true in CompressorParser
---
CHANGES.txt | 3 ++
.../apache/tika/parser/pkg/CompressorParser.java | 13 ++++++-
.../apache/tika/parser/pkg/tika-gzip-config.xml | 29 ++++++++++++++++
.../src/test/resources/test-documents/multiple.gz | Bin 0 -> 46 bytes
.../org/apache/tika/parser/pkg/GzipParserTest.java | 38 +++++++++++++++++++++
5 files changed, 82 insertions(+), 1 deletion(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index 5526b5f86..732de46c4 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,8 @@
Release 2.8.1 - ???
+ * Changed default decompressConcatenated to true in CompressorParser.
+ Users may revert to legacy behavior via tika-config.xml (TIKA-4048).
+
* Fixed write limit bug in RecursiveParserWrapper (TIKA-4055).
* Add mime detection for many files (TIKA-3992).
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java
index d01959bfc..d6df3c5ff 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java
@@ -86,6 +86,8 @@ public class CompressorParser extends AbstractParser {
private static Set<MediaType> SUPPORTED_TYPES;
private static Map<String, String> MIMES_TO_NAME;
+ private boolean decompressConcatenated = true;
+
static {
Set<MediaType> TMP_SET = new HashSet<>(MediaType
.set(BZIP, BZIP2, DEFLATE64, GZIP, GZIP_ALT, LZ4_FRAMED, COMPRESS, XZ, PACK,
@@ -177,7 +179,7 @@ public class CompressorParser extends AbstractParser {
CompressorInputStream cis;
try {
CompressorParserOptions options =
- context.get(CompressorParserOptions.class, metadata1 -> false);
+ context.get(CompressorParserOptions.class, metadata1 -> decompressConcatenated);
CompressorStreamFactory factory =
new CompressorStreamFactory(options.decompressConcatenated(metadata),
memoryLimitInKb);
@@ -256,4 +258,13 @@ public class CompressorParser extends AbstractParser {
return this.memoryLimitInKb;
}
+ @Field
+ public void setDecompressConcatenated(boolean decompressConcatenated) {
+ this.decompressConcatenated = decompressConcatenated;
+ }
+
+ public boolean isDecompressConcatenated() {
+ return this.decompressConcatenated;
+ }
+
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/resources/org/apache/tika/parser/pkg/tika-gzip-config.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/resources/org/apache/tika/parser/pkg/tika-gzip-config.xml
new file mode 100644
index 000000000..bbf3b6458
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/resources/org/apache/tika/parser/pkg/tika-gzip-config.xml
@@ -0,0 +1,29 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<properties>
+ <parsers>
+ <parser class="org.apache.tika.parser.DefaultParser">
+ <parser-exclude class="org.apache.tika.parser.pkg.CompressorParser"/>
+ </parser>
+ <parser class="org.apache.tika.parser.pkg.CompressorParser">
+ <params>
+ <param name="decompressConcatenated" type="bool">false</param>
+ </params>
+ </parser>
+ </parsers>
+</properties>
\ No newline at end of file
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/resources/test-documents/multiple.gz b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/resources/test-documents/multiple.gz
new file mode 100644
index 000000000..f5fd0675e
Binary files /dev/null and b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/resources/test-documents/multiple.gz differ
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pkg/GzipParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pkg/GzipParserTest.java
index fa2421b1f..fba465882 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pkg/GzipParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pkg/GzipParserTest.java
@@ -23,7 +23,12 @@ import java.io.InputStream;
import org.junit.jupiter.api.Test;
import org.xml.sax.ContentHandler;
+import org.apache.tika.config.TikaConfig;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
/**
@@ -76,4 +81,37 @@ public class GzipParserTest extends AbstractPkgTest {
assertContains("Test SVG image", content);
}
+ @Test
+ public void testDecompressConcatenatedDefault() throws Exception {
+ assertContains("<p>ab</p>",
+ getRecursiveMetadata("multiple.gz").get(1)
+ .get(TikaCoreProperties.TIKA_CONTENT));
+ }
+
+ @Test
+ public void testDecompressConcatenatedOffInParseContext() throws Exception {
+ ParseContext parseContext = new ParseContext();
+ parseContext.set(CompressorParserOptions.class, new CompressorParserOptions() {
+ @Override
+ public boolean decompressConcatenated(Metadata metadata) {
+ return false;
+ }
+ });
+ assertContains("<p>a</p>",
+ getRecursiveMetadata("multiple.gz", parseContext).get(1)
+ .get(TikaCoreProperties.TIKA_CONTENT));
+ }
+
+ @Test
+ public void testDecompressConcatenatedOffInTikaConfig() throws Exception {
+
+ TikaConfig tikaConfig = null;
+ try (InputStream is = getResourceAsStream("tika-gzip-config.xml")) {
+ tikaConfig = new TikaConfig(is);
+ }
+ Parser p = new AutoDetectParser(tikaConfig);
+ assertContains("<p>a</p>",
+ getRecursiveMetadata("multiple.gz", p).get(1)
+ .get(TikaCoreProperties.TIKA_CONTENT));
+ }
}