You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2018/03/07 00:54:30 UTC
[tika] 01/02: TIKA-2576 -- Upgrade commons compress and add
detection and parsing of zstd (if user provides
com.github.luben:zstd-jni) via Andreas Meier
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git
commit be6e95d45bdfc40f35e93b26e45533d0a78ebd48
Author: tballison <ta...@mitre.org>
AuthorDate: Tue Mar 6 15:50:06 2018 -0500
TIKA-2576 -- Upgrade commons compress and add detection and parsing of zstd (if user provides com.github.luben:zstd-jni) via Andreas Meier
---
CHANGES.txt | 3 +++
.../resources/org/apache/tika/mime/tika-mimetypes.xml | 9 ++++++++-
tika-parent/pom.xml | 2 +-
tika-parsers/pom.xml | 8 +++++++-
.../java/org/apache/tika/parser/pkg/CompressorParser.java | 8 +++++++-
.../src/test/java/org/apache/tika/mime/TestMimeTypes.java | 2 ++
.../org/apache/tika/parser/pkg/CompressorParserTest.java | 7 +++++++
.../src/test/resources/test-documents/testZSTD.zstd | Bin 0 -> 143 bytes
8 files changed, 35 insertions(+), 4 deletions(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index 7b78929..d553961 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,8 @@
Release 1.18 - ???
+ * Add detection and parsing of zstd (if user provides
+ com.github.luben:zstd-jni) via Andreas Meier (TIKA-2576)
+
* Allow for RFC822 detection for files starting with "dkim-"
and/or "x-" via Andreas Meier (TIKA-2578 and TIKA-2587)
diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
index 7432a56..f6a8844 100644
--- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
+++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
@@ -3453,7 +3453,14 @@
<glob pattern="*.tgz" />
<glob pattern="*-gz" />
</mime-type>
-
+ <mime-type type="application/zstd">
+ <_comment>https://en.wikipedia.org/wiki/Zstandard</_comment>
+ <_comment>https://tools.ietf.org/id/draft-kucherawy-dispatch-zstd-01.html</_comment>
+ <magic priority="50">
+ <match value="0xFD2FB528" type="little32" offset="0"/>
+ </magic>
+ <glob pattern="*.zstd"/>
+ </mime-type>
<mime-type type="application/x-hdf">
<_comment>Hierarchical Data Format File</_comment>
<magic priority="50">
diff --git a/tika-parent/pom.xml b/tika-parent/pom.xml
index 2cbcf4e..03c8ea0 100644
--- a/tika-parent/pom.xml
+++ b/tika-parent/pom.xml
@@ -306,7 +306,7 @@
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<project.reporting.outputEncoding>${project.build.sourceEncoding}</project.reporting.outputEncoding>
<!-- NOTE: sync tukaani version with commons-compress in tika-parsers -->
- <commons.compress.version>1.14</commons.compress.version>
+ <commons.compress.version>1.16.1</commons.compress.version>
<commons.io.version>2.6</commons.io.version>
<gson.version>2.8.1</gson.version>
<cxf.version>3.0.16</cxf.version>
diff --git a/tika-parsers/pom.xml b/tika-parsers/pom.xml
index 271ec07..35787cb 100644
--- a/tika-parsers/pom.xml
+++ b/tika-parsers/pom.xml
@@ -39,7 +39,7 @@
<!-- NOTE: sync codec version with POI -->
<codec.version>1.10</codec.version>
<!-- NOTE: sync tukaani version with commons-compress in tika-parent-->
- <tukaani.version>1.6</tukaani.version>
+ <tukaani.version>1.8</tukaani.version>
<mime4j.version>0.8.1</mime4j.version>
<vorbis.version>0.8</vorbis.version>
<pdfbox.version>2.0.8</pdfbox.version>
@@ -150,6 +150,12 @@
<artifactId>xz</artifactId>
<version>${tukaani.version}</version>
</dependency>
+ <dependency>
+ <groupId>com.github.luben</groupId>
+ <artifactId>zstd-jni</artifactId>
+ <version>1.3.3-3</version>
+ <scope>provided</scope>
+ </dependency>
<dependency>
<groupId>commons-codec</groupId>
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java
index 48f8bec..ada7ec9 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java
@@ -75,10 +75,12 @@ public class CompressorParser extends AbstractParser {
private static final MediaType ZLIB = MediaType.application("zlib");
private static final MediaType LZMA = MediaType.application("x-lzma");
private static final MediaType LZ4_FRAMED = MediaType.application("x-lz4");
+ private static final MediaType ZSTD = MediaType.application("zstd");
+ private static final MediaType DEFLATE64= MediaType.application("deflate64");
private static final Set<MediaType> SUPPORTED_TYPES =
MediaType.set(BZIP, BZIP2, GZIP, GZIP_ALT, LZ4_FRAMED, COMPRESS,
- XZ, PACK, SNAPPY_FRAMED, ZLIB, LZMA);
+ XZ, PACK, SNAPPY_FRAMED, ZLIB, LZMA, ZSTD);
private int memoryLimitInKb = 100000;//100MB
@@ -141,6 +143,10 @@ public class CompressorParser extends AbstractParser {
return SNAPPY_RAW;
} else if (CompressorStreamFactory.LZMA.equals(name)) {
return LZMA;
+ } else if (CompressorStreamFactory.ZSTANDARD.equals(name)) {
+ return ZSTD;
+ } else if (CompressorStreamFactory.DEFLATE64.equals(name)) {
+ return DEFLATE64;
} else {
return MediaType.OCTET_STREAM;
}
diff --git a/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java b/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
index e76a7d5..bbb25e5 100644
--- a/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
+++ b/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
@@ -379,6 +379,8 @@ public class TestMimeTypes {
// For spanned zip files, the .zip file doesn't have the header, it's the other parts
assertTypeByData("application/octet-stream", "test-documents-spanned.zip");
assertTypeByData("application/zip", "test-documents-spanned.z01");
+
+ assertTypeDetection("testZSTD.zstd", "application/zstd");
}
@Test
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/pkg/CompressorParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/pkg/CompressorParserTest.java
index 444afc7..26552eb 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/pkg/CompressorParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/pkg/CompressorParserTest.java
@@ -42,6 +42,7 @@ public class CompressorParserTest extends TikaTest {
NOT_COVERED.add(MediaType.application("x-brotli"));
NOT_COVERED.add(MediaType.application("x-lz4-block"));
NOT_COVERED.add(MediaType.application("x-snappy-raw"));
+ NOT_COVERED.add(MediaType.application("deflate64"));
}
@Test
@@ -61,6 +62,12 @@ public class CompressorParserTest extends TikaTest {
}
@Test
+ public void testZstd() throws Exception {
+ XMLResult r = getXML("testZSTD.zstd");
+ assertContains("0123456789", r.xml);
+ }
+
+ @Test
public void testCoverage() throws Exception {
//test that the package parser covers all inputstreams handled
//by CompressorStreamFactory. When we update commons-compress, and they add
diff --git a/tika-parsers/src/test/resources/test-documents/testZSTD.zstd b/tika-parsers/src/test/resources/test-documents/testZSTD.zstd
new file mode 100644
index 0000000..f594f1a
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testZSTD.zstd differ
--
To stop receiving notification emails like this one, please contact
tallison@apache.org.