You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2018/03/29 19:15:24 UTC
[tika] 04/04: TIKA-2621 -- add support for brotli
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git
commit 04225d2834104c973e6cff421c283af876b2e398
Author: tballison <ta...@mitre.org>
AuthorDate: Thu Mar 29 13:49:59 2018 -0400
TIKA-2621 -- add support for brotli
---
.../org/apache/tika/mime/tika-mimetypes.xml | 14 ++++
.../src/test/java/org/apache/tika/TikaTest.java | 14 ++++
tika-parsers/pom.xml | 7 ++
.../apache/tika/parser/pkg/CompressorParser.java | 82 +++++++++++++++++++--
.../tika/parser/pkg/CompressorParserTest.java | 22 +++++-
.../test-documents/testBROTLI_compressed.br | Bin 0 -> 12 bytes
6 files changed, 130 insertions(+), 9 deletions(-)
diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
index 346eb73..634d9d1 100644
--- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
+++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
@@ -3135,6 +3135,20 @@
<match value="bplist" type="string" offset="0"/>
</magic>
</mime-type>
+ <mime-type type="application/x-gtar">
+ <_comment>GNU tar Compressed File Archive (GNU Tape Archive)</_comment>
+ <magic priority="50">
+ <!-- GNU tar archive -->
+ <match value="ustar \0" type="string" offset="257" />
+ </magic>
+ <glob pattern="*.gtar"/>
+ <sub-class-of type="application/x-tar"/>
+ </mime-type>
+
+ <mime-type type="application/x-brotli">
+ <glob pattern="*.br" />
+ <glob pattern="*.brotli" />
+ </mime-type>
<mime-type type="application/x-bzip">
<magic priority="40">
diff --git a/tika-core/src/test/java/org/apache/tika/TikaTest.java b/tika-core/src/test/java/org/apache/tika/TikaTest.java
index 153a564..9c827f7 100644
--- a/tika-core/src/test/java/org/apache/tika/TikaTest.java
+++ b/tika-core/src/test/java/org/apache/tika/TikaTest.java
@@ -213,6 +213,20 @@ public abstract class TikaTest {
return getRecursiveMetadata(filePath, new ParseContext());
}
+ protected List<Metadata> getRecursiveMetadata(String filePath, Metadata metadata) throws Exception {
+ return getRecursiveMetadata(filePath, new ParseContext(), metadata);
+ }
+
+ protected List<Metadata> getRecursiveMetadata(String filePath, ParseContext context, Metadata metadata) throws Exception {
+ Parser p = new AutoDetectParser();
+ RecursiveParserWrapper wrapper = new RecursiveParserWrapper(p,
+ new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1));
+ try (InputStream is = getResourceAsStream("/test-documents/" + filePath)) {
+ wrapper.parse(is, new DefaultHandler(), metadata, context);
+ }
+ return wrapper.getMetadata();
+ }
+
protected List<Metadata> getRecursiveMetadata(String filePath, ParseContext context) throws Exception {
Parser p = new AutoDetectParser();
RecursiveParserWrapper wrapper = new RecursiveParserWrapper(p,
diff --git a/tika-parsers/pom.xml b/tika-parsers/pom.xml
index a3e9e4d..e6c7720 100644
--- a/tika-parsers/pom.xml
+++ b/tika-parsers/pom.xml
@@ -40,6 +40,8 @@
<codec.version>1.10</codec.version>
<!-- NOTE: sync tukaani version with commons-compress in tika-parent-->
<tukaani.version>1.8</tukaani.version>
+ <!-- NOTE: sync brotli version with commons-compress in tika-parent-->
+ <brotli.version>0.1.2</brotli.version>
<mime4j.version>0.8.1</mime4j.version>
<vorbis.version>0.8</vorbis.version>
<pdfbox.version>2.0.9</pdfbox.version>
@@ -151,6 +153,11 @@
<version>${tukaani.version}</version>
</dependency>
<dependency>
+ <groupId>org.brotli</groupId>
+ <artifactId>dec</artifactId>
+ <version>${brotli.version}</version>
+ </dependency>
+ <dependency>
<groupId>com.github.luben</groupId>
<artifactId>zstd-jni</artifactId>
<version>1.3.3-3</version>
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java
index ada7ec9..658d04c 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java
@@ -21,6 +21,10 @@ import static org.apache.tika.metadata.HttpHeaders.CONTENT_TYPE;
import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
import java.util.Set;
import org.apache.commons.compress.MemoryLimitException;
@@ -78,9 +82,47 @@ public class CompressorParser extends AbstractParser {
private static final MediaType ZSTD = MediaType.application("zstd");
private static final MediaType DEFLATE64= MediaType.application("deflate64");
- private static final Set<MediaType> SUPPORTED_TYPES =
- MediaType.set(BZIP, BZIP2, GZIP, GZIP_ALT, LZ4_FRAMED, COMPRESS,
- XZ, PACK, SNAPPY_FRAMED, ZLIB, LZMA, ZSTD);
+ private static Set<MediaType> SUPPORTED_TYPES;
+ private static Map<String, String> MIMES_TO_NAME;
+
+ static {
+ Set<MediaType> TMP_SET = new HashSet<>();
+ TMP_SET.addAll(
+ MediaType.set(BZIP, BZIP2, DEFLATE64, GZIP, GZIP_ALT, LZ4_FRAMED, COMPRESS,
+ XZ, PACK, SNAPPY_FRAMED, ZLIB, LZMA));
+ try {
+ Class.forName("org.brotli.dec.BrotliInputStream");
+ TMP_SET.add(BROTLI);
+ } catch (NoClassDefFoundError|ClassNotFoundException e) {
+ //swallow
+ }
+ try {
+ Class.forName("com.github.luben.zstd.ZstdInputStream");
+ TMP_SET.add(ZSTD);
+ } catch (NoClassDefFoundError|ClassNotFoundException e) {
+ //swallow
+ }
+ SUPPORTED_TYPES = Collections.unmodifiableSet(TMP_SET);
+ }
+
+ static {
+ //map the mime type strings to the compressor stream names
+ Map<String, String> tmpMimesToName = new HashMap<>();
+ tmpMimesToName.put(BZIP2.toString(), CompressorStreamFactory.BZIP2);
+ tmpMimesToName.put(GZIP.toString(), CompressorStreamFactory.GZIP);
+ tmpMimesToName.put(LZ4_FRAMED.toString(), CompressorStreamFactory.LZ4_FRAMED);
+ tmpMimesToName.put(LZ4_BLOCK.toString(), CompressorStreamFactory.LZ4_BLOCK);
+ tmpMimesToName.put(XZ.toString(), CompressorStreamFactory.XZ);
+ tmpMimesToName.put(PACK.toString(), CompressorStreamFactory.PACK200);
+ tmpMimesToName.put(SNAPPY_FRAMED.toString(), CompressorStreamFactory.SNAPPY_FRAMED);
+ tmpMimesToName.put(ZLIB.toString(), CompressorStreamFactory.DEFLATE);
+ tmpMimesToName.put(COMPRESS.toString(), CompressorStreamFactory.Z);
+ tmpMimesToName.put(LZMA.toString(), CompressorStreamFactory.LZMA);
+ tmpMimesToName.put(BROTLI.toString(), CompressorStreamFactory.BROTLI);
+ tmpMimesToName.put(ZSTD.toString(), CompressorStreamFactory.ZSTANDARD);
+ MIMES_TO_NAME = Collections.unmodifiableMap(tmpMimesToName);
+ }
+
private int memoryLimitInKb = 100000;//100MB
@@ -181,7 +223,19 @@ public class CompressorParser extends AbstractParser {
});
CompressorStreamFactory factory =
new CompressorStreamFactory(options.decompressConcatenated(metadata), memoryLimitInKb);
- cis = factory.createCompressorInputStream(stream);
+ //if we've already identified it via autodetect
+ //trust that and go with the appropriate name
+ //to avoid calling CompressorStreamFactory.detect() twice
+ String name = getStreamName(metadata);
+ if (name != null) {
+ cis = factory.createCompressorInputStream(name, stream);
+ } else {
+ cis = factory.createCompressorInputStream(stream);
+ MediaType type = getMediaType(cis);
+ if (!type.equals(MediaType.OCTET_STREAM)) {
+ metadata.set(CONTENT_TYPE, type.toString());
+ }
+ }
} catch (CompressorException e) {
if (e.getCause() != null && e.getCause() instanceof MemoryLimitException) {
throw new TikaMemoryLimitException(e.getMessage());
@@ -189,10 +243,6 @@ public class CompressorParser extends AbstractParser {
throw new TikaException("Unable to uncompress document stream", e);
}
- MediaType type = getMediaType(cis);
- if (!type.equals(MediaType.OCTET_STREAM)) {
- metadata.set(CONTENT_TYPE, type.toString());
- }
XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
xhtml.startDocument();
@@ -215,6 +265,8 @@ public class CompressorParser extends AbstractParser {
name = name.substring(0, name.length() - 5);
} else if (name.endsWith(".pack")) {
name = name.substring(0, name.length() - 5);
+ } else if (name.endsWith(".br")) {
+ name = name.substring(0, name.length() - 3);
} else if (name.length() > 0) {
name = GzipUtils.getUncompressedFilename(name);
}
@@ -234,6 +286,20 @@ public class CompressorParser extends AbstractParser {
xhtml.endDocument();
}
+ /**
+ * @param metadata metadata to read the content-type value from
+ * @return CompressorStream name based on the content-type value
+ * in metadata or <code>null</code> if the content type is absent
+ * or has no mapped stream name
+ */
+ private String getStreamName(Metadata metadata) {
+ String mimeString = metadata.get(Metadata.CONTENT_TYPE);
+ if (mimeString == null) {
+ return null;
+ }
+ return MIMES_TO_NAME.get(mimeString);
+ }
+
@Field
public void setMemoryLimitInKb(int memoryLimitInKb) {
this.memoryLimitInKb = memoryLimitInKb;
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/pkg/CompressorParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/pkg/CompressorParserTest.java
index 26552eb..9a1d579 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/pkg/CompressorParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/pkg/CompressorParserTest.java
@@ -21,14 +21,25 @@ package org.apache.tika.parser.pkg;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.fail;
+import java.io.BufferedWriter;
+import java.io.OutputStreamWriter;
+import java.io.Writer;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.nio.file.StandardOpenOption;
import java.util.HashSet;
+import java.util.List;
import java.util.Set;
import org.apache.commons.compress.compressors.CompressorStreamFactory;
import org.apache.tika.TikaTest;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.RecursiveParserWrapper;
import org.junit.BeforeClass;
import org.junit.Test;
@@ -39,7 +50,6 @@ public class CompressorParserTest extends TikaTest {
@BeforeClass
public static void setUp() {
- NOT_COVERED.add(MediaType.application("x-brotli"));
NOT_COVERED.add(MediaType.application("x-lz4-block"));
NOT_COVERED.add(MediaType.application("x-snappy-raw"));
NOT_COVERED.add(MediaType.application("deflate64"));
@@ -68,6 +78,16 @@ public class CompressorParserTest extends TikaTest {
}
@Test
+ public void testBrotli() throws Exception {
+ Metadata metadata = new Metadata();
+ metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, "testBROTLI_compressed.br");
+ List<Metadata> metadataList = getRecursiveMetadata("testBROTLI_compressed.br", metadata);
+
+ assertContains("XXXXXXXXXXYYYYYYYYYY", metadataList.get(1).get(RecursiveParserWrapper.TIKA_CONTENT));
+ assertEquals("testBROTLI_compressed", metadataList.get(1).get(TikaCoreProperties.RESOURCE_NAME_KEY));
+ }
+
+ @Test
public void testCoverage() throws Exception {
//test that the package parser covers all inputstreams handled
//by CompressorStreamFactory. When we update commons-compress, and they add
diff --git a/tika-parsers/src/test/resources/test-documents/testBROTLI_compressed.br b/tika-parsers/src/test/resources/test-documents/testBROTLI_compressed.br
new file mode 100644
index 0000000..3769516
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testBROTLI_compressed.br differ
--
To stop receiving notification emails like this one, please contact
tallison@apache.org.