You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2018/03/29 19:15:24 UTC

[tika] 04/04: TIKA-2621 -- add support for brotli

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 04225d2834104c973e6cff421c283af876b2e398
Author: tballison <ta...@mitre.org>
AuthorDate: Thu Mar 29 13:49:59 2018 -0400

    TIKA-2621 -- add support for brotli
---
 .../org/apache/tika/mime/tika-mimetypes.xml        |  14 ++++
 .../src/test/java/org/apache/tika/TikaTest.java    |  14 ++++
 tika-parsers/pom.xml                               |   7 ++
 .../apache/tika/parser/pkg/CompressorParser.java   |  82 +++++++++++++++++++--
 .../tika/parser/pkg/CompressorParserTest.java      |  22 +++++-
 .../test-documents/testBROTLI_compressed.br        | Bin 0 -> 12 bytes
 6 files changed, 130 insertions(+), 9 deletions(-)

diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
index 346eb73..634d9d1 100644
--- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
+++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
@@ -3135,6 +3135,20 @@
       <match value="bplist" type="string" offset="0"/>
     </magic>
   </mime-type>
+  <mime-type type="application/x-gtar">
+    <_comment>GNU tar Compressed File Archive (GNU Tape Archive)</_comment>
+    <magic priority="50">
+      <!-- GNU tar archive -->
+      <match value="ustar  \0" type="string" offset="257" />
+    </magic>
+    <glob pattern="*.gtar"/>
+    <sub-class-of type="application/x-tar"/>
+  </mime-type>
+
+  <mime-type type="application/x-brotli">
+    <glob pattern="*.br" />
+    <glob pattern="*.brotli" />
+  </mime-type>
 
   <mime-type type="application/x-bzip">
     <magic priority="40">
diff --git a/tika-core/src/test/java/org/apache/tika/TikaTest.java b/tika-core/src/test/java/org/apache/tika/TikaTest.java
index 153a564..9c827f7 100644
--- a/tika-core/src/test/java/org/apache/tika/TikaTest.java
+++ b/tika-core/src/test/java/org/apache/tika/TikaTest.java
@@ -213,6 +213,20 @@ public abstract class TikaTest {
         return getRecursiveMetadata(filePath, new ParseContext());
     }
 
+    protected List<Metadata> getRecursiveMetadata(String filePath, Metadata metadata) throws Exception {
+        return getRecursiveMetadata(filePath, new ParseContext(), metadata);
+    }
+
+    protected List<Metadata> getRecursiveMetadata(String filePath, ParseContext context, Metadata metadata) throws Exception {
+        Parser p = new AutoDetectParser();
+        RecursiveParserWrapper wrapper = new RecursiveParserWrapper(p,
+                new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1));
+        try (InputStream is = getResourceAsStream("/test-documents/" + filePath)) {
+            wrapper.parse(is, new DefaultHandler(), metadata, context);
+        }
+        return wrapper.getMetadata();
+    }
+
     protected List<Metadata> getRecursiveMetadata(String filePath, ParseContext context) throws Exception {
         Parser p = new AutoDetectParser();
         RecursiveParserWrapper wrapper = new RecursiveParserWrapper(p,
diff --git a/tika-parsers/pom.xml b/tika-parsers/pom.xml
index a3e9e4d..e6c7720 100644
--- a/tika-parsers/pom.xml
+++ b/tika-parsers/pom.xml
@@ -40,6 +40,8 @@
     <codec.version>1.10</codec.version>
     <!-- NOTE: sync tukaani version with commons-compress in tika-parent-->
     <tukaani.version>1.8</tukaani.version>
+    <!-- NOTE: sync brotli version with commons-compress in tika-parent-->
+    <brotli.version>0.1.2</brotli.version>
     <mime4j.version>0.8.1</mime4j.version>
     <vorbis.version>0.8</vorbis.version>
     <pdfbox.version>2.0.9</pdfbox.version>
@@ -151,6 +153,11 @@
       <version>${tukaani.version}</version>
     </dependency>
     <dependency>
+      <groupId>org.brotli</groupId>
+      <artifactId>dec</artifactId>
+      <version>${brotli.version}</version>
+    </dependency>
+    <dependency>
       <groupId>com.github.luben</groupId>
       <artifactId>zstd-jni</artifactId>
       <version>1.3.3-3</version>
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java
index ada7ec9..658d04c 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java
@@ -21,6 +21,10 @@ import static org.apache.tika.metadata.HttpHeaders.CONTENT_TYPE;
 import java.io.BufferedInputStream;
 import java.io.IOException;
 import java.io.InputStream;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
 import java.util.Set;
 
 import org.apache.commons.compress.MemoryLimitException;
@@ -78,9 +82,47 @@ public class CompressorParser extends AbstractParser {
     private static final MediaType ZSTD = MediaType.application("zstd");
     private static final MediaType DEFLATE64= MediaType.application("deflate64");
 
-    private static final Set<MediaType> SUPPORTED_TYPES =
-            MediaType.set(BZIP, BZIP2, GZIP, GZIP_ALT, LZ4_FRAMED, COMPRESS,
-                    XZ, PACK, SNAPPY_FRAMED, ZLIB, LZMA, ZSTD);
+    private static Set<MediaType> SUPPORTED_TYPES;
+    private static Map<String, String> MIMES_TO_NAME;
+
+    static {
+        Set<MediaType> TMP_SET = new HashSet<>();
+        TMP_SET.addAll(
+                MediaType.set(BZIP, BZIP2, DEFLATE64, GZIP, GZIP_ALT, LZ4_FRAMED, COMPRESS,
+                        XZ, PACK, SNAPPY_FRAMED, ZLIB, LZMA));
+        try {
+            Class.forName("org.brotli.dec.BrotliInputStream");
+            TMP_SET.add(BROTLI);
+        } catch (NoClassDefFoundError|ClassNotFoundException e) {
+            //swallow
+        }
+        try {
+            Class.forName("com.github.luben.zstd.ZstdInputStream");
+            TMP_SET.add(ZSTD);
+        } catch (NoClassDefFoundError|ClassNotFoundException e) {
+            //swallow
+        }
+        SUPPORTED_TYPES = Collections.unmodifiableSet(TMP_SET);
+    }
+
+    static {
+        //map the mime type strings to the compressor stream names
+        Map<String, String> tmpMimesToName = new HashMap<>();
+        tmpMimesToName.put(BZIP2.toString(), CompressorStreamFactory.BZIP2);
+        tmpMimesToName.put(GZIP.toString(), CompressorStreamFactory.GZIP);
+        tmpMimesToName.put(LZ4_FRAMED.toString(), CompressorStreamFactory.LZ4_FRAMED);
+        tmpMimesToName.put(LZ4_BLOCK.toString(), CompressorStreamFactory.LZ4_BLOCK);
+        tmpMimesToName.put(XZ.toString(), CompressorStreamFactory.XZ);
+        tmpMimesToName.put(PACK.toString(), CompressorStreamFactory.PACK200);
+        tmpMimesToName.put(SNAPPY_FRAMED.toString(), CompressorStreamFactory.SNAPPY_FRAMED);
+        tmpMimesToName.put(ZLIB.toString(), CompressorStreamFactory.DEFLATE);
+        tmpMimesToName.put(COMPRESS.toString(), CompressorStreamFactory.Z);
+        tmpMimesToName.put(LZMA.toString(), CompressorStreamFactory.LZMA);
+        tmpMimesToName.put(BROTLI.toString(), CompressorStreamFactory.BROTLI);
+        tmpMimesToName.put(ZSTD.toString(), CompressorStreamFactory.ZSTANDARD);
+        MIMES_TO_NAME = Collections.unmodifiableMap(tmpMimesToName);
+    }
+
 
     private int memoryLimitInKb = 100000;//100MB
 
@@ -181,7 +223,19 @@ public class CompressorParser extends AbstractParser {
                  });
             CompressorStreamFactory factory =
                     new CompressorStreamFactory(options.decompressConcatenated(metadata), memoryLimitInKb);
-            cis = factory.createCompressorInputStream(stream);
+            //if we've already identified it via autodetect
+            //trust that and go with the appropriate name
+            //to avoid calling CompressorStreamFactory.detect() twice
+            String name = getStreamName(metadata);
+            if (name != null) {
+                cis = factory.createCompressorInputStream(name, stream);
+            } else {
+                cis = factory.createCompressorInputStream(stream);
+                MediaType type = getMediaType(cis);
+                if (!type.equals(MediaType.OCTET_STREAM)) {
+                    metadata.set(CONTENT_TYPE, type.toString());
+                }
+            }
         } catch (CompressorException e) {
             if (e.getCause() != null && e.getCause() instanceof MemoryLimitException) {
                 throw new TikaMemoryLimitException(e.getMessage());
@@ -189,10 +243,6 @@ public class CompressorParser extends AbstractParser {
             throw new TikaException("Unable to uncompress document stream", e);
         }
 
-        MediaType type = getMediaType(cis);
-        if (!type.equals(MediaType.OCTET_STREAM)) {
-            metadata.set(CONTENT_TYPE, type.toString());
-        }
 
         XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
         xhtml.startDocument();
@@ -215,6 +265,8 @@ public class CompressorParser extends AbstractParser {
                     name = name.substring(0, name.length() - 5);
                 } else if (name.endsWith(".pack")) {
                     name = name.substring(0, name.length() - 5);
+                } else if (name.endsWith(".br")) {
+                    name = name.substring(0, name.length() - 3);
                 } else if (name.length() > 0) {
                     name = GzipUtils.getUncompressedFilename(name);
                 }
@@ -234,6 +286,20 @@ public class CompressorParser extends AbstractParser {
         xhtml.endDocument();
     }
 
+    /**
+     * @param metadata metadata that may already contain a detected content-type
+     * @return CompressorStream name based on the content-type value
+     * in metadata, or <code>null</code> if no compressor stream name
+     * is mapped to that content-type
+     */
+    private String getStreamName(Metadata metadata) {
+        String mimeString = metadata.get(Metadata.CONTENT_TYPE);
+        if (mimeString == null) {
+            return null;
+        }
+        return MIMES_TO_NAME.get(mimeString);
+    }
+
     @Field
     public void setMemoryLimitInKb(int memoryLimitInKb) {
         this.memoryLimitInKb = memoryLimitInKb;
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/pkg/CompressorParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/pkg/CompressorParserTest.java
index 26552eb..9a1d579 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/pkg/CompressorParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/pkg/CompressorParserTest.java
@@ -21,14 +21,25 @@ package org.apache.tika.parser.pkg;
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.fail;
 
+import java.io.BufferedWriter;
+import java.io.OutputStreamWriter;
+import java.io.Writer;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.nio.file.StandardOpenOption;
 import java.util.HashSet;
+import java.util.List;
 import java.util.Set;
 
 import org.apache.commons.compress.compressors.CompressorStreamFactory;
 import org.apache.tika.TikaTest;
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.RecursiveParserWrapper;
 import org.junit.BeforeClass;
 import org.junit.Test;
 
@@ -39,7 +50,6 @@ public class CompressorParserTest extends TikaTest {
 
     @BeforeClass
     public static void setUp() {
-        NOT_COVERED.add(MediaType.application("x-brotli"));
         NOT_COVERED.add(MediaType.application("x-lz4-block"));
         NOT_COVERED.add(MediaType.application("x-snappy-raw"));
         NOT_COVERED.add(MediaType.application("deflate64"));
@@ -68,6 +78,16 @@ public class CompressorParserTest extends TikaTest {
     }
 
     @Test
+    public void testBrotli() throws Exception {
+        Metadata metadata = new Metadata();
+        metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, "testBROTLI_compressed.br");
+        List<Metadata> metadataList = getRecursiveMetadata("testBROTLI_compressed.br", metadata);
+
+        assertContains("XXXXXXXXXXYYYYYYYYYY", metadataList.get(1).get(RecursiveParserWrapper.TIKA_CONTENT));
+        assertEquals("testBROTLI_compressed", metadataList.get(1).get(TikaCoreProperties.RESOURCE_NAME_KEY));
+    }
+
+    @Test
     public void testCoverage() throws Exception {
         //test that the package parser covers all inputstreams handled
         //by CompressorStreamFactory.  When we update commons-compress, and they add
diff --git a/tika-parsers/src/test/resources/test-documents/testBROTLI_compressed.br b/tika-parsers/src/test/resources/test-documents/testBROTLI_compressed.br
new file mode 100644
index 0000000..3769516
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testBROTLI_compressed.br differ

-- 
To stop receiving notification emails like this one, please contact
tallison@apache.org.