You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2017/06/01 19:16:32 UTC
[tika] 05/05: TIKA-2341 -- upgrade commons-compress to 1.14,
added capabilities for snappy and lz4-framed
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git
commit 77900ab626a2a05700cadf46f090966295c29149
Author: tballison <ta...@mitre.org>
AuthorDate: Thu Jun 1 15:16:09 2017 -0400
TIKA-2341 -- upgrade commons-compress to 1.14, added capabilities for snappy and lz4-framed
---
LICENSE.txt | 22 +
tika-bundle/pom.xml | 1 +
tika-parent/pom.xml | 2 +-
.../apache/tika/parser/pkg/CompressorParser.java | 56 +-
.../org/apache/tika/parser/pkg/PackageParser.java | 16 +-
.../tika/parser/pkg/TikaArchiveStreamFactory.java | 565 ---------------------
.../parser/pkg/TikaCompressorStreamFactory.java | 551 --------------------
.../tika/parser/pkg/ZipContainerDetector.java | 6 +-
.../tika/parser/pkg/CompressorParserTest.java | 30 +-
.../apache/tika/parser/pkg/PackageParserTest.java | 3 +-
.../resources/test-documents/testLZ4-framed.lz4 | Bin 0 -> 1443 bytes
.../resources/test-documents/testSnappy-framed.sz | Bin 0 -> 58586 bytes
12 files changed, 105 insertions(+), 1147 deletions(-)
diff --git a/LICENSE.txt b/LICENSE.txt
index e3cd6ff..b9b6fcf 100644
--- a/LICENSE.txt
+++ b/LICENSE.txt
@@ -386,3 +386,25 @@ H2 Database in tika-eval
(Mozilla Public License) or under the EPL 1.0 (Eclipse Public License).
An original copy of the license agreement can be found at:
http://www.h2database.com/html/license.html
+
+org.brotli.dec dependency of commons-compress (MIT License)
+
+Copyright (c) 2009, 2010, 2013-2016 by the Brotli Authors.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
\ No newline at end of file
diff --git a/tika-bundle/pom.xml b/tika-bundle/pom.xml
index 5f70dcb..b67d1b5 100644
--- a/tika-bundle/pom.xml
+++ b/tika-bundle/pom.xml
@@ -239,6 +239,7 @@
org.bouncycastle.operator;resolution:=optional,
org.bouncycastle.operator.bc;resolution:=optional,
org.bouncycastle.tsp;resolution:=optional,
+ org.brotli.dec;resolution:=optional,
org.cyberneko.html.xercesbridge;resolution:=optional,
org.etsi.uri.x01903.v14;resolution:=optional,
org.ibex.nestedvm;resolution:=optional,
diff --git a/tika-parent/pom.xml b/tika-parent/pom.xml
index 525b26d..dfb8671 100644
--- a/tika-parent/pom.xml
+++ b/tika-parent/pom.xml
@@ -306,7 +306,7 @@
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<project.reporting.outputEncoding>${project.build.sourceEncoding}</project.reporting.outputEncoding>
<!-- NOTE: sync tukaani version with commons-compress in tika-parsers -->
- <commons.compress.version>1.13</commons.compress.version>
+ <commons.compress.version>1.14</commons.compress.version>
<commons.io.version>2.5</commons.io.version>
<slf4j.version>1.7.24</slf4j.version>
</properties>
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java
index ff589e0..48f8bec 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java
@@ -23,8 +23,10 @@ import java.io.IOException;
import java.io.InputStream;
import java.util.Set;
+import org.apache.commons.compress.MemoryLimitException;
import org.apache.commons.compress.compressors.CompressorException;
import org.apache.commons.compress.compressors.CompressorInputStream;
+import org.apache.commons.compress.compressors.CompressorStreamFactory;
import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream;
import org.apache.commons.compress.compressors.deflate.DeflateCompressorInputStream;
import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
@@ -57,6 +59,11 @@ public class CompressorParser extends AbstractParser {
/** Serial version UID */
private static final long serialVersionUID = 2793565792967222459L;
+ private static final MediaType BROTLI = MediaType.application("x-brotli");
+ private static final MediaType LZ4_BLOCK = MediaType.application("x-lz4-block");
+ private static final MediaType SNAPPY_RAW = MediaType.application("x-snappy-raw");
+
+
private static final MediaType BZIP = MediaType.application("x-bzip");
private static final MediaType BZIP2 = MediaType.application("x-bzip2");
private static final MediaType GZIP = MediaType.application("gzip");
@@ -64,15 +71,24 @@ public class CompressorParser extends AbstractParser {
private static final MediaType COMPRESS = MediaType.application("x-compress");
private static final MediaType XZ = MediaType.application("x-xz");
private static final MediaType PACK = MediaType.application("x-java-pack200");
- private static final MediaType SNAPPY = MediaType.application("x-snappy-framed");
+ private static final MediaType SNAPPY_FRAMED = MediaType.application("x-snappy");
private static final MediaType ZLIB = MediaType.application("zlib");
private static final MediaType LZMA = MediaType.application("x-lzma");
+ private static final MediaType LZ4_FRAMED = MediaType.application("x-lz4");
private static final Set<MediaType> SUPPORTED_TYPES =
- MediaType.set(BZIP, BZIP2, GZIP, GZIP_ALT, COMPRESS, XZ, PACK, ZLIB, LZMA);
+ MediaType.set(BZIP, BZIP2, GZIP, GZIP_ALT, LZ4_FRAMED, COMPRESS,
+ XZ, PACK, SNAPPY_FRAMED, ZLIB, LZMA);
private int memoryLimitInKb = 100000;//100MB
+ /**
+ *
+ * @deprecated use {@link #getMediaType(String)}
+ * @param stream stream
+ * @return MediaType
+ */
+ @Deprecated
static MediaType getMediaType(CompressorInputStream stream) {
// TODO Add support for the remaining CompressorInputStream formats:
// LZ4
@@ -92,7 +108,7 @@ public class CompressorParser extends AbstractParser {
} else if (stream instanceof FramedSnappyCompressorInputStream ||
stream instanceof SnappyCompressorInputStream) {
// TODO Add unit tests for this format
- return SNAPPY;
+ return SNAPPY_FRAMED;
} else if (stream instanceof LZMACompressorInputStream) {
return LZMA;
} else {
@@ -101,26 +117,34 @@ public class CompressorParser extends AbstractParser {
}
static MediaType getMediaType(String name) {
- if (TikaCompressorStreamFactory.BZIP2.equals(name)) {
+ if (CompressorStreamFactory.BROTLI.equals(name)) {
+ return BROTLI;
+ } else if (CompressorStreamFactory.LZ4_BLOCK.equals(name)) {
+ return LZ4_BLOCK;
+ } else if (CompressorStreamFactory.LZ4_FRAMED.equals(name)) {
+ return LZ4_FRAMED;
+ } else if (CompressorStreamFactory.BZIP2.equals(name)) {
return BZIP2;
- } else if (TikaCompressorStreamFactory.GZIP.equals(name)) {
+ } else if (CompressorStreamFactory.GZIP.equals(name)) {
return GZIP;
- } else if (TikaCompressorStreamFactory.XZ.equals(name)) {
+ } else if (CompressorStreamFactory.XZ.equals(name)) {
return XZ;
- } else if (TikaCompressorStreamFactory.DEFLATE.equals(name)) {
+ } else if (CompressorStreamFactory.DEFLATE.equals(name)) {
return ZLIB;
- } else if (TikaCompressorStreamFactory.Z.equals(name)) {
+ } else if (CompressorStreamFactory.Z.equals(name)) {
return COMPRESS;
- } else if (TikaCompressorStreamFactory.PACK200.equals(name)) {
+ } else if (CompressorStreamFactory.PACK200.equals(name)) {
return PACK;
- } else if (TikaCompressorStreamFactory.SNAPPY_FRAMED.equals(name) ||
- TikaCompressorStreamFactory.SNAPPY_RAW.equals(name)) {
- return SNAPPY;
- } else if (TikaCompressorStreamFactory.LZMA.equals(name)) {
+ } else if (CompressorStreamFactory.SNAPPY_FRAMED.equals(name)) {
+ return SNAPPY_FRAMED;
+ } else if (CompressorStreamFactory.SNAPPY_RAW.equals(name)) {
+ return SNAPPY_RAW;
+ } else if (CompressorStreamFactory.LZMA.equals(name)) {
return LZMA;
} else {
return MediaType.OCTET_STREAM;
}
+
}
public Set<MediaType> getSupportedTypes(ParseContext context) {
@@ -149,11 +173,11 @@ public class CompressorParser extends AbstractParser {
return false;
}
});
- TikaCompressorStreamFactory factory =
- new TikaCompressorStreamFactory(options.decompressConcatenated(metadata), memoryLimitInKb);
+ CompressorStreamFactory factory =
+ new CompressorStreamFactory(options.decompressConcatenated(metadata), memoryLimitInKb);
cis = factory.createCompressorInputStream(stream);
} catch (CompressorException e) {
- if (e.getMessage() != null && e.getMessage().startsWith("MemoryLimitException:")) {
+ if (e.getCause() != null && e.getCause() instanceof MemoryLimitException) {
throw new TikaMemoryLimitException(e.getMessage());
}
throw new TikaException("Unable to uncompress document stream", e);
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
index 119c2e6..d8341af 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
@@ -116,21 +116,21 @@ public class PackageParser extends AbstractParser {
}
static MediaType getMediaType(String name) {
- if (TikaArchiveStreamFactory.JAR.equals(name)) {
+ if (ArchiveStreamFactory.JAR.equals(name)) {
return JAR;
- } else if (TikaArchiveStreamFactory.ZIP.equals(name)) {
+ } else if (ArchiveStreamFactory.ZIP.equals(name)) {
return ZIP;
- } else if (TikaArchiveStreamFactory.AR.equals(name)) {
+ } else if (ArchiveStreamFactory.AR.equals(name)) {
return AR;
- } else if (TikaArchiveStreamFactory.ARJ.equals(name)) {
+ } else if (ArchiveStreamFactory.ARJ.equals(name)) {
return ARJ;
- } else if (TikaArchiveStreamFactory.CPIO.equals(name)) {
+ } else if (ArchiveStreamFactory.CPIO.equals(name)) {
return CPIO;
- } else if (TikaArchiveStreamFactory.DUMP.equals(name)) {
+ } else if (ArchiveStreamFactory.DUMP.equals(name)) {
return DUMP;
- } else if (TikaArchiveStreamFactory.TAR.equals(name)) {
+ } else if (ArchiveStreamFactory.TAR.equals(name)) {
return TAR;
- } else if (TikaArchiveStreamFactory.SEVEN_Z.equals(name)) {
+ } else if (ArchiveStreamFactory.SEVEN_Z.equals(name)) {
return SEVENZ;
} else {
return MediaType.OCTET_STREAM;
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/TikaArchiveStreamFactory.java b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/TikaArchiveStreamFactory.java
deleted file mode 100644
index c4b534f..0000000
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/TikaArchiveStreamFactory.java
+++ /dev/null
@@ -1,565 +0,0 @@
-package org.apache.tika.parser.pkg;
- /*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-import java.io.ByteArrayInputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-import java.security.AccessController;
-import java.security.PrivilegedAction;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.Iterator;
-import java.util.Locale;
-import java.util.Set;
-import java.util.SortedMap;
-import java.util.TreeMap;
-
-import org.apache.commons.compress.archivers.ArchiveException;
-import org.apache.commons.compress.archivers.ArchiveInputStream;
-import org.apache.commons.compress.archivers.ArchiveOutputStream;
-import org.apache.commons.compress.archivers.ArchiveStreamFactory;
-import org.apache.commons.compress.archivers.ArchiveStreamProvider;
-import org.apache.commons.compress.archivers.StreamingNotSupportedException;
-import org.apache.commons.compress.archivers.ar.ArArchiveInputStream;
-import org.apache.commons.compress.archivers.ar.ArArchiveOutputStream;
-import org.apache.commons.compress.archivers.arj.ArjArchiveInputStream;
-import org.apache.commons.compress.archivers.cpio.CpioArchiveInputStream;
-import org.apache.commons.compress.archivers.cpio.CpioArchiveOutputStream;
-import org.apache.commons.compress.archivers.dump.DumpArchiveInputStream;
-import org.apache.commons.compress.archivers.jar.JarArchiveInputStream;
-import org.apache.commons.compress.archivers.jar.JarArchiveOutputStream;
-import org.apache.commons.compress.archivers.sevenz.SevenZFile;
-import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
-import org.apache.commons.compress.archivers.tar.TarArchiveOutputStream;
-import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
-import org.apache.commons.compress.archivers.zip.ZipArchiveOutputStream;
-import org.apache.commons.compress.utils.IOUtils;
-import org.apache.commons.compress.utils.Lists;
-import org.apache.commons.compress.utils.ServiceLoaderIterator;
-import org.apache.commons.compress.utils.Sets;
-
-/**
- * This is a temporary copy/paste hack from commons-compress for Tika 1.15
- * that 1) allows detection without initialization of a stream and
- * 2) prevents easily preventable OOM on three file formats.
- *
- * Once commons-compress 1.14 is released, we will delete this class
- * and go back to commons-compress's ArchiveStreamFactory.
- */
-@Deprecated
-class TikaArchiveStreamFactory implements ArchiveStreamProvider {
-
- private static final int TAR_HEADER_SIZE = 512;
-
- private static final int DUMP_SIGNATURE_SIZE = 32;
-
- private static final int SIGNATURE_SIZE = 12;
-
- private static final ArchiveStreamFactory SINGLETON = new ArchiveStreamFactory();
-
- /**
- * Constant (value {@value}) used to identify the AR archive format.
- * @since 1.1
- */
- public static final String AR = "ar";
-
- /**
- * Constant (value {@value}) used to identify the ARJ archive format.
- * Not supported as an output stream type.
- * @since 1.6
- */
- public static final String ARJ = "arj";
-
- /**
- * Constant (value {@value}) used to identify the CPIO archive format.
- * @since 1.1
- */
- public static final String CPIO = "cpio";
-
- /**
- * Constant (value {@value}) used to identify the Unix DUMP archive format.
- * Not supported as an output stream type.
- * @since 1.3
- */
- public static final String DUMP = "dump";
-
- /**
- * Constant (value {@value}) used to identify the JAR archive format.
- * @since 1.1
- */
- public static final String JAR = "jar";
-
- /**
- * Constant used to identify the TAR archive format.
- * @since 1.1
- */
- public static final String TAR = "tar";
-
- /**
- * Constant (value {@value}) used to identify the ZIP archive format.
- * @since 1.1
- */
- public static final String ZIP = "zip";
-
- /**
- * Constant (value {@value}) used to identify the 7z archive format.
- * @since 1.8
- */
- public static final String SEVEN_Z = "7z";
-
- /**
- * Entry encoding, null for the platform default.
- */
- private final String encoding;
-
- /**
- * Entry encoding, null for the default.
- */
- private volatile String entryEncoding;
-
- private SortedMap<String, ArchiveStreamProvider> archiveInputStreamProviders;
-
- private SortedMap<String, ArchiveStreamProvider> archiveOutputStreamProviders;
-
- private static ArrayList<ArchiveStreamProvider> findArchiveStreamProviders() {
- return Lists.newArrayList(serviceLoaderIterator());
- }
-
- static void putAll(Set<String> names, ArchiveStreamProvider provider,
- TreeMap<String, ArchiveStreamProvider> map) {
- for (String name : names) {
- map.put(toKey(name), provider);
- }
- }
-
- private static Iterator<ArchiveStreamProvider> serviceLoaderIterator() {
- return new ServiceLoaderIterator<>(ArchiveStreamProvider.class);
- }
-
- private static String toKey(final String name) {
- return name.toUpperCase(Locale.ROOT);
- }
-
- /**
- * Constructs a new sorted map from input stream provider names to provider
- * objects.
- *
- * <p>
- * The map returned by this method will have one entry for each provider for
- * which support is available in the current Java virtual machine. If two or
- * more supported provider have the same name then the resulting map will
- * contain just one of them; which one it will contain is not specified.
- * </p>
- *
- * <p>
- * The invocation of this method, and the subsequent use of the resulting
- * map, may cause time-consuming disk or network I/O operations to occur.
- * This method is provided for applications that need to enumerate all of
- * the available providers, for example to allow user provider selection.
- * </p>
- *
- * <p>
- * This method may return different results at different times if new
- * providers are dynamically made available to the current Java virtual
- * machine.
- * </p>
- *
- * @return An immutable map from names to provider objects
- * @since 1.13
- */
- public static SortedMap<String, ArchiveStreamProvider> findAvailableArchiveInputStreamProviders() {
- return AccessController.doPrivileged(new PrivilegedAction<SortedMap<String, ArchiveStreamProvider>>() {
- @Override
- public SortedMap<String, ArchiveStreamProvider> run() {
- TreeMap<String, ArchiveStreamProvider> map = new TreeMap<>();
- putAll(SINGLETON.getInputStreamArchiveNames(), SINGLETON, map);
- for (ArchiveStreamProvider provider : findArchiveStreamProviders()) {
- putAll(provider.getInputStreamArchiveNames(), provider, map);
- }
- return map;
- }
- });
- }
-
- /**
- * Constructs a new sorted map from output stream provider names to provider
- * objects.
- *
- * <p>
- * The map returned by this method will have one entry for each provider for
- * which support is available in the current Java virtual machine. If two or
- * more supported provider have the same name then the resulting map will
- * contain just one of them; which one it will contain is not specified.
- * </p>
- *
- * <p>
- * The invocation of this method, and the subsequent use of the resulting
- * map, may cause time-consuming disk or network I/O operations to occur.
- * This method is provided for applications that need to enumerate all of
- * the available providers, for example to allow user provider selection.
- * </p>
- *
- * <p>
- * This method may return different results at different times if new
- * providers are dynamically made available to the current Java virtual
- * machine.
- * </p>
- *
- * @return An immutable map from names to provider objects
- * @since 1.13
- */
- public static SortedMap<String, ArchiveStreamProvider> findAvailableArchiveOutputStreamProviders() {
- return AccessController.doPrivileged(new PrivilegedAction<SortedMap<String, ArchiveStreamProvider>>() {
- @Override
- public SortedMap<String, ArchiveStreamProvider> run() {
- TreeMap<String, ArchiveStreamProvider> map = new TreeMap<>();
- putAll(SINGLETON.getOutputStreamArchiveNames(), SINGLETON, map);
- for (ArchiveStreamProvider provider : findArchiveStreamProviders()) {
- putAll(provider.getOutputStreamArchiveNames(), provider, map);
- }
- return map;
- }
- });
- }
-
-
- /**
- * Create an instance using the specified encoding.
- *
- * @param encoding the encoding to be used.
- *
- * @since 1.10
- */
- public TikaArchiveStreamFactory(final String encoding) {
- super();
- this.encoding = encoding;
- // Also set the original field so can continue to use it.
- this.entryEncoding = encoding;
- }
-
- /**
- * Returns the encoding to use for arj, jar, zip, dump, cpio and tar
- * files, or null for the archiver default.
- *
- * @return entry encoding, or null for the archiver default
- * @since 1.5
- */
- public String getEntryEncoding() {
- return entryEncoding;
- }
-
- /**
- * Sets the encoding to use for arj, jar, zip, dump, cpio and tar files. Use null for the archiver default.
- *
- * @param entryEncoding the entry encoding, null uses the archiver default.
- * @since 1.5
- * @deprecated 1.10 use {@link #TikaArchiveStreamFactory(String)} to specify the encoding
- * @throws IllegalStateException if the constructor {@link #TikaArchiveStreamFactory(String)}
- * was used to specify the factory encoding.
- */
- @Deprecated
- public void setEntryEncoding(final String entryEncoding) {
- // Note: this does not detect new ArchiveStreamFactory(null) but that does not set the encoding anyway
- if (encoding != null) {
- throw new IllegalStateException("Cannot overide encoding set by the constructor");
- }
- this.entryEncoding = entryEncoding;
- }
-
- /**
- * Creates an archive input stream from an archiver name and an input stream.
- *
- * @param archiverName the archive name,
- * i.e. {@value #AR}, {@value #ARJ}, {@value #ZIP}, {@value #TAR}, {@value #JAR}, {@value #CPIO}, {@value #DUMP} or {@value #SEVEN_Z}
- * @param in the input stream
- * @return the archive input stream
- * @throws ArchiveException if the archiver name is not known
- * @throws StreamingNotSupportedException if the format cannot be
- * read from a stream
- * @throws IllegalArgumentException if the archiver name or stream is null
- */
- public ArchiveInputStream createArchiveInputStream(final String archiverName, final InputStream in)
- throws ArchiveException {
- return createArchiveInputStream(archiverName, in, entryEncoding);
- }
-
- @Override
- public ArchiveInputStream createArchiveInputStream(final String archiverName, final InputStream in,
- final String actualEncoding) throws ArchiveException {
-
- if (archiverName == null) {
- throw new IllegalArgumentException("Archivername must not be null.");
- }
-
- if (in == null) {
- throw new IllegalArgumentException("InputStream must not be null.");
- }
-
- if (AR.equalsIgnoreCase(archiverName)) {
- return new ArArchiveInputStream(in);
- }
- if (ARJ.equalsIgnoreCase(archiverName)) {
- if (actualEncoding != null) {
- return new ArjArchiveInputStream(in, actualEncoding);
- }
- return new ArjArchiveInputStream(in);
- }
- if (ZIP.equalsIgnoreCase(archiverName)) {
- if (actualEncoding != null) {
- return new ZipArchiveInputStream(in, actualEncoding);
- }
- return new ZipArchiveInputStream(in);
- }
- if (TAR.equalsIgnoreCase(archiverName)) {
- if (actualEncoding != null) {
- return new TarArchiveInputStream(in, actualEncoding);
- }
- return new TarArchiveInputStream(in);
- }
- if (JAR.equalsIgnoreCase(archiverName)) {
- if (actualEncoding != null) {
- return new JarArchiveInputStream(in, actualEncoding);
- }
- return new JarArchiveInputStream(in);
- }
- if (CPIO.equalsIgnoreCase(archiverName)) {
- if (actualEncoding != null) {
- return new CpioArchiveInputStream(in, actualEncoding);
- }
- return new CpioArchiveInputStream(in);
- }
- if (DUMP.equalsIgnoreCase(archiverName)) {
- if (actualEncoding != null) {
- return new DumpArchiveInputStream(in, actualEncoding);
- }
- return new DumpArchiveInputStream(in);
- }
- if (SEVEN_Z.equalsIgnoreCase(archiverName)) {
- throw new StreamingNotSupportedException(SEVEN_Z);
- }
-
- final ArchiveStreamProvider archiveStreamProvider = getArchiveInputStreamProviders().get(toKey(archiverName));
- if (archiveStreamProvider != null) {
- return archiveStreamProvider.createArchiveInputStream(archiverName, in, actualEncoding);
- }
-
- throw new ArchiveException("Archiver: " + archiverName + " not found.");
- }
-
- /**
- * Creates an archive output stream from an archiver name and an output stream.
- *
- * @param archiverName the archive name,
- * i.e. {@value #AR}, {@value #ZIP}, {@value #TAR}, {@value #JAR} or {@value #CPIO}
- * @param out the output stream
- * @return the archive output stream
- * @throws ArchiveException if the archiver name is not known
- * @throws StreamingNotSupportedException if the format cannot be
- * written to a stream
- * @throws IllegalArgumentException if the archiver name or stream is null
- */
- public ArchiveOutputStream createArchiveOutputStream(final String archiverName, final OutputStream out)
- throws ArchiveException {
- return createArchiveOutputStream(archiverName, out, entryEncoding);
- }
-
- @Override
- public ArchiveOutputStream createArchiveOutputStream(
- final String archiverName, final OutputStream out, final String actualEncoding)
- throws ArchiveException {
- if (archiverName == null) {
- throw new IllegalArgumentException("Archivername must not be null.");
- }
- if (out == null) {
- throw new IllegalArgumentException("OutputStream must not be null.");
- }
-
- if (AR.equalsIgnoreCase(archiverName)) {
- return new ArArchiveOutputStream(out);
- }
- if (ZIP.equalsIgnoreCase(archiverName)) {
- final ZipArchiveOutputStream zip = new ZipArchiveOutputStream(out);
- if (actualEncoding != null) {
- zip.setEncoding(actualEncoding);
- }
- return zip;
- }
- if (TAR.equalsIgnoreCase(archiverName)) {
- if (actualEncoding != null) {
- return new TarArchiveOutputStream(out, actualEncoding);
- }
- return new TarArchiveOutputStream(out);
- }
- if (JAR.equalsIgnoreCase(archiverName)) {
- if (actualEncoding != null) {
- return new JarArchiveOutputStream(out, actualEncoding);
- }
- return new JarArchiveOutputStream(out);
- }
- if (CPIO.equalsIgnoreCase(archiverName)) {
- if (actualEncoding != null) {
- return new CpioArchiveOutputStream(out, actualEncoding);
- }
- return new CpioArchiveOutputStream(out);
- }
- if (SEVEN_Z.equalsIgnoreCase(archiverName)) {
- throw new StreamingNotSupportedException(SEVEN_Z);
- }
-
- final ArchiveStreamProvider archiveStreamProvider = getArchiveOutputStreamProviders().get(toKey(archiverName));
- if (archiveStreamProvider != null) {
- return archiveStreamProvider.createArchiveOutputStream(archiverName, out, actualEncoding);
- }
-
- throw new ArchiveException("Archiver: " + archiverName + " not found.");
- }
-
- /**
- * Create an archive input stream from an input stream, autodetecting
- * the archive type from the first few bytes of the stream. The InputStream
- * must support marks, like BufferedInputStream.
- *
- * @param in the input stream
- * @return the archive input stream
- * @throws ArchiveException if the archiver name is not known
- * @throws StreamingNotSupportedException if the format cannot be
- * read from a stream
- * @throws IllegalArgumentException if the stream is null or does not support mark
- */
- public ArchiveInputStream createArchiveInputStream(final InputStream in)
- throws ArchiveException {
- return createArchiveInputStream(detect(in), in);
- }
-
- /**
- * Try to determine the type of Archiver
- * @param in input stream
- * @return type of archiver if found
- * @throws ArchiveException if an archiver cannot be detected in the stream
- * @since 1.14
- */
- public static String detect(InputStream in) throws ArchiveException {
- if (in == null) {
- throw new IllegalArgumentException("Stream must not be null.");
- }
-
- if (!in.markSupported()) {
- throw new IllegalArgumentException("Mark is not supported.");
- }
-
- final byte[] signature = new byte[SIGNATURE_SIZE];
- in.mark(signature.length);
- int signatureLength = -1;
- try {
- signatureLength = IOUtils.readFully(in, signature);
- in.reset();
- } catch (IOException e) {
- throw new ArchiveException("IOException while reading signature.");
- }
-
- if (ZipArchiveInputStream.matches(signature, signatureLength)) {
- return ZIP;
- } else if (JarArchiveInputStream.matches(signature, signatureLength)) {
- return JAR;
- } if (ArArchiveInputStream.matches(signature, signatureLength)) {
- return AR;
- } else if (CpioArchiveInputStream.matches(signature, signatureLength)) {
- return CPIO;
- } else if (ArjArchiveInputStream.matches(signature, signatureLength)) {
- return ARJ;
- } else if (SevenZFile.matches(signature, signatureLength)) {
- return SEVEN_Z;
- }
-
- // Dump needs a bigger buffer to check the signature;
- final byte[] dumpsig = new byte[DUMP_SIGNATURE_SIZE];
- in.mark(dumpsig.length);
- try {
- signatureLength = IOUtils.readFully(in, dumpsig);
- in.reset();
- } catch (IOException e) {
- throw new ArchiveException("IOException while reading dump signature");
- }
- if (DumpArchiveInputStream.matches(dumpsig, signatureLength)) {
- return DUMP;
- }
-
- // Tar needs an even bigger buffer to check the signature; read the first block
- final byte[] tarHeader = new byte[TAR_HEADER_SIZE];
- in.mark(tarHeader.length);
- try {
- signatureLength = IOUtils.readFully(in, tarHeader);
- in.reset();
- } catch (IOException e) {
- throw new ArchiveException("IOException while reading tar signature");
- }
- if (TarArchiveInputStream.matches(tarHeader, signatureLength)) {
- return TAR;
- }
-
- // COMPRESS-117 - improve auto-recognition
- if (signatureLength >= TAR_HEADER_SIZE) {
- TarArchiveInputStream tais = null;
- try {
- tais = new TarArchiveInputStream(new ByteArrayInputStream(tarHeader));
- // COMPRESS-191 - verify the header checksum
- if (tais.getNextTarEntry().isCheckSumOK()) {
- return TAR;
- }
- } catch (final Exception e) { // NOPMD
- // can generate IllegalArgumentException as well
- // as IOException
- // autodetection, simply not a TAR
- // ignored
- } finally {
- IOUtils.closeQuietly(tais);
- }
- }
- throw new ArchiveException("No Archiver found for the stream signature");
- }
-
- public SortedMap<String, ArchiveStreamProvider> getArchiveInputStreamProviders() {
- if (archiveInputStreamProviders == null) {
- archiveInputStreamProviders = Collections
- .unmodifiableSortedMap(findAvailableArchiveInputStreamProviders());
- }
- return archiveInputStreamProviders;
- }
-
- public SortedMap<String, ArchiveStreamProvider> getArchiveOutputStreamProviders() {
- if (archiveOutputStreamProviders == null) {
- archiveOutputStreamProviders = Collections
- .unmodifiableSortedMap(findAvailableArchiveOutputStreamProviders());
- }
- return archiveOutputStreamProviders;
- }
-
- @Override
- public Set<String> getInputStreamArchiveNames() {
- return Sets.newHashSet(AR, ARJ, ZIP, TAR, JAR, CPIO, DUMP, SEVEN_Z);
- }
-
- @Override
- public Set<String> getOutputStreamArchiveNames() {
- return Sets.newHashSet(AR, ZIP, TAR, JAR, CPIO, SEVEN_Z);
- }
-
-}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/TikaCompressorStreamFactory.java b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/TikaCompressorStreamFactory.java
deleted file mode 100644
index a1a8405..0000000
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/TikaCompressorStreamFactory.java
+++ /dev/null
@@ -1,551 +0,0 @@
-package org.apache.tika.parser.pkg;
- /*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-import java.security.AccessController;
-import java.security.PrivilegedAction;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.Iterator;
-import java.util.Locale;
-import java.util.Set;
-import java.util.SortedMap;
-import java.util.TreeMap;
-
-import org.apache.commons.compress.compressors.CompressorException;
-import org.apache.commons.compress.compressors.CompressorInputStream;
-import org.apache.commons.compress.compressors.CompressorOutputStream;
-import org.apache.commons.compress.compressors.CompressorStreamProvider;
-import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream;
-import org.apache.commons.compress.compressors.deflate.DeflateCompressorInputStream;
-import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
-import org.apache.commons.compress.compressors.lzma.LZMAUtils;
-import org.apache.commons.compress.compressors.pack200.Pack200CompressorInputStream;
-import org.apache.commons.compress.compressors.snappy.FramedSnappyCompressorInputStream;
-import org.apache.commons.compress.compressors.snappy.SnappyCompressorInputStream;
-import org.apache.commons.compress.compressors.xz.XZCompressorInputStream;
-import org.apache.commons.compress.compressors.xz.XZUtils;
-import org.apache.commons.compress.compressors.z.ZCompressorInputStream;
-import org.apache.commons.compress.utils.IOUtils;
-import org.apache.commons.compress.utils.Lists;
-import org.apache.commons.compress.utils.ServiceLoaderIterator;
-import org.apache.commons.compress.utils.Sets;
-import org.apache.tika.exception.TikaMemoryLimitException;
-import org.tukaani.xz.LZMAInputStream;
-import org.tukaani.xz.MemoryLimitException;
-
-/**
- * This is a temporary copy/paste hack from commons-compress for Tika 1.15
- * that 1) allows detection without initialization of a stream and
- * 2) prevents easily preventable OOM on two file formats.
- *
- * Once commons-compress 1.14 is released, we will delete this class
- * and go back to commons-compress's CompressorStreamFactory.
- */
-@Deprecated
-class TikaCompressorStreamFactory implements CompressorStreamProvider {
-
-
-
- private static final TikaCompressorStreamFactory SINGLETON = new TikaCompressorStreamFactory(true, -1);
-
- /**
- * Constant (value {@value}) used to identify the BZIP2 compression
- * algorithm.
- *
- * @since 1.1
- */
- public static final String BZIP2 = "bzip2";
-
- /**
- * Constant (value {@value}) used to identify the GZIP compression
- * algorithm.
- *
- * @since 1.1
- */
- public static final String GZIP = "gz";
-
- /**
- * Constant (value {@value}) used to identify the PACK200 compression
- * algorithm.
- *
- * @since 1.3
- */
- public static final String PACK200 = "pack200";
-
- /**
- * Constant (value {@value}) used to identify the XZ compression method.
- *
- * @since 1.4
- */
- public static final String XZ = "xz";
-
- /**
- * Constant (value {@value}) used to identify the LZMA compression method.
- *
- * @since 1.6
- */
- public static final String LZMA = "lzma";
-
- /**
- * Constant (value {@value}) used to identify the "framed" Snappy
- * compression method.
- *
- * @since 1.7
- */
- public static final String SNAPPY_FRAMED = "snappy-framed";
-
- /**
- * Constant (value {@value}) used to identify the "raw" Snappy compression
- * method. Not supported as an output stream type.
- *
- * @since 1.7
- */
- public static final String SNAPPY_RAW = "snappy-raw";
-
- /**
- * Constant (value {@value}) used to identify the traditional Unix compress
- * method. Not supported as an output stream type.
- *
- * @since 1.7
- */
- public static final String Z = "z";
-
- /**
- * Constant (value {@value}) used to identify the Deflate compress method.
- *
- * @since 1.9
- */
- public static final String DEFLATE = "deflate";
-
-
- private final int memoryLimitInKb;
-
- private SortedMap<String, CompressorStreamProvider> compressorInputStreamProviders;
-
-
- public static String getBzip2() {
- return BZIP2;
- }
-
- public static String getDeflate() {
- return DEFLATE;
- }
-
- public static String getGzip() {
- return GZIP;
- }
-
- public static String getLzma() {
- return LZMA;
- }
-
- public static String getPack200() {
- return PACK200;
- }
-
- public static TikaCompressorStreamFactory getSingleton() {
- return SINGLETON;
- }
-
- public static String getSnappyFramed() {
- return SNAPPY_FRAMED;
- }
-
- public static String getSnappyRaw() {
- return SNAPPY_RAW;
- }
-
- public static String getXz() {
- return XZ;
- }
-
- public static String getZ() {
- return Z;
- }
-
- static void putAll(final Set<String> names, final CompressorStreamProvider provider,
- final TreeMap<String, CompressorStreamProvider> map) {
- for (final String name : names) {
- map.put(toKey(name), provider);
- }
- }
-
- private static String toKey(final String name) {
- return name.toUpperCase(Locale.ROOT);
- }
-
- /**
- * If true, decompress until the end of the input. If false, stop after the
- * first stream and leave the input position to point to the next byte after
- * the stream
- */
- private final Boolean decompressUntilEOF;
-
- /**
- * If true, decompress until the end of the input. If false, stop after the
- * first stream and leave the input position to point to the next byte after
- * the stream
- */
- private volatile boolean decompressConcatenated = false;
-
- /**
- * Create an instance with the provided decompress Concatenated option.
- *
- * @param decompressUntilEOF
- * if true, decompress until the end of the input; if false, stop
- * after the first stream and leave the input position to point
- * to the next byte after the stream. This setting applies to the
- * gzip, bzip2 and xz formats only.
- * @since 1.10
- */
- public TikaCompressorStreamFactory(final boolean decompressUntilEOF, final int memoryLimitInKb) {
- this.decompressUntilEOF = Boolean.valueOf(decompressUntilEOF);
- // Also copy to existing variable so can continue to use that as the
- // current value
- this.decompressConcatenated = decompressUntilEOF;
- this.memoryLimitInKb = memoryLimitInKb;
- }
-
- /**
- * Try to detect the type of compressor stream.
- *
- * @param in input stream
- * @return type of compressor stream detected
- * @throws CompressorException if no compressor stream type was detected
- * or if something else went wrong
- * @throws IllegalArgumentException if stream is null or does not support mark
- *
- * @since 1.14
- */
- public static String detect(final InputStream in) throws CompressorException {
- if (in == null) {
- throw new IllegalArgumentException("Stream must not be null.");
- }
-
- if (!in.markSupported()) {
- throw new IllegalArgumentException("Mark is not supported.");
- }
-
- final byte[] signature = new byte[12];
- in.mark(signature.length);
- int signatureLength = -1;
- try {
- signatureLength = IOUtils.readFully(in, signature);
- in.reset();
- } catch (IOException e) {
- throw new CompressorException("IOException while reading signature.", e);
- }
-
- if (BZip2CompressorInputStream.matches(signature, signatureLength)) {
- return BZIP2;
- }
-
- if (GzipCompressorInputStream.matches(signature, signatureLength)) {
- return GZIP;
- }
-
- if (Pack200CompressorInputStream.matches(signature, signatureLength)) {
- return PACK200;
- }
-
- if (FramedSnappyCompressorInputStream.matches(signature, signatureLength)) {
- return SNAPPY_FRAMED;
- }
-
- if (ZCompressorInputStream.matches(signature, signatureLength)) {
- return Z;
- }
-
- if (DeflateCompressorInputStream.matches(signature, signatureLength)) {
- return DEFLATE;
- }
-
- if (XZUtils.matches(signature, signatureLength)) {
- return XZ;
- }
-
- if (LZMAUtils.matches(signature, signatureLength)) {
- return LZMA;
- }
-
-/* if (FramedLZ4CompressorInputStream.matches(signature, signatureLength)) {
- return LZ4_FRAMED;
- }*/
-
- throw new CompressorException("No Compressor found for the stream signature.");
- }
-
- public SortedMap<String, CompressorStreamProvider> getCompressorInputStreamProviders() {
- if (compressorInputStreamProviders == null) {
- compressorInputStreamProviders = Collections
- .unmodifiableSortedMap(findAvailableCompressorInputStreamProviders());
- }
- return compressorInputStreamProviders;
- }
-
- public static SortedMap<String, CompressorStreamProvider> findAvailableCompressorInputStreamProviders() {
- return AccessController.doPrivileged(new PrivilegedAction<SortedMap<String, CompressorStreamProvider>>() {
- @Override
- public SortedMap<String, CompressorStreamProvider> run() {
- final TreeMap<String, CompressorStreamProvider> map = new TreeMap<>();
- putAll(SINGLETON.getInputStreamCompressorNames(), SINGLETON, map);
- for (final CompressorStreamProvider provider : findCompressorStreamProviders()) {
- putAll(provider.getInputStreamCompressorNames(), provider, map);
- }
- return map;
- }
- });
- }
-
- private static ArrayList<CompressorStreamProvider> findCompressorStreamProviders() {
- return Lists.newArrayList(serviceLoaderIterator());
- }
-
- private static Iterator<CompressorStreamProvider> serviceLoaderIterator() {
- return new ServiceLoaderIterator<>(CompressorStreamProvider.class);
- }
-
- /**
- * Create an compressor input stream from an input stream, autodetecting the
- * compressor type from the first few bytes of the stream. The InputStream
- * must support marks, like BufferedInputStream.
- *
- * @param in
- * the input stream
- * @return the compressor input stream
- * @throws CompressorException
- * if the compressor name is not known
- * @throws IllegalArgumentException
- * if the stream is null or does not support mark
- * @since 1.1
- */
- public CompressorInputStream createCompressorInputStream(final InputStream in) throws CompressorException,
- TikaMemoryLimitException {
- return createCompressorInputStream(detect(in), in);
- }
-
- /**
- * Creates a compressor input stream from a compressor name and an input
- * stream.
- *
- * @param name
- * of the compressor, i.e. {@value #GZIP}, {@value #BZIP2},
- * {@value #XZ}, {@value #LZMA}, {@value #PACK200},
- * {@value #SNAPPY_RAW}, {@value #SNAPPY_FRAMED}, {@value #Z},
- * or {@value #DEFLATE}
- * @param in
- * the input stream
- * @return compressor input stream
- * @throws CompressorException
- * if the compressor name is not known or not available
- * @throws IllegalArgumentException
- * if the name or input stream is null
- */
- public CompressorInputStream createCompressorInputStream(final String name, final InputStream in)
- throws CompressorException, TikaMemoryLimitException {
- return createCompressorInputStream(name, in, decompressConcatenated);
- }
-
- public CompressorInputStream createCompressorInputStream(final String name, final InputStream in,
- final boolean actualDecompressConcatenated) throws CompressorException {
- if (name == null || in == null) {
- throw new IllegalArgumentException("Compressor name and stream must not be null.");
- }
-
- try {
-
- if (GZIP.equalsIgnoreCase(name)) {
- return new GzipCompressorInputStream(in, actualDecompressConcatenated);
- }
-
- if (BZIP2.equalsIgnoreCase(name)) {
- return new BZip2CompressorInputStream(in, actualDecompressConcatenated);
- }
-
- if (XZ.equalsIgnoreCase(name)) {
- if (!XZUtils.isXZCompressionAvailable()) {
- throw new CompressorException("XZ compression is not available.");
- }
- return new XZCompressorInputStream(in, actualDecompressConcatenated);
- }
-
- if (LZMA.equalsIgnoreCase(name)) {
- if (!LZMAUtils.isLZMACompressionAvailable()) {
- throw new CompressorException("LZMA compression is not available");
- }
- try {
- return new SaferLZMACompressorInputStream(in);
- } catch (MemoryLimitException e) {
- throw new CompressorException("MemoryLimitException: " + e.getMessage(), e);
- }
- }
-
- if (PACK200.equalsIgnoreCase(name)) {
- return new Pack200CompressorInputStream(in);
- }
-
- if (SNAPPY_RAW.equalsIgnoreCase(name)) {
- return new SnappyCompressorInputStream(in);
- }
-
- if (SNAPPY_FRAMED.equalsIgnoreCase(name)) {
- return new FramedSnappyCompressorInputStream(in);
- }
-
- if (Z.equalsIgnoreCase(name)) {
- try {
- return new SaferZCompressorInputStream(in);
- } catch (TikaRuntimeMemoryLimitException e) {
- throw new CompressorException("MemoryLimitException: " + e.getMessage(), e);
- }
- }
-
- if (DEFLATE.equalsIgnoreCase(name)) {
- return new DeflateCompressorInputStream(in);
- }
-/*
-not currently supported
- if (LZ4_BLOCK.equalsIgnoreCase(name)) {
- return new BlockLZ4CompressorInputStream(in);
- }
-
- if (LZ4_FRAMED.equalsIgnoreCase(name)) {
- return new FramedLZ4CompressorInputStream(in, actualDecompressConcatenated);
- }
- */
-
- } catch (final IOException e) {
- throw new CompressorException("Could not create CompressorInputStream.", e);
- }
-
- final CompressorStreamProvider compressorStreamProvider = getCompressorInputStreamProviders().get(toKey(name));
- if (compressorStreamProvider != null) {
- return compressorStreamProvider.createCompressorInputStream(name, in, actualDecompressConcatenated);
- }
-
- throw new CompressorException("Compressor: " + name + " not found.");
- }
-
- @Override
- public CompressorOutputStream createCompressorOutputStream(String s, OutputStream outputStream) throws CompressorException {
- throw new UnsupportedOperationException();
- }
-
-
- // For Unit tests
- boolean getDecompressConcatenated() {
- return decompressConcatenated;
- }
-
- public Set<String> getInputStreamCompressorNames() {
- return Sets.newHashSet(GZIP, BZIP2, XZ, LZMA, PACK200, DEFLATE, SNAPPY_RAW, SNAPPY_FRAMED, Z);
- }
-
- @Override
- public Set<String> getOutputStreamCompressorNames() {
- throw new UnsupportedOperationException();
- }
-
- public Boolean getDecompressUntilEOF() {
- return decompressUntilEOF;
- }
-
- private class SaferZCompressorInputStream extends ZCompressorInputStream {
-
- public SaferZCompressorInputStream(InputStream inputStream) throws IOException {
- super(inputStream);
- }
-
- @Override
- protected void initializeTables(int maxCodeSize) {
- int maxTableSize = 1 << maxCodeSize;
- if (memoryLimitInKb > -1 && maxTableSize > (memoryLimitInKb*1024)) {
- throw new TikaRuntimeMemoryLimitException("Calculated maxCodeSize ("+maxCodeSize+" bytes) is greater "+
- "than the maximum allowable ("+ (memoryLimitInKb*1024) +" bytes).\n"+
- "If the file is not corrupt, consider increasing " +
- "the memoryLimitInKb parameter in the CompressorParser");
- }
- super.initializeTables(maxCodeSize);
- }
- }
-
- private static class TikaRuntimeMemoryLimitException extends RuntimeException {
- public TikaRuntimeMemoryLimitException(String msg) {
- super(msg);
- }
- }
-
- private class SaferLZMACompressorInputStream extends CompressorInputStream {
- private final InputStream in;
-
- /**
- * Creates a new input stream that decompresses LZMA-compressed data
- * from the specified input stream.
- *
- * @param inputStream where to read the compressed data
- *
- * @throws IOException if the input is not in the .lzma format,
- * the input is corrupt or truncated, the .lzma
- * headers specify sizes that are not supported
- * by this implementation, or the underlying
- * <code>inputStream</code> throws an exception
- */
- public SaferLZMACompressorInputStream(final InputStream inputStream) throws IOException {
- in = new LZMAInputStream(inputStream, memoryLimitInKb);
- }
-
- /** {@inheritDoc} */
- @Override
- public int read() throws IOException {
- final int ret = in.read();
- count(ret == -1 ? 0 : 1);
- return ret;
- }
-
- /** {@inheritDoc} */
- @Override
- public int read(final byte[] buf, final int off, final int len) throws IOException {
- final int ret = in.read(buf, off, len);
- count(ret);
- return ret;
- }
-
- /** {@inheritDoc} */
- @Override
- public long skip(final long n) throws IOException {
- return in.skip(n);
- }
-
- /** {@inheritDoc} */
- @Override
- public int available() throws IOException {
- return in.available();
- }
-
- /** {@inheritDoc} */
- @Override
- public void close() throws IOException {
- in.close();
- }
- }
-}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
index 495fd2d..3f9211b 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
@@ -29,9 +29,11 @@ import java.util.Set;
import java.util.regex.Pattern;
import org.apache.commons.compress.archivers.ArchiveException;
+import org.apache.commons.compress.archivers.ArchiveStreamFactory;
import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
import org.apache.commons.compress.archivers.zip.ZipFile;
import org.apache.commons.compress.compressors.CompressorException;
+import org.apache.commons.compress.compressors.CompressorStreamFactory;
import org.apache.commons.io.IOUtils;
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
import org.apache.poi.openxml4j.opc.OPCPackage;
@@ -100,7 +102,7 @@ public class ZipContainerDetector implements Detector {
private static MediaType detectCompressorFormat(byte[] prefix, int length) {
try {
- String type = TikaCompressorStreamFactory.detect(new ByteArrayInputStream(prefix, 0, length));
+ String type = CompressorStreamFactory.detect(new ByteArrayInputStream(prefix, 0, length));
return CompressorParser.getMediaType(type);
} catch (CompressorException e) {
return MediaType.OCTET_STREAM;
@@ -109,7 +111,7 @@ public class ZipContainerDetector implements Detector {
private static MediaType detectArchiveFormat(byte[] prefix, int length) {
try {
- String name = TikaArchiveStreamFactory.detect(new ByteArrayInputStream(prefix, 0, length));
+ String name = ArchiveStreamFactory.detect(new ByteArrayInputStream(prefix, 0, length));
return PackageParser.getMediaType(name);
} catch (ArchiveException e) {
return MediaType.OCTET_STREAM;
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/pkg/CompressorParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/pkg/CompressorParserTest.java
index 77531fc..444afc7 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/pkg/CompressorParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/pkg/CompressorParserTest.java
@@ -18,22 +18,46 @@
package org.apache.tika.parser.pkg;
+import static org.junit.Assert.assertEquals;
import static org.junit.Assert.fail;
import java.util.HashSet;
import java.util.Set;
+import org.apache.commons.compress.compressors.CompressorStreamFactory;
+import org.apache.tika.TikaTest;
+import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.junit.BeforeClass;
import org.junit.Test;
-public class CompressorParserTest {
+public class CompressorParserTest extends TikaTest {
+ //These compressed stream types can't currently
+ //be detected.
private static Set<MediaType> NOT_COVERED = new HashSet();
@BeforeClass
public static void setUp() {
- NOT_COVERED.add(MediaType.application("x-snappy-framed"));
+ NOT_COVERED.add(MediaType.application("x-brotli"));
+ NOT_COVERED.add(MediaType.application("x-lz4-block"));
+ NOT_COVERED.add(MediaType.application("x-snappy-raw"));
+ }
+
+ @Test
+ public void testSnappyFramed() throws Exception {
+ XMLResult r = getXML("testSnappy-framed.sz");
+ assertEquals("application/x-snappy", r.metadata.get(Metadata.CONTENT_TYPE));
+ assertContains("Lorem ipsum dolor sit amet", r.xml);
+ }
+
+ @Test
+ public void testLZ4Framed() throws Exception {
+ XMLResult r = getXML("testLZ4-framed.lz4");
+ assertEquals("application/x-lz4", r.metadata.get(Metadata.CONTENT_TYPE));
+ //xml parser throws an exception for test1.xml
+ //for now, be content that the container file is correctly identified
+ assertContains("test1.xml", r.xml);
}
@Test
@@ -41,7 +65,7 @@ public class CompressorParserTest {
//test that the package parser covers all inputstreams handled
//by CompressorStreamFactory. When we update commons-compress, and they add
//a new stream type, we want to make sure that we're handling it.
- TikaCompressorStreamFactory archiveStreamFactory = new TikaCompressorStreamFactory(true, 1000);
+ CompressorStreamFactory archiveStreamFactory = new CompressorStreamFactory(true, 1000);
CompressorParser compressorParser = new CompressorParser();
ParseContext parseContext = new ParseContext();
for (String name : archiveStreamFactory.getInputStreamCompressorNames()) {
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/pkg/PackageParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/pkg/PackageParserTest.java
index 412228c..12b7bb8 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/pkg/PackageParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/pkg/PackageParserTest.java
@@ -22,6 +22,7 @@ import static org.junit.Assert.fail;
import java.nio.charset.StandardCharsets;
+import org.apache.commons.compress.archivers.ArchiveStreamFactory;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.junit.Test;
@@ -33,7 +34,7 @@ public class PackageParserTest {
//test that the package parser covers all inputstreams handled
//by ArchiveStreamFactory. When we update commons-compress, and they add
//a new stream type, we want to make sure that we're handling it.
- TikaArchiveStreamFactory archiveStreamFactory = new TikaArchiveStreamFactory(StandardCharsets.UTF_8.name());
+ ArchiveStreamFactory archiveStreamFactory = new ArchiveStreamFactory(StandardCharsets.UTF_8.name());
PackageParser packageParser = new PackageParser();
ParseContext parseContext = new ParseContext();
for (String name : archiveStreamFactory.getInputStreamArchiveNames()) {
diff --git a/tika-parsers/src/test/resources/test-documents/testLZ4-framed.lz4 b/tika-parsers/src/test/resources/test-documents/testLZ4-framed.lz4
new file mode 100644
index 0000000..d2a813f
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testLZ4-framed.lz4 differ
diff --git a/tika-parsers/src/test/resources/test-documents/testSnappy-framed.sz b/tika-parsers/src/test/resources/test-documents/testSnappy-framed.sz
new file mode 100644
index 0000000..9a6b1fb
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testSnappy-framed.sz differ
--
To stop receiving notification emails like this one, please contact
"commits@tika.apache.org" <co...@tika.apache.org>.