You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2017/04/26 14:36:57 UTC
[tika] branch master updated: TIKA-2099 -- temporarily copy/paste commons-compress' ArchiveStreamFactory to benefit from updates that enable detection of magic-less .tar files.

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git

The following commit(s) were added to refs/heads/master by this push:
       new  a194bc4   TIKA-2099 -- temporarily copy/paste commons-compress' ArchiveStreamFactory to benefit from updates that enable detection of magic-less .tar files.
a194bc4 is described below

commit a194bc4ba2b771807e93e664245f24e3573a15df
Author: tballison <ta...@mitre.org>
AuthorDate: Wed Apr 26 10:36:47 2017 -0400

    TIKA-2099 -- temporarily copy/paste commons-compress' ArchiveStreamFactory
    to benefit from updates that enable detection of magic-less .tar files.
---
 .../org/apache/tika/parser/pkg/PackageParser.java  |  25 +-
 .../tika/parser/pkg/TikaArchiveStreamFactory.java  | 565 +++++++++++++++++++++
 .../tika/parser/pkg/ZipContainerDetector.java      |  19 +-
 .../tika/detect/TestContainerAwareDetector.java    |   5 +
 .../tika/parser/pkg/CompressorParserTest.java      |  63 +++
 .../apache/tika/parser/pkg/PackageParserTest.java  |  52 ++
 .../resources/test-documents/testTAR_no_magic.tar  | Bin 0 -> 156160 bytes
 7 files changed, 711 insertions(+), 18 deletions(-)

diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
index 0bf15cf..119c2e6 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
@@ -79,13 +79,14 @@ public class PackageParser extends AbstractParser {
     private static final MediaType ZIP = MediaType.APPLICATION_ZIP;
     private static final MediaType JAR = MediaType.application("java-archive");
     private static final MediaType AR = MediaType.application("x-archive");
+    private static final MediaType ARJ = MediaType.application("x-arj");
     private static final MediaType CPIO = MediaType.application("x-cpio");
     private static final MediaType DUMP = MediaType.application("x-tika-unix-dump");
     private static final MediaType TAR = MediaType.application("x-tar");
     private static final MediaType SEVENZ = MediaType.application("x-7z-compressed");
 
     private static final Set<MediaType> SUPPORTED_TYPES =
-            MediaType.set(ZIP, JAR, AR, CPIO, DUMP, TAR, SEVENZ);
+            MediaType.set(ZIP, JAR, AR, ARJ, CPIO, DUMP, TAR, SEVENZ);
 
     //this can't be static because of the ForkParser
     //lazily load this when parse is called if it is null.
@@ -93,6 +94,7 @@ public class PackageParser extends AbstractParser {
 
     private final Object lock = new Object[0];
 
+    @Deprecated
     static MediaType getMediaType(ArchiveInputStream stream) {
         if (stream instanceof JarArchiveInputStream) {
             return JAR;
@@ -113,6 +115,27 @@ public class PackageParser extends AbstractParser {
         }
     }
 
+    static MediaType getMediaType(String name) {
+        if (TikaArchiveStreamFactory.JAR.equals(name)) {
+            return JAR;
+        } else if (TikaArchiveStreamFactory.ZIP.equals(name)) {
+            return ZIP;
+        } else if (TikaArchiveStreamFactory.AR.equals(name)) {
+            return AR;
+        } else if (TikaArchiveStreamFactory.ARJ.equals(name)) {
+            return ARJ;
+        } else if (TikaArchiveStreamFactory.CPIO.equals(name)) {
+            return CPIO;
+        } else if (TikaArchiveStreamFactory.DUMP.equals(name)) {
+            return DUMP;
+        } else if (TikaArchiveStreamFactory.TAR.equals(name)) {
+            return TAR;
+        } else if (TikaArchiveStreamFactory.SEVEN_Z.equals(name)) {
+            return SEVENZ;
+        } else {
+            return MediaType.OCTET_STREAM;
+        }
+    }
     static boolean isZipArchive(MediaType type) {
         return type.equals(ZIP) || type.equals(JAR);
     }
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/TikaArchiveStreamFactory.java b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/TikaArchiveStreamFactory.java
new file mode 100644
index 0000000..c4b534f
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/TikaArchiveStreamFactory.java
@@ -0,0 +1,565 @@
+package org.apache.tika.parser.pkg;
+    /*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.security.AccessController;
+import java.security.PrivilegedAction;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Iterator;
+import java.util.Locale;
+import java.util.Set;
+import java.util.SortedMap;
+import java.util.TreeMap;
+
+import org.apache.commons.compress.archivers.ArchiveException;
+import org.apache.commons.compress.archivers.ArchiveInputStream;
+import org.apache.commons.compress.archivers.ArchiveOutputStream;
+import org.apache.commons.compress.archivers.ArchiveStreamFactory;
+import org.apache.commons.compress.archivers.ArchiveStreamProvider;
+import org.apache.commons.compress.archivers.StreamingNotSupportedException;
+import org.apache.commons.compress.archivers.ar.ArArchiveInputStream;
+import org.apache.commons.compress.archivers.ar.ArArchiveOutputStream;
+import org.apache.commons.compress.archivers.arj.ArjArchiveInputStream;
+import org.apache.commons.compress.archivers.cpio.CpioArchiveInputStream;
+import org.apache.commons.compress.archivers.cpio.CpioArchiveOutputStream;
+import org.apache.commons.compress.archivers.dump.DumpArchiveInputStream;
+import org.apache.commons.compress.archivers.jar.JarArchiveInputStream;
+import org.apache.commons.compress.archivers.jar.JarArchiveOutputStream;
+import org.apache.commons.compress.archivers.sevenz.SevenZFile;
+import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
+import org.apache.commons.compress.archivers.tar.TarArchiveOutputStream;
+import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
+import org.apache.commons.compress.archivers.zip.ZipArchiveOutputStream;
+import org.apache.commons.compress.utils.IOUtils;
+import org.apache.commons.compress.utils.Lists;
+import org.apache.commons.compress.utils.ServiceLoaderIterator;
+import org.apache.commons.compress.utils.Sets;
+
+/**
+ * This is a temporary copy/paste hack from commons-compress for Tika 1.15
+ * that 1) allows detection without initialization of a stream and
+ * 2) prevents easily preventable OOM on three file formats.
+ *
+ * Once commons-compress 1.14 is released, we will delete this class
+ * and go back to commons-compress's ArchiveStreamFactory.
+ */
+@Deprecated
+class TikaArchiveStreamFactory implements ArchiveStreamProvider {
+
+    private static final int TAR_HEADER_SIZE = 512;
+
+    private static final int DUMP_SIGNATURE_SIZE = 32;
+
+    private static final int SIGNATURE_SIZE = 12;
+
+    private static final ArchiveStreamFactory SINGLETON = new ArchiveStreamFactory();
+
+    /**
+     * Constant (value {@value}) used to identify the AR archive format.
+     * @since 1.1
+     */
+    public static final String AR = "ar";
+
+    /**
+     * Constant (value {@value}) used to identify the ARJ archive format.
+     * Not supported as an output stream type.
+     * @since 1.6
+     */
+    public static final String ARJ = "arj";
+
+    /**
+     * Constant (value {@value}) used to identify the CPIO archive format.
+     * @since 1.1
+     */
+    public static final String CPIO = "cpio";
+
+    /**
+     * Constant (value {@value}) used to identify the Unix DUMP archive format.
+     * Not supported as an output stream type.
+     * @since 1.3
+     */
+    public static final String DUMP = "dump";
+
+    /**
+     * Constant (value {@value}) used to identify the JAR archive format.
+     * @since 1.1
+     */
+    public static final String JAR = "jar";
+
+    /**
+     * Constant used to identify the TAR archive format.
+     * @since 1.1
+     */
+    public static final String TAR = "tar";
+
+    /**
+     * Constant (value {@value}) used to identify the ZIP archive format.
+     * @since 1.1
+     */
+    public static final String ZIP = "zip";
+
+    /**
+     * Constant (value {@value}) used to identify the 7z archive format.
+     * @since 1.8
+     */
+    public static final String SEVEN_Z = "7z";
+
+    /**
+     * Entry encoding, null for the platform default.
+     */
+    private final String encoding;
+
+    /**
+     * Entry encoding, null for the default.
+     */
+    private volatile String entryEncoding;
+
+    private SortedMap<String, ArchiveStreamProvider> archiveInputStreamProviders;
+
+    private SortedMap<String, ArchiveStreamProvider> archiveOutputStreamProviders;
+
+    private static ArrayList<ArchiveStreamProvider> findArchiveStreamProviders() {
+        return Lists.newArrayList(serviceLoaderIterator());
+    }
+
+    static void putAll(Set<String> names, ArchiveStreamProvider provider,
+                       TreeMap<String, ArchiveStreamProvider> map) {
+        for (String name : names) {
+            map.put(toKey(name), provider);
+        }
+    }
+
+    private static Iterator<ArchiveStreamProvider> serviceLoaderIterator() {
+        return new ServiceLoaderIterator<>(ArchiveStreamProvider.class);
+    }
+
+    private static String toKey(final String name) {
+        return name.toUpperCase(Locale.ROOT);
+    }
+
+    /**
+     * Constructs a new sorted map from input stream provider names to provider
+     * objects.
+     *
+     * <p>
+     * The map returned by this method will have one entry for each provider for
+     * which support is available in the current Java virtual machine. If two or
+     * more supported providers have the same name then the resulting map will
+     * contain just one of them; which one it will contain is not specified.
+     * </p>
+     *
+     * <p>
+     * The invocation of this method, and the subsequent use of the resulting
+     * map, may cause time-consuming disk or network I/O operations to occur.
+     * This method is provided for applications that need to enumerate all of
+     * the available providers, for example to allow user provider selection.
+     * </p>
+     *
+     * <p>
+     * This method may return different results at different times if new
+     * providers are dynamically made available to the current Java virtual
+     * machine.
+     * </p>
+     *
+     * @return a sorted map from names to provider objects (this method does not wrap it as unmodifiable)
+     * @since 1.13
+     */
+    public static SortedMap<String, ArchiveStreamProvider> findAvailableArchiveInputStreamProviders() {
+        return AccessController.doPrivileged(new PrivilegedAction<SortedMap<String, ArchiveStreamProvider>>() {
+            @Override
+            public SortedMap<String, ArchiveStreamProvider> run() {
+                TreeMap<String, ArchiveStreamProvider> map = new TreeMap<>();
+                putAll(SINGLETON.getInputStreamArchiveNames(), SINGLETON, map);
+                for (ArchiveStreamProvider provider : findArchiveStreamProviders()) {
+                    putAll(provider.getInputStreamArchiveNames(), provider, map);
+                }
+                return map;
+            }
+        });
+    }
+
+    /**
+     * Constructs a new sorted map from output stream provider names to provider
+     * objects.
+     *
+     * <p>
+     * The map returned by this method will have one entry for each provider for
+     * which support is available in the current Java virtual machine. If two or
+     * more supported providers have the same name then the resulting map will
+     * contain just one of them; which one it will contain is not specified.
+     * </p>
+     *
+     * <p>
+     * The invocation of this method, and the subsequent use of the resulting
+     * map, may cause time-consuming disk or network I/O operations to occur.
+     * This method is provided for applications that need to enumerate all of
+     * the available providers, for example to allow user provider selection.
+     * </p>
+     *
+     * <p>
+     * This method may return different results at different times if new
+     * providers are dynamically made available to the current Java virtual
+     * machine.
+     * </p>
+     *
+     * @return a sorted map from names to provider objects (this method does not wrap it as unmodifiable)
+     * @since 1.13
+     */
+    public static SortedMap<String, ArchiveStreamProvider> findAvailableArchiveOutputStreamProviders() {
+        return AccessController.doPrivileged(new PrivilegedAction<SortedMap<String, ArchiveStreamProvider>>() {
+            @Override
+            public SortedMap<String, ArchiveStreamProvider> run() {
+                TreeMap<String, ArchiveStreamProvider> map = new TreeMap<>();
+                putAll(SINGLETON.getOutputStreamArchiveNames(), SINGLETON, map);
+                for (ArchiveStreamProvider provider : findArchiveStreamProviders()) {
+                    putAll(provider.getOutputStreamArchiveNames(), provider, map);
+                }
+                return map;
+            }
+        });
+    }
+
+
+    /**
+     * Create an instance using the specified encoding.
+     *
+     * @param encoding the encoding to be used.
+     *
+     * @since 1.10
+     */
+    public TikaArchiveStreamFactory(final String encoding) {
+        super();
+        this.encoding = encoding;
+        // Also set the original field so can continue to use it.
+        this.entryEncoding = encoding;
+    }
+
+    /**
+     * Returns the encoding to use for arj, jar, zip, dump, cpio and tar
+     * files, or null for the archiver default.
+     *
+     * @return entry encoding, or null for the archiver default
+     * @since 1.5
+     */
+    public String getEntryEncoding() {
+        return entryEncoding;
+    }
+
+    /**
+     * Sets the encoding to use for arj, jar, zip, dump, cpio and tar files. Use null for the archiver default.
+     *
+     * @param entryEncoding the entry encoding, null uses the archiver default.
+     * @since 1.5
+     * @deprecated 1.10 use {@link #TikaArchiveStreamFactory(String)} to specify the encoding
+     * @throws IllegalStateException if the constructor {@link #TikaArchiveStreamFactory(String)}
+     * was used to specify the factory encoding.
+     */
+    @Deprecated
+    public void setEntryEncoding(final String entryEncoding) {
+        // Note: this does not detect new ArchiveStreamFactory(null) but that does not set the encoding anyway
+        if (encoding != null) {
+            throw new IllegalStateException("Cannot overide encoding set by the constructor");
+        }
+        this.entryEncoding = entryEncoding;
+    }
+
+    /**
+     * Creates an archive input stream from an archiver name and an input stream.
+     *
+     * @param archiverName the archive name,
+     * i.e. {@value #AR}, {@value #ARJ}, {@value #ZIP}, {@value #TAR}, {@value #JAR}, {@value #CPIO}, {@value #DUMP} or {@value #SEVEN_Z}
+     * @param in the input stream
+     * @return the archive input stream
+     * @throws ArchiveException if the archiver name is not known
+     * @throws StreamingNotSupportedException if the format cannot be
+     * read from a stream
+     * @throws IllegalArgumentException if the archiver name or stream is null
+     */
+    public ArchiveInputStream createArchiveInputStream(final String archiverName, final InputStream in)
+            throws ArchiveException {
+        return createArchiveInputStream(archiverName, in, entryEncoding);
+    }
+
+    @Override
+    public ArchiveInputStream createArchiveInputStream(final String archiverName, final InputStream in,
+                                                       final String actualEncoding) throws ArchiveException {
+
+        if (archiverName == null) {
+            throw new IllegalArgumentException("Archivername must not be null.");
+        }
+
+        if (in == null) {
+            throw new IllegalArgumentException("InputStream must not be null.");
+        }
+
+        if (AR.equalsIgnoreCase(archiverName)) {
+            return new ArArchiveInputStream(in);
+        }
+        if (ARJ.equalsIgnoreCase(archiverName)) {
+            if (actualEncoding != null) {
+                return new ArjArchiveInputStream(in, actualEncoding);
+            }
+            return new ArjArchiveInputStream(in);
+        }
+        if (ZIP.equalsIgnoreCase(archiverName)) {
+            if (actualEncoding != null) {
+                return new ZipArchiveInputStream(in, actualEncoding);
+            }
+            return new ZipArchiveInputStream(in);
+        }
+        if (TAR.equalsIgnoreCase(archiverName)) {
+            if (actualEncoding != null) {
+                return new TarArchiveInputStream(in, actualEncoding);
+            }
+            return new TarArchiveInputStream(in);
+        }
+        if (JAR.equalsIgnoreCase(archiverName)) {
+            if (actualEncoding != null) {
+                return new JarArchiveInputStream(in, actualEncoding);
+            }
+            return new JarArchiveInputStream(in);
+        }
+        if (CPIO.equalsIgnoreCase(archiverName)) {
+            if (actualEncoding != null) {
+                return new CpioArchiveInputStream(in, actualEncoding);
+            }
+            return new CpioArchiveInputStream(in);
+        }
+        if (DUMP.equalsIgnoreCase(archiverName)) {
+            if (actualEncoding != null) {
+                return new DumpArchiveInputStream(in, actualEncoding);
+            }
+            return new DumpArchiveInputStream(in);
+        }
+        if (SEVEN_Z.equalsIgnoreCase(archiverName)) {
+            throw new StreamingNotSupportedException(SEVEN_Z);
+        }
+
+        final ArchiveStreamProvider archiveStreamProvider = getArchiveInputStreamProviders().get(toKey(archiverName));
+        if (archiveStreamProvider != null) {
+            return archiveStreamProvider.createArchiveInputStream(archiverName, in, actualEncoding);
+        }
+
+        throw new ArchiveException("Archiver: " + archiverName + " not found.");
+    }
+
+    /**
+     * Creates an archive output stream from an archiver name and an output stream.
+     *
+     * @param archiverName the archive name,
+     * i.e. {@value #AR}, {@value #ZIP}, {@value #TAR}, {@value #JAR} or {@value #CPIO}
+     * @param out the output stream
+     * @return the archive output stream
+     * @throws ArchiveException if the archiver name is not known
+     * @throws StreamingNotSupportedException if the format cannot be
+     * written to a stream
+     * @throws IllegalArgumentException if the archiver name or stream is null
+     */
+    public ArchiveOutputStream createArchiveOutputStream(final String archiverName, final OutputStream out)
+            throws ArchiveException {
+        return createArchiveOutputStream(archiverName, out, entryEncoding);
+    }
+
+    @Override
+    public ArchiveOutputStream createArchiveOutputStream(
+            final String archiverName, final OutputStream out, final String actualEncoding)
+            throws ArchiveException {
+        if (archiverName == null) {
+            throw new IllegalArgumentException("Archivername must not be null.");
+        }
+        if (out == null) {
+            throw new IllegalArgumentException("OutputStream must not be null.");
+        }
+
+        if (AR.equalsIgnoreCase(archiverName)) {
+            return new ArArchiveOutputStream(out);
+        }
+        if (ZIP.equalsIgnoreCase(archiverName)) {
+            final ZipArchiveOutputStream zip = new ZipArchiveOutputStream(out);
+            if (actualEncoding != null) {
+                zip.setEncoding(actualEncoding);
+            }
+            return zip;
+        }
+        if (TAR.equalsIgnoreCase(archiverName)) {
+            if (actualEncoding != null) {
+                return new TarArchiveOutputStream(out, actualEncoding);
+            }
+            return new TarArchiveOutputStream(out);
+        }
+        if (JAR.equalsIgnoreCase(archiverName)) {
+            if (actualEncoding != null) {
+                return new JarArchiveOutputStream(out, actualEncoding);
+            }
+            return new JarArchiveOutputStream(out);
+        }
+        if (CPIO.equalsIgnoreCase(archiverName)) {
+            if (actualEncoding != null) {
+                return new CpioArchiveOutputStream(out, actualEncoding);
+            }
+            return new CpioArchiveOutputStream(out);
+        }
+        if (SEVEN_Z.equalsIgnoreCase(archiverName)) {
+            throw new StreamingNotSupportedException(SEVEN_Z);
+        }
+
+        final ArchiveStreamProvider archiveStreamProvider = getArchiveOutputStreamProviders().get(toKey(archiverName));
+        if (archiveStreamProvider != null) {
+            return archiveStreamProvider.createArchiveOutputStream(archiverName, out, actualEncoding);
+        }
+
+        throw new ArchiveException("Archiver: " + archiverName + " not found.");
+    }
+
+    /**
+     * Create an archive input stream from an input stream, autodetecting
+     * the archive type from the first few bytes of the stream. The InputStream
+     * must support marks, like BufferedInputStream.
+     *
+     * @param in the input stream
+     * @return the archive input stream
+     * @throws ArchiveException if the archiver name is not known
+     * @throws StreamingNotSupportedException if the format cannot be
+     * read from a stream
+     * @throws IllegalArgumentException if the stream is null or does not support mark
+     */
+    public ArchiveInputStream createArchiveInputStream(final InputStream in)
+            throws ArchiveException {
+        return createArchiveInputStream(detect(in), in);
+    }
+
+    /**
+     * Tries to determine the archive type by probing the first bytes of the stream (mark/reset).
+     * @param in input stream; must be non-null and support mark, else IllegalArgumentException
+     * @return name of the detected archiver, e.g. {@link #ZIP} or {@link #TAR}
+     * @throws ArchiveException if no archiver matches or a probe read fails (cause attached)
+     * @since 1.14
+     */
+    public static String detect(InputStream in) throws ArchiveException {
+        if (in == null) {
+            throw new IllegalArgumentException("Stream must not be null.");
+        }
+
+        if (!in.markSupported()) {
+            throw new IllegalArgumentException("Mark is not supported.");
+        }
+
+        final byte[] signature = new byte[SIGNATURE_SIZE];
+        in.mark(signature.length);
+        int signatureLength = -1;
+        try {
+            signatureLength = IOUtils.readFully(in, signature);
+            in.reset();
+        } catch (IOException e) {
+            throw new ArchiveException("IOException while reading signature.", e);
+        }
+
+        if (ZipArchiveInputStream.matches(signature, signatureLength)) {
+            return ZIP;
+        } else if (JarArchiveInputStream.matches(signature, signatureLength)) {
+            return JAR;
+        } else if (ArArchiveInputStream.matches(signature, signatureLength)) {
+            return AR;
+        } else if (CpioArchiveInputStream.matches(signature, signatureLength)) {
+            return CPIO;
+        } else if (ArjArchiveInputStream.matches(signature, signatureLength)) {
+            return ARJ;
+        } else if (SevenZFile.matches(signature, signatureLength)) {
+            return SEVEN_Z;
+        }
+
+        // Dump needs a bigger buffer to check the signature;
+        final byte[] dumpsig = new byte[DUMP_SIGNATURE_SIZE];
+        in.mark(dumpsig.length);
+        try {
+            signatureLength = IOUtils.readFully(in, dumpsig);
+            in.reset();
+        } catch (IOException e) {
+            throw new ArchiveException("IOException while reading dump signature", e);
+        }
+        if (DumpArchiveInputStream.matches(dumpsig, signatureLength)) {
+            return DUMP;
+        }
+
+        // Tar needs an even bigger buffer to check the signature; read the first block
+        final byte[] tarHeader = new byte[TAR_HEADER_SIZE];
+        in.mark(tarHeader.length);
+        try {
+            signatureLength = IOUtils.readFully(in, tarHeader);
+            in.reset();
+        } catch (IOException e) {
+            throw new ArchiveException("IOException while reading tar signature", e);
+        }
+        if (TarArchiveInputStream.matches(tarHeader, signatureLength)) {
+            return TAR;
+        }
+
+        // COMPRESS-117 - improve auto-recognition
+        if (signatureLength >= TAR_HEADER_SIZE) {
+            TarArchiveInputStream tais = null;
+            try {
+                tais = new TarArchiveInputStream(new ByteArrayInputStream(tarHeader));
+                // COMPRESS-191 - verify the header checksum
+                if (tais.getNextTarEntry().isCheckSumOK()) {
+                    return TAR;
+                }
+            } catch (final Exception e) { // NOPMD
+                // can generate IllegalArgumentException as well
+                // as IOException
+                // autodetection, simply not a TAR
+                // ignored
+            } finally {
+                IOUtils.closeQuietly(tais);
+            }
+        }
+        throw new ArchiveException("No Archiver found for the stream signature");
+    }
+
+    public SortedMap<String, ArchiveStreamProvider> getArchiveInputStreamProviders() {
+        if (archiveInputStreamProviders == null) {
+            archiveInputStreamProviders = Collections
+                    .unmodifiableSortedMap(findAvailableArchiveInputStreamProviders());
+        }
+        return archiveInputStreamProviders;
+    }
+
+    public SortedMap<String, ArchiveStreamProvider> getArchiveOutputStreamProviders() {
+        if (archiveOutputStreamProviders == null) {
+            archiveOutputStreamProviders = Collections
+                    .unmodifiableSortedMap(findAvailableArchiveOutputStreamProviders());
+        }
+        return archiveOutputStreamProviders;
+    }
+
+    @Override
+    public Set<String> getInputStreamArchiveNames() {
+        return Sets.newHashSet(AR, ARJ, ZIP, TAR, JAR, CPIO, DUMP, SEVEN_Z);
+    }
+
+    @Override
+    public Set<String> getOutputStreamArchiveNames() {
+        return Sets.newHashSet(AR, ZIP, TAR, JAR, CPIO, SEVEN_Z);
+    }
+
+}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
index 2434d1a..411a050 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
@@ -29,9 +29,6 @@ import java.util.Set;
 import java.util.regex.Pattern;
 
 import org.apache.commons.compress.archivers.ArchiveException;
-import org.apache.commons.compress.archivers.ArchiveInputStream;
-import org.apache.commons.compress.archivers.ArchiveStreamFactory;
-import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
 import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
 import org.apache.commons.compress.archivers.zip.ZipFile;
 import org.apache.commons.compress.compressors.CompressorException;
@@ -112,20 +109,8 @@ public class ZipContainerDetector implements Detector {
 
     private static MediaType detectArchiveFormat(byte[] prefix, int length) {
         try {
-            ArchiveStreamFactory factory = new ArchiveStreamFactory();
-            ArchiveInputStream ais = factory.createArchiveInputStream(
-                    new ByteArrayInputStream(prefix, 0, length));
-            try {
-                if ((ais instanceof TarArchiveInputStream)
-                        && !TarArchiveInputStream.matches(prefix, length)) {
-                    // ArchiveStreamFactory is too relaxed, see COMPRESS-117
-                    return MediaType.OCTET_STREAM;
-                } else {
-                    return PackageParser.getMediaType(ais);
-                }
-            } finally {
-                IOUtils.closeQuietly(ais);
-            }
+            String name = TikaArchiveStreamFactory.detect(new ByteArrayInputStream(prefix, 0, length));
+            return PackageParser.getMediaType(name);
         } catch (ArchiveException e) {
             return MediaType.OCTET_STREAM;
         }
diff --git a/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java b/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
index 9cff7c4..b6a79eb 100644
--- a/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
+++ b/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
@@ -362,6 +362,11 @@ public class TestContainerAwareDetector {
     }
 
     @Test
+    public void testTarWithNoMagic() throws Exception {
+        assertTypeByData("testTAR_no_magic.tar", "application/x-tar");
+    }
+
+    @Test
     public void testLZMAOOM() throws Exception {
         assertTypeByData("testLZMA_oom", "application/x-lzma");
     }
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/pkg/CompressorParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/pkg/CompressorParserTest.java
new file mode 100644
index 0000000..77531fc
--- /dev/null
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/pkg/CompressorParserTest.java
@@ -0,0 +1,63 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.pkg;
+
+
+import static org.junit.Assert.fail;
+
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+public class CompressorParserTest {
+    private static final Set<MediaType> NOT_COVERED = new HashSet<>(); // media types skipped by testCoverage
+
+    @BeforeClass
+    public static void setUp() {
+        NOT_COVERED.add(MediaType.application("x-snappy-framed"));
+    }
+
+    @Test
+    public void testCoverage() throws Exception {
+        //test that the compressor parser covers all inputstreams handled
+        //by CompressorStreamFactory.  When we update commons-compress, and they add
+        //a new stream type, we want to make sure that we're handling it.
+        TikaCompressorStreamFactory compressorStreamFactory = new TikaCompressorStreamFactory(true, 1000);
+        CompressorParser compressorParser = new CompressorParser();
+        ParseContext parseContext = new ParseContext();
+        for (String name : compressorStreamFactory.getInputStreamCompressorNames()) {
+            MediaType mt = CompressorParser.getMediaType(name);
+            if (NOT_COVERED.contains(mt)) {
+                continue;
+            }
+            //use this instead of assertNotEquals so that we report the
+            //name of the missing stream
+            if (mt.equals(MediaType.OCTET_STREAM)) {
+                fail("getting octet-stream for: "+name);
+            }
+
+            if (! compressorParser.getSupportedTypes(parseContext).contains(mt)) {
+                fail("CompressorParser should support: "+mt.toString());
+            }
+        }
+    }
+}
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/pkg/PackageParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/pkg/PackageParserTest.java
new file mode 100644
index 0000000..412228c
--- /dev/null
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/pkg/PackageParserTest.java
@@ -0,0 +1,52 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.pkg;
+
+
+import static org.junit.Assert.fail;
+
+import java.nio.charset.StandardCharsets;
+
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.junit.Test;
+
+public class PackageParserTest {
+
+    @Test
+    public void testCoverage() throws Exception {
+        //test that the package parser covers all inputstreams handled
+        //by ArchiveStreamFactory.  When we update commons-compress, and they add
+        //a new stream type, we want to make sure that we're handling it.
+        TikaArchiveStreamFactory archiveStreamFactory = new TikaArchiveStreamFactory(StandardCharsets.UTF_8.name());
+        PackageParser packageParser = new PackageParser();
+        ParseContext parseContext = new ParseContext();
+        for (String name : archiveStreamFactory.getInputStreamArchiveNames()) {
+            MediaType mt = PackageParser.getMediaType(name);
+            //use this instead of assertNotEquals so that we report the
+            //name of the missing stream
+            if (mt.equals(MediaType.OCTET_STREAM)) {
+                fail("getting octet-stream for: "+name);
+            }
+
+            if (! packageParser.getSupportedTypes(parseContext).contains(mt)) {
+                fail("PackageParser should support: "+mt.toString());
+            }
+        }
+    }
+}
diff --git a/tika-parsers/src/test/resources/test-documents/testTAR_no_magic.tar b/tika-parsers/src/test/resources/test-documents/testTAR_no_magic.tar
new file mode 100644
index 0000000..0583e4c
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testTAR_no_magic.tar differ

-- 
To stop receiving notification emails like this one, please contact
['"commits@tika.apache.org" <co...@tika.apache.org>'].