You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2019/04/19 19:33:08 UTC
[tika] branch master updated: TIKA-2849 -- move to streaming
detection of zip files and apply markLimit to POIFSContainerDetector;
thank you, Jukka!
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/master by this push:
new 64877c5 TIKA-2849 -- move to streaming detection of zip files and apply markLimit to POIFSContainerDetector; thank you, Jukka!
new cf5b812 Merge branch 'TIKA-2849'
64877c5 is described below
commit 64877c5a6187ecd13e098e99edc97a1f6dc09112
Author: TALLISON <ta...@apache.org>
AuthorDate: Fri Apr 12 17:11:03 2019 -0400
TIKA-2849 -- move to streaming detection of zip files and apply markLimit to POIFSContainerDetector; thank you, Jukka!
---
CHANGES.txt | 7 +
.../org/apache/tika/io/BoundedInputStream.java | 118 +++++++
.../java/org/apache/tika/io/TikaInputStream.java | 35 ++-
.../tika/parser/digest/InputStreamDigester.java | 103 +------
.../src/test/java/org/apache/tika/TikaTest.java | 2 +-
.../org/apache/tika/parser/epub/EpubParser.java | 12 +-
.../tika/parser/iwork/IWorkPackageParser.java | 2 +-
.../parser/microsoft/POIFSContainerDetector.java | 22 +-
.../microsoft/ooxml/OOXMLExtractorFactory.java | 42 ++-
.../parser/pkg/StreamingZipContainerDetector.java | 222 ++++++++++++++
.../tika/parser/pkg/ZipContainerDetector.java | 340 +++++++--------------
.../tika/parser/pkg/ZipContainerDetectorBase.java | 162 ++++++++++
.../org/apache/tika/parser/utils/ZipSalvager.java | 75 ++---
.../parser/microsoft/ooxml/TruncatedOOXMLTest.java | 1 +
.../tika/parser/pkg/ZipContainerDetectorTest.java | 179 ++++++++++-
.../org/apache/tika/parser/pkg/tika-config.xml | 31 ++
16 files changed, 954 insertions(+), 399 deletions(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index c73a3df..af8fcd0 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -8,6 +8,13 @@ Release 2.0.0 - ???
Release 1.21 - ????
+ * The ZipContainerDetector's default behavior was changed to run
+ streaming detection up to its markLimit. Users can get the
+ legacy behavior (spool-to-file/rely-on-underlying-file-in-TikaInputStream)
+ by setting markLimit=-1. The POIFSContainerDetector requires an underlying file;
+ it will try to spool the file to disk; if the file's length is > markLimit,
+ it will not attempt detection; set markLimit to -1 for legacy behavior (TIKA-2849).
+
* Upgrade PDFBox to 2.0.14 (TIKA-2834).
* Add CSV detection and replace TXTParser with TextAndCSVParser;
diff --git a/tika-core/src/main/java/org/apache/tika/io/BoundedInputStream.java b/tika-core/src/main/java/org/apache/tika/io/BoundedInputStream.java
new file mode 100644
index 0000000..dabedf5
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/io/BoundedInputStream.java
@@ -0,0 +1,118 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.io;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+/**
+ * Very slight modification of Commons' BoundedInputStream
+ * so that we can figure out if this hit the bound or not.
+ */
+public class BoundedInputStream extends InputStream {
+
+    private final static int EOF = -1;
+    private final long max;
+    private final InputStream in;
+    private long pos;
+    //value of pos at the time of the last call to mark();
+    //restored by reset() so that the bound stays accurate
+    private long markPos;
+
+    /**
+     * @param max maximum number of bytes that may be read through
+     *            this stream; a negative value disables the bound
+     * @param in the stream to read from
+     */
+    public BoundedInputStream(long max, InputStream in) {
+        this.max = max;
+        this.in = in;
+    }
+
+    /**
+     * Reads a single byte from the delegate unless the bound has
+     * already been reached.
+     *
+     * @return the byte read or -1 if the end of stream or
+     * the limit has been reached.
+     * @throws IOException if an I/O error occurs
+     */
+    @Override
+    public int read() throws IOException {
+        if (max >= 0 && pos >= max) {
+            return EOF;
+        }
+        final int result = in.read();
+        //only count bytes that were actually read; counting the EOF
+        //marker would make hasHitBound() report a hit on a stream
+        //that ended one byte short of the bound
+        if (result != EOF) {
+            pos++;
+        }
+        return result;
+    }
+
+    /**
+     * Invokes the delegate's <code>read(byte[])</code> method.
+     *
+     * @param b the buffer to read the bytes into
+     * @return the number of bytes read or -1 if the end of stream or
+     * the limit has been reached.
+     * @throws IOException if an I/O error occurs
+     */
+    @Override
+    public int read(final byte[] b) throws IOException {
+        return this.read(b, 0, b.length);
+    }
+
+    /**
+     * Invokes the delegate's <code>read(byte[], int, int)</code> method.
+     *
+     * @param b the buffer to read the bytes into
+     * @param off The start offset
+     * @param len The number of bytes to read
+     * @return the number of bytes read or -1 if the end of stream or
+     * the limit has been reached.
+     * @throws IOException if an I/O error occurs
+     */
+    @Override
+    public int read(final byte[] b, final int off, final int len) throws IOException {
+        if (max >= 0 && pos >= max) {
+            return EOF;
+        }
+        //never ask the delegate for more than what is left before the bound
+        final long maxRead = max >= 0 ? Math.min(len, max - pos) : len;
+        final int bytesRead = in.read(b, off, (int) maxRead);
+
+        if (bytesRead == EOF) {
+            return EOF;
+        }
+
+        pos += bytesRead;
+        return bytesRead;
+    }
+
+    /**
+     * Invokes the delegate's <code>skip(long)</code> method.
+     *
+     * @param n the number of bytes to skip
+     * @return the actual number of bytes skipped
+     * @throws IOException if an I/O error occurs
+     */
+    @Override
+    public long skip(final long n) throws IOException {
+        final long toSkip = max >= 0 ? Math.min(n, max - pos) : n;
+        final long skippedBytes = in.skip(toSkip);
+        pos += skippedBytes;
+        return skippedBytes;
+    }
+
+    /**
+     * Resets the delegate and restores the byte count to the value
+     * it had when {@link #mark(int)} was last called (0 if it was never called).
+     *
+     * @throws IOException if the delegate cannot be reset
+     */
+    @Override
+    public void reset() throws IOException {
+        in.reset();
+        pos = markPos;
+    }
+
+    /**
+     * Marks the delegate and remembers the current byte count.
+     *
+     * @param readLimit passed through to the delegate's <code>mark(int)</code>
+     */
+    @Override
+    public void mark(int readLimit) {
+        in.mark(readLimit);
+        markPos = pos;
+    }
+
+    /**
+     * mark/reset are delegated, so support for them is whatever
+     * the delegate reports.
+     *
+     * @return the delegate's <code>markSupported()</code>
+     */
+    @Override
+    public boolean markSupported() {
+        return in.markSupported();
+    }
+
+    /**
+     * @return <code>true</code> if at least <code>max</code> bytes have been
+     * consumed through this stream. Note that this is also <code>true</code>
+     * for a stream that is exactly <code>max</code> bytes long, and that it
+     * is always <code>true</code> when <code>max</code> is negative.
+     */
+    public boolean hasHitBound() {
+        return pos >= max;
+    }
+}
+
diff --git a/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java b/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java
index 96f922f..855ab28 100644
--- a/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java
+++ b/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java
@@ -623,14 +623,45 @@ public class TikaInputStream extends TaggedInputStream {
return path != null;
}
+
+ /**
+ * If the user created this TikaInputStream with a file,
+ * the original file will be returned. If not, the entire stream
+ * will be spooled to a temporary file which will be deleted
+ * upon the close of this TikaInputStream.
+ * <p>
+ * This is the same as calling <code>getPath(-1)</code>, i.e. no limit
+ * is placed on the number of bytes spooled.
+ *
+ * @return the path of the original file, or of the temporary file
+ * that the stream was spooled to
+ * @throws IOException if there is no underlying file and the stream
+ * has already been read from, or if spooling to the temporary file fails
+ */
public Path getPath() throws IOException {
+ return getPath(-1);
+ }
+
+ /**
+ *
+ * @param maxBytes if this is less than 0 and if an underlying file doesn't already exist,
+ * the full file will be spooled to disk
+ * @return the original path used in the initialization of this TikaInputStream,
+ * a temporary file if the stream was shorter than <code>maxBytes</code>, or <code>null</code>
+ * if the underlying stream was longer than maxBytes.
+ * @throws IOException
+ */
+ public Path getPath(int maxBytes) throws IOException {
if (path == null) {
if (position > 0) {
throw new IOException("Stream is already being read");
} else {
- // Spool the entire stream into a temporary file
path = tmp.createTempFile();
- Files.copy(in, path, REPLACE_EXISTING);
+ if (maxBytes > -1) {
+ try (InputStream lookAhead = new LookaheadInputStream(in, maxBytes)) {
+ Files.copy(lookAhead, path, REPLACE_EXISTING);
+ if (Files.size(path) >= maxBytes) {
+ return null;
+ }
+ }
+ } else {
+ // Spool the entire stream into a temporary file
+ Files.copy(in, path, REPLACE_EXISTING);
+ }
// Create a new input stream and make sure it'll get closed
InputStream newStream = Files.newInputStream(path);
diff --git a/tika-core/src/main/java/org/apache/tika/parser/digest/InputStreamDigester.java b/tika-core/src/main/java/org/apache/tika/parser/digest/InputStreamDigester.java
index a208fab..3d3ff17 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/digest/InputStreamDigester.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/digest/InputStreamDigester.java
@@ -26,6 +26,7 @@ import java.security.NoSuchAlgorithmException;
import java.security.Provider;
import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.BoundedInputStream;
import org.apache.tika.io.IOExceptionWithCause;
import org.apache.tika.io.TemporaryResources;
import org.apache.tika.io.TikaInputStream;
@@ -126,7 +127,7 @@ public class InputStreamDigester implements DigestingParser.Digester {
//try the usual mark/reset stuff.
//however, if you actually hit the bound,
//then stop and spool to file via TikaInputStream
- SimpleBoundedInputStream bis = new SimpleBoundedInputStream(markLimit, is);
+ BoundedInputStream bis = new BoundedInputStream(markLimit, is);
boolean finishedStream = false;
bis.mark(markLimit + 1);
finishedStream = digestStream(bis, metadata);
@@ -153,7 +154,6 @@ public class InputStreamDigester implements DigestingParser.Digester {
}
}
-
private String getMetadataKey() {
return TikaCoreProperties.TIKA_META_PREFIX +
"digest" + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER +
@@ -179,8 +179,8 @@ public class InputStreamDigester implements DigestingParser.Digester {
updateDigest(messageDigest, is);
digestBytes = messageDigest.digest();
- if (is instanceof SimpleBoundedInputStream) {
- if (((SimpleBoundedInputStream) is).hasHitBound()) {
+ if (is instanceof BoundedInputStream) {
+ if (((BoundedInputStream) is).hasHitBound()) {
return false;
}
}
@@ -202,99 +202,4 @@ public class InputStreamDigester implements DigestingParser.Digester {
return digest;
}
-
- /**
- * Very slight modification of Commons' BoundedInputStream
- * so that we can figure out if this hit the bound or not.
- */
- private static class SimpleBoundedInputStream extends InputStream {
- private final static int EOF = -1;
- private final long max;
- private final InputStream in;
- private long pos;
-
- private SimpleBoundedInputStream(long max, InputStream in) {
- this.max = max;
- this.in = in;
- }
-
- @Override
- public int read() throws IOException {
- if (max >= 0 && pos >= max) {
- return EOF;
- }
- final int result = in.read();
- pos++;
- return result;
- }
-
- /**
- * Invokes the delegate's <code>read(byte[])</code> method.
- *
- * @param b the buffer to read the bytes into
- * @return the number of bytes read or -1 if the end of stream or
- * the limit has been reached.
- * @throws IOException if an I/O error occurs
- */
- @Override
- public int read(final byte[] b) throws IOException {
- return this.read(b, 0, b.length);
- }
-
- /**
- * Invokes the delegate's <code>read(byte[], int, int)</code> method.
- *
- * @param b the buffer to read the bytes into
- * @param off The start offset
- * @param len The number of bytes to read
- * @return the number of bytes read or -1 if the end of stream or
- * the limit has been reached.
- * @throws IOException if an I/O error occurs
- */
- @Override
- public int read(final byte[] b, final int off, final int len) throws IOException {
- if (max >= 0 && pos >= max) {
- return EOF;
- }
- final long maxRead = max >= 0 ? Math.min(len, max - pos) : len;
- final int bytesRead = in.read(b, off, (int) maxRead);
-
- if (bytesRead == EOF) {
- return EOF;
- }
-
- pos += bytesRead;
- return bytesRead;
- }
-
- /**
- * Invokes the delegate's <code>skip(long)</code> method.
- *
- * @param n the number of bytes to skip
- * @return the actual number of bytes skipped
- * @throws IOException if an I/O error occurs
- */
- @Override
- public long skip(final long n) throws IOException {
- final long toSkip = max >= 0 ? Math.min(n, max - pos) : n;
- final long skippedBytes = in.skip(toSkip);
- pos += skippedBytes;
- return skippedBytes;
- }
-
- @Override
- public void reset() throws IOException {
- in.reset();
- pos = 0;
- }
-
- @Override
- public void mark(int readLimit) {
- in.mark(readLimit);
- }
-
- public boolean hasHitBound() {
- return pos >= max;
- }
- }
}
diff --git a/tika-core/src/test/java/org/apache/tika/TikaTest.java b/tika-core/src/test/java/org/apache/tika/TikaTest.java
index 00d8600..0aaaf35 100644
--- a/tika-core/src/test/java/org/apache/tika/TikaTest.java
+++ b/tika-core/src/test/java/org/apache/tika/TikaTest.java
@@ -394,7 +394,7 @@ public abstract class TikaTest {
IOUtils.copy(is, bos);
}
if (truncatedLength > bos.toByteArray().length) {
- throw new EOFException("Can't truncate beyond file length");
+ throw new EOFException("Can't truncate beyond file length: "+bos.toByteArray().length);
}
byte[] truncated = new byte[truncatedLength];
System.arraycopy(bos.toByteArray(), 0, truncated, 0, truncatedLength);
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/epub/EpubParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/epub/EpubParser.java
index df5b221..49019b6 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/epub/EpubParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/epub/EpubParser.java
@@ -175,6 +175,12 @@ public class EpubParser extends AbstractParser {
TemporaryResources temporaryResources = null;
if (TikaInputStream.isTikaInputStream(stream)) {
tis = TikaInputStream.cast(stream);
+ if (tis.getOpenContainer() instanceof ZipFile) {
+ bufferedParseZipFile(
+ (ZipFile)tis.getOpenContainer(),
+ bodyHandler, xhtml, metadata, context, true);
+ return;
+ }
} else {
temporaryResources = new TemporaryResources();
tis = TikaInputStream.get(new CloseShieldInputStream(stream), temporaryResources);
@@ -192,7 +198,11 @@ public class EpubParser extends AbstractParser {
tis.close();
}
}
- bufferedParseZipFile(zipFile, bodyHandler, xhtml, metadata, context, true);
+ try {
+ bufferedParseZipFile(zipFile, bodyHandler, xhtml, metadata, context, true);
+ } finally {
+ zipFile.close();
+ }
}
private void trySalvage(Path brokenZip, ContentHandler bodyHandler,
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/iwork/IWorkPackageParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/iwork/IWorkPackageParser.java
index 5d8f01a..2ffbf56 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/iwork/IWorkPackageParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/iwork/IWorkPackageParser.java
@@ -119,7 +119,7 @@ public class IWorkPackageParser extends AbstractParser {
return detectType(zip);
}
- private static IWORKDocumentType detectType(InputStream stream) {
+ public static IWORKDocumentType detectType(InputStream stream) {
QName qname = new XmlRootExtractor().extractRootElement(stream);
if (qname != null) {
String uri = qname.getNamespaceURI();
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java
index 6f32984..576cf52 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java
@@ -16,13 +16,13 @@
*/
package org.apache.tika.parser.microsoft;
-import static org.apache.tika.mime.MediaType.OCTET_STREAM;
import static org.apache.tika.mime.MediaType.application;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
+import java.nio.file.Path;
import java.util.Collections;
import java.util.HashSet;
import java.util.Set;
@@ -35,7 +35,9 @@ import org.apache.poi.poifs.filesystem.DocumentInputStream;
import org.apache.poi.poifs.filesystem.DocumentNode;
import org.apache.poi.poifs.filesystem.Entry;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+import org.apache.tika.config.Field;
import org.apache.tika.detect.Detector;
+import org.apache.tika.io.LookaheadInputStream;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
@@ -175,6 +177,13 @@ public class POIFSContainerDetector implements Detector {
*/
private static final Pattern mppDataMatch = Pattern.compile("\\s\\s\\s\\d+");
+ @Field
+ private int markLimit = 16 * 1024 * 1024;
+
+ public void setMarkLimit(int markLimit) { // limit handed to TikaInputStream.getPath(int) when the stream is spooled for detection
+ this.markLimit = markLimit; // per the CHANGES.txt note in this commit, -1 restores the legacy spool-everything behavior
+ }
+
/**
* Internal detection of the specific kind of OLE2 document, based on the
* names of the top level streams within the file.
@@ -379,14 +388,19 @@ public class POIFSContainerDetector implements Detector {
return false;
}
- private static Set<String> getTopLevelNames(TikaInputStream stream)
+ private Set<String> getTopLevelNames(TikaInputStream stream)
throws IOException {
// Force the document stream to a (possibly temporary) file
// so we don't modify the current position of the stream
- File file = stream.getFile();
+ Path file = stream.getPath(markLimit);
+
+ //if the stream was longer than markLimit, don't detect
+ if (file == null) {
+ return Collections.emptySet();
+ }
try {
- POIFSFileSystem fs = new POIFSFileSystem(file, true);
+ POIFSFileSystem fs = new POIFSFileSystem(file.toFile(), true);
// Optimize a possible later parsing process by keeping
// a reference to the already opened POI file system
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
index 017469b..4ac436c 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
@@ -16,6 +16,7 @@
*/
package org.apache.tika.parser.microsoft.ooxml;
+import java.io.EOFException;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
@@ -55,6 +56,7 @@ import org.apache.tika.parser.microsoft.ooxml.xslf.XSLFEventBasedPowerPointExtra
import org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFEventBasedWordExtractor;
import org.apache.tika.parser.pkg.ZipContainerDetector;
import org.apache.tika.parser.utils.ZipSalvager;
+import org.apache.tika.utils.RereadableInputStream;
import org.apache.xmlbeans.XmlException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -68,6 +70,7 @@ import org.xml.sax.SAXException;
public class OOXMLExtractorFactory {
private static final Logger LOG = LoggerFactory.getLogger(OOXMLExtractorFactory.class);
+ private static final int MAX_BUFFER_LENGTH = 1000000;
public static void parse(
InputStream stream, ContentHandler baseHandler,
@@ -92,20 +95,45 @@ public class OOXMLExtractorFactory {
try {
pkg = OPCPackage.open(tis.getFile().getPath(), PackageAccess.READ);
} catch (InvalidOperationException e) {
- tmpRepairedCopy = File.createTempFile("tika-ooxml-repair", "");
+ tmpRepairedCopy = File.createTempFile("tika-ooxml-repair-", "");
ZipSalvager.salvageCopy(tis.getFile(), tmpRepairedCopy);
pkg = OPCPackage.open(tmpRepairedCopy, PackageAccess.READ);
}
tis.setOpenContainer(pkg);
} else {
- InputStream shield = new CloseShieldInputStream(stream);
- pkg = OPCPackage.open(shield);
+ //OPCPackage slurps rris into memory so we can close rris
+ //without apparent problems
+ try (RereadableInputStream rereadableInputStream =
+ new RereadableInputStream(stream, MAX_BUFFER_LENGTH,
+ true, false)) {
+ try {
+ pkg = OPCPackage.open(rereadableInputStream);
+ } catch (EOFException e) {
+ rereadableInputStream.rewind();
+ tmpRepairedCopy = File.createTempFile("tika-ooxml-repair-", "");
+ ZipSalvager.salvageCopy(rereadableInputStream, tmpRepairedCopy);
+ //if there isn't enough left to be opened as a package
+ //throw an exception -- we may want to fall back to streaming
+ //parsing
+ pkg = OPCPackage.open(tmpRepairedCopy, PackageAccess.READ);
+ }
+ }
+ }
+
+ MediaType type = null;
+ String mediaTypeString = metadata.get(Metadata.CONTENT_TYPE);
+ if (mediaTypeString != null) {
+ type = MediaType.parse(mediaTypeString);
+ }
+ if (type != null && OOXMLParser.UNSUPPORTED_OOXML_TYPES.contains(type)) {
+ // Not a supported type, delegate to Empty Parser
+ EmptyParser.INSTANCE.parse(stream, baseHandler, metadata, context);
+ return;
}
- // Get the type, and ensure it's one we handle
- MediaType type = ZipContainerDetector.detectOfficeOpenXML(pkg);
- if (type == null) {
- type = ZipContainerDetector.detectXPSOPC(pkg);
+ if (type == null || ! OOXMLParser.SUPPORTED_TYPES.contains(type)) {
+ // Get the type, and ensure it's one we handle
+ type = ZipContainerDetector.detectOfficeOpenXML(pkg);
}
if (type == null || OOXMLParser.UNSUPPORTED_OOXML_TYPES.contains(type)) {
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/StreamingZipContainerDetector.java b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/StreamingZipContainerDetector.java
new file mode 100644
index 0000000..61db730
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/StreamingZipContainerDetector.java
@@ -0,0 +1,222 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pkg;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+
+import java.io.InputStream;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+import java.util.concurrent.ConcurrentHashMap;
+
+import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
+import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
+import org.apache.commons.io.IOUtils;
+import org.apache.poi.xslf.usermodel.XSLFRelation;
+import org.apache.poi.xssf.usermodel.XSSFRelation;
+import org.apache.poi.xwpf.usermodel.XWPFRelation;
+import org.apache.tika.io.CloseShieldInputStream;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.iwork.IWorkPackageParser;
+import org.apache.tika.utils.XMLReaderUtils;
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+/**
+ * Detects the type of a zip-based container by streaming through its
+ * entries instead of requiring random access to a file.
+ */
+class StreamingZipContainerDetector extends ZipContainerDetectorBase {
+
+    /**
+     * Maps the content types that can appear in an OOXML package's
+     * <code>[Content_Types].xml</code> to the media type that detection reports.
+     */
+    static Map<String, MediaType> OOXML_CONTENT_TYPES = new ConcurrentHashMap<>();
+    static {
+        OOXML_CONTENT_TYPES.put(XWPFRelation.DOCUMENT.getContentType(), DOCX);
+        OOXML_CONTENT_TYPES.put(XWPFRelation.MACRO_DOCUMENT.getContentType(), DOCM);
+        OOXML_CONTENT_TYPES.put(XWPFRelation.TEMPLATE.getContentType(), DOTX);
+
+        OOXML_CONTENT_TYPES.put(XSSFRelation.WORKBOOK.getContentType(), XLSX);
+        OOXML_CONTENT_TYPES.put(XSSFRelation.MACROS_WORKBOOK.getContentType(), XLSM);
+        OOXML_CONTENT_TYPES.put(XSSFRelation.XLSB_BINARY_WORKBOOK.getContentType(), XLSB);
+
+        OOXML_CONTENT_TYPES.put(XSLFRelation.MAIN.getContentType(), PPTX);
+        OOXML_CONTENT_TYPES.put(XSLFRelation.MACRO.getContentType(), PPSM);
+        OOXML_CONTENT_TYPES.put(XSLFRelation.MACRO_TEMPLATE.getContentType(), POTM);
+        OOXML_CONTENT_TYPES.put(XSLFRelation.PRESENTATIONML.getContentType(), PPSX);
+        OOXML_CONTENT_TYPES.put(XSLFRelation.PRESENTATION_MACRO.getContentType(), PPTM);
+        //the template content type used to be registered twice (first as PPTM,
+        //then as POTX); only the last registration ever took effect
+        OOXML_CONTENT_TYPES.put(XSLFRelation.PRESENTATIONML_TEMPLATE.getContentType(), POTX);
+        OOXML_CONTENT_TYPES.put(XSLFRelation.THEME_MANAGER.getContentType(), THMX);
+        OOXML_CONTENT_TYPES.put("application/vnd.ms-package.xps-fixeddocumentsequence+xml", XPS);
+    }
+
+    /**
+     *
+     * @param is inputstream to read from. Callers must mark/reset the stream
+     *           before/after this call to detect. This call does not close the stream!
+     *           Depending on the file type, this call to detect may read the entire stream.
+     *           Make sure to use a {@link org.apache.tika.io.BoundedInputStream} or similar
+     *           if you want to protect against reading the entire stream.
+     * @return the media type found while streaming the entries, a type
+     *         derived from the entry names that were seen, or
+     *         {@link MediaType#APPLICATION_ZIP} if nothing more specific matched
+     */
+    static MediaType detect(InputStream is) {
+
+        Set<String> fileNames = new HashSet<>();
+        Set<String> directoryNames = new HashSet<>();
+        try (ZipArchiveInputStream zipArchiveInputStream =
+                     new ZipArchiveInputStream(new CloseShieldInputStream(is))) {
+            ZipArchiveEntry zae = zipArchiveInputStream.getNextZipEntry();
+            while (zae != null) {
+                String name = zae.getName();
+                if (zae.isDirectory()) {
+                    directoryNames.add(name);
+                    zae = zipArchiveInputStream.getNextZipEntry();
+                    continue;
+                }
+                fileNames.add(name);
+                //we could also parse _rel/.rels, but if
+                // there isn't a valid content_types, then POI
+                //will throw an exception...Better to backoff to PKG
+                //than correctly identify a truncated file
+                if (name.equals("[Content_Types].xml")) {
+                    MediaType mt = parseOOXMLContentTypes(zipArchiveInputStream);
+                    if (mt != null) {
+                        return mt;
+                    }
+                    return TIKA_OOXML;
+                } else if (IWorkPackageParser.IWORK_CONTENT_ENTRIES.contains(name)) {
+                    IWorkPackageParser.IWORKDocumentType type = IWorkPackageParser.IWORKDocumentType.detectType(zipArchiveInputStream);
+                    if (type != null) {
+                        return type.getType();
+                    }
+                } else if (name.equals("mimetype")) {
+                    //odt -- TODO -- bound the read
+                    MediaType declared =
+                            MediaType.parse(IOUtils.toString(zipArchiveInputStream, UTF_8));
+                    //if the entry does not hold a parseable media type, keep
+                    //scanning instead of handing null back to the caller
+                    if (declared != null) {
+                        return declared;
+                    }
+                }
+                zae = zipArchiveInputStream.getNextZipEntry();
+            }
+        } catch (SecurityException e) {
+            throw e;
+        } catch (Exception e) {
+            //swallow: detection is best effort. A truncated or corrupt zip
+            //(or hitting the caller's bound) ends the scan, and we fall
+            //back to whatever entry names were collected so far
+        }
+        //entrynames is the union of directory names and file names
+        Set<String> entryNames = new HashSet<>(fileNames);
+        entryNames.addAll(directoryNames);
+        MediaType mt = detectKmz(fileNames);
+        if (mt != null) {
+            return mt;
+        }
+        mt = detectJar(entryNames);
+        if (mt != null) {
+            return mt;
+        }
+        mt = detectIpa(entryNames);
+        if (mt != null) {
+            return mt;
+        }
+        mt = detectIWorks(entryNames);
+        if (mt != null) {
+            return mt;
+        }
+        //call it OOXML once a third hint entry has been seen
+        int hits = 0;
+        for (String s : OOXML_HINTS) {
+            if (entryNames.contains(s)) {
+                if (++hits > 2) {
+                    return TIKA_OOXML;
+                }
+            }
+        }
+        return MediaType.APPLICATION_ZIP;
+    }
+
+    /**
+     * @param entryNames names of the entries seen in the zip
+     * @return the general iWork type if the common iWork entry
+     *         is present, <code>null</code> otherwise
+     */
+    private static MediaType detectIWorks(Set<String> entryNames) {
+        //general iworks
+        if (entryNames.contains(IWorkPackageParser.IWORK_COMMON_ENTRY)) {
+            return MediaType.application("vnd.apple.iwork");
+        }
+        return null;
+    }
+
+
+    /**
+     * Collects the values of all <code>Type</code> attributes in the given
+     * XML stream.
+     *
+     * @param is stream to parse
+     * @return the values collected before parsing ended or failed; never <code>null</code>
+     */
+    public static Set<String> parseOOXMLRels(InputStream is) {
+        RelsHandler relsHandler = new RelsHandler();
+        try {
+            XMLReaderUtils.parseSAX(is, relsHandler, new ParseContext());
+        } catch (SecurityException e) {
+            throw e;
+        } catch (Exception e) {
+            //swallow: best effort, return whatever was collected
+            //before the parse failed
+        }
+        return relsHandler.rels;
+    }
+
+    /**
+     * Records the value of every <code>Type</code> attribute it sees.
+     */
+    private static class RelsHandler extends DefaultHandler {
+        Set<String> rels = new HashSet<>();
+
+        @Override
+        public void startElement(String uri, String localName,
+                                 String name, Attributes attrs) throws SAXException {
+            for (int i = 0; i < attrs.getLength(); i++) {
+                String attrName = attrs.getLocalName(i);
+                if (attrName.equals("Type")) {
+                    rels.add(attrs.getValue(i));
+                }
+            }
+        }
+    }
+
+    /**
+     * Looks for the first <code>ContentType</code> attribute whose value is
+     * a key of {@link #OOXML_CONTENT_TYPES}.
+     *
+     * @param is stream to parse
+     * @return the media type mapped to that content type, or <code>null</code>
+     *         if none was found before parsing ended or failed
+     */
+    public static MediaType parseOOXMLContentTypes(InputStream is) {
+        ContentTypeHandler contentTypeHandler = new ContentTypeHandler();
+        try {
+            XMLReaderUtils.parseSAX(is, contentTypeHandler, new ParseContext());
+        } catch (SecurityException e) {
+            throw e;
+        } catch (Exception e) {
+            //swallow: this is where the StoppingEarlyException thrown by the
+            //handler ends up; any other parse failure just means no type was found
+        }
+        return contentTypeHandler.mediaType;
+    }
+
+
+    /**
+     * Stops the parse as soon as a known content type has been seen.
+     */
+    private static class ContentTypeHandler extends DefaultHandler {
+
+        private MediaType mediaType = null;
+
+        @Override
+        public void startElement(String uri, String localName,
+                                 String name, Attributes attrs) throws SAXException {
+            for (int i = 0; i < attrs.getLength(); i++) {
+                String attrName = attrs.getLocalName(i);
+                if (attrName.equals("ContentType")) {
+                    String contentType = attrs.getValue(i);
+                    if (OOXML_CONTENT_TYPES.containsKey(contentType)) {
+                        mediaType = OOXML_CONTENT_TYPES.get(contentType);
+                        //no need to read the rest of the entry
+                        throw new StoppingEarlyException();
+                    }
+
+                }
+            }
+        }
+    }
+
+    /**
+     * Thrown by {@link ContentTypeHandler} to abort the SAX parse
+     * once the answer is known.
+     */
+    private static class StoppingEarlyException extends SAXException {
+
+    }
+}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
index 3f2303b..7007b7c6 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetector.java
@@ -16,10 +16,22 @@
*/
package org.apache.tika.parser.pkg;
+import static java.nio.charset.StandardCharsets.UTF_8;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.Enumeration;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.Locale;
+import java.util.Set;
+import java.util.regex.Pattern;
+
import org.apache.commons.compress.archivers.ArchiveException;
import org.apache.commons.compress.archivers.ArchiveStreamFactory;
import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
-import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
import org.apache.commons.compress.archivers.zip.ZipFile;
import org.apache.commons.compress.compressors.CompressorException;
import org.apache.commons.compress.compressors.CompressorStreamFactory;
@@ -31,39 +43,16 @@ import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
import org.apache.poi.openxml4j.opc.PackageRelationshipTypes;
import org.apache.poi.openxml4j.util.ZipEntrySource;
import org.apache.poi.openxml4j.util.ZipFileZipEntrySource;
-import org.apache.poi.xslf.usermodel.XSLFRelation;
-import org.apache.poi.xssf.usermodel.XSSFRelation;
-import org.apache.poi.xwpf.usermodel.XWPFRelation;
+import org.apache.tika.config.Field;
import org.apache.tika.detect.Detector;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.io.TemporaryResources;
+import org.apache.tika.io.BoundedInputStream;
+import org.apache.tika.io.LookaheadInputStream;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.iwork.IWorkPackageParser;
import org.apache.tika.parser.iwork.IWorkPackageParser.IWORKDocumentType;
import org.apache.tika.parser.iwork.iwana.IWork13PackageParser;
-import org.apache.tika.utils.XMLReaderUtils;
-import org.xml.sax.Attributes;
-import org.xml.sax.SAXException;
-import org.xml.sax.helpers.DefaultHandler;
-
-import java.io.ByteArrayInputStream;
-import java.io.FileInputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.Collections;
-import java.util.Enumeration;
-import java.util.HashSet;
-import java.util.Iterator;
-import java.util.Locale;
-import java.util.Map;
-import java.util.Set;
-import java.util.concurrent.ConcurrentHashMap;
-import java.util.regex.Pattern;
-
-import static java.nio.charset.StandardCharsets.UTF_8;
/**
* A detector that works on Zip documents and other archive and compression
@@ -95,45 +84,15 @@ public class ZipContainerDetector implements Detector {
private static final String XPS_DOCUMENT =
"http://schemas.microsoft.com/xps/2005/06/fixedrepresentation";
- private static final MediaType TIKA_OOXML = MediaType.application("x-tika-ooxml");
- private static final MediaType DOCX =
- MediaType.application("vnd.openxmlformats-officedocument.wordprocessingml.document");
- private static final MediaType DOCM =
- MediaType.application("vnd.ms-word.document.macroEnabled.12");
- private static final MediaType DOTX =
- MediaType.application("vnd.ms-word.document.macroEnabled.12");
- private static final MediaType PPTX =
- MediaType.application("vnd.openxmlformats-officedocument.presentationml.presentation");
- private static final MediaType PPTM =
- MediaType.application("vnd.ms-powerpoint.presentation.macroEnabled.12");
- private static final MediaType POTX =
- MediaType.application("vnd.openxmlformats-officedocument.presentationml.template");
- private static final MediaType XLSX =
- MediaType.application("vnd.openxmlformats-officedocument.spreadsheetml.sheet");
- private static final MediaType XLSM =
- MediaType.application("vnd.ms-excel.sheet.macroEnabled.12");
-
- private static final Set<String> OOXML_HINTS = fillSet(
- "word/document.xml",
- "_rels/.rels",
- "[Content_Types].xml",
- "ppt/presentation.xml",
- "ppt/slides/slide1.xml",
- "xl/workbook.xml",
- "xl/sharedStrings.xml",
- "xl/worksheets/sheet1.xml"
- );
-
- static Set<String> fillSet(String ... args) {
- Set<String> tmp = new HashSet<>();
- for (String arg : args) {
- tmp.add(arg);
- }
- return Collections.unmodifiableSet(tmp);
- }
+
/** Serial version UID */
private static final long serialVersionUID = 2891763938430295453L;
+ //this has to be > 100,000 to handle some of the iworks files
+ //in our unit tests
+ @Field
+ int markLimit = 16 * 1024 * 1024;
+
public MediaType detect(InputStream input, Metadata metadata)
throws IOException {
// Check if we have access to the document
@@ -141,34 +100,54 @@ public class ZipContainerDetector implements Detector {
return MediaType.OCTET_STREAM;
}
- TemporaryResources tmp = new TemporaryResources();
+ byte[] prefix = new byte[1024]; // enough for all known archive formats
+ input.mark(1024);
+ int length = -1;
try {
- TikaInputStream tis = TikaInputStream.get(input, tmp);
+ length = IOUtils.read(input, prefix);
+ } finally {
+ input.reset();
+ }
- byte[] prefix = new byte[1024]; // enough for all known formats
- int length = tis.peek(prefix);
+ MediaType type = detectArchiveFormat(prefix, length);
- MediaType type = detectArchiveFormat(prefix, length);
+ if (type == TIFF) {
+ return TIFF;
+ } else if (PackageParser.isZipArchive(type)) {
- if (type == TIFF) {
- return TIFF;
- } else if (PackageParser.isZipArchive(type)
- && TikaInputStream.isTikaInputStream(input)) {
- return detectZipFormat(tis);
- } else if (!type.equals(MediaType.OCTET_STREAM)) {
- return type;
- } else {
- return detectCompressorFormat(prefix, length);
+ if (TikaInputStream.isTikaInputStream(input)) {
+ TikaInputStream tis = TikaInputStream.cast(input);
+ if (markLimit < 0) {
+ tis.getFile();
+ }
+ if (tis.hasFile()) {
+ return detectZipFormatOnFile(tis);
+ }
}
- } finally {
- try {
- tmp.dispose();
- } catch (TikaException e) {
- // ignore
+
+ try (LookaheadInputStream lookahead = new LookaheadInputStream(input, markLimit)) {
+ return StreamingZipContainerDetector.detect(lookahead);
}
+ } else if (!type.equals(MediaType.OCTET_STREAM)) {
+ return type;
+ } else {
+ return detectCompressorFormat(prefix, length);
}
}
+ /**
+ * If the input is a {@link TikaInputStream} and this is less than 0, the stream
+ * will be spooled to disk, and detection will run on the full file. A
+ * TikaInputStream that already has a file is always detected on that file.
+ * Otherwise, the {@link StreamingZipContainerDetector} will be called
+ * only up to the markLimit.
+ * @param markLimit mark limit for streaming detection
+ */
+ public void setMarkLimit(int markLimit) {
+ this.markLimit = markLimit;
+ }
+
+
private static MediaType detectCompressorFormat(byte[] prefix, int length) {
try {
String type = CompressorStreamFactory.detect(new ByteArrayInputStream(prefix, 0, length));
@@ -211,17 +190,18 @@ public class ZipContainerDetector implements Detector {
}
}
- private static MediaType detectZipFormat(TikaInputStream tis) {
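+ /**
+ * This will call TikaInputStream's getFile(). If there are no exceptions,
+ * it will place the ZipFile in TikaInputStream's openContainer and leave it
+ * open.
+ * @param tis stream whose underlying file is opened as a zip
+ * @return the media type detected from the zip file's contents
+ */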
+ private static MediaType detectZipFormatOnFile(TikaInputStream tis) {
try {
- //try opc first because opening a package
- //will not necessarily throw an exception for
- //truncated files.
- MediaType type = detectOPCBased(tis);
- if (type != null) {
- return type;
- }
ZipFile zip = new ZipFile(tis.getFile()); // TODO: hasFile()?
+ MediaType type = null;
try {
type = detectOpenDocument(zip);
@@ -244,14 +224,17 @@ public class ZipContainerDetector implements Detector {
return type;
}
} finally {
- // TODO: shouldn't we record the open
- // container so it can be later
- // reused...?
- // tis.setOpenContainer(zip);
- try {
- zip.close();
- } catch (IOException e) {
- // ignore
+ tis.setOpenContainer(zip);
+ }
+ //finally, test for opc based
+ //if it is not an opc based file, poi throws an exception
+ //and we close the zip
+ //if it is opc based, we put the pkg in TikaInputStream's open container
+ if (zip.getEntry("_rels/.rels") != null
+ || zip.getEntry("[Content_Types].xml") != null) {
+ type = detectOPCBased(zip, tis);
+ if (type != null) {
+ return type;
}
}
} catch (IOException e) {
@@ -281,57 +264,32 @@ public class ZipContainerDetector implements Detector {
}
}
- private static MediaType detectOPCBased(TikaInputStream stream) {
+ //If this is not an OPCBased file, POI throws an exception and we close the zipFile.
+ private static MediaType detectOPCBased(ZipFile zipFile, TikaInputStream stream) {
+ //as of 4.x, POI throws an exception for non-POI OPC file types
+ //unless we change POI, we can't rely on POI for non-POI files
+ ZipEntrySource zipEntrySource = new ZipFileZipEntrySource(zipFile);
- ZipEntrySource zipEntrySource = null;
- try {
- zipEntrySource = new ZipFileZipEntrySource(new ZipFile(stream.getFile()));
- } catch (IOException e) {
- return tryStreamingDetection(stream);
- }
-
- //if (zip.getEntry("_rels/.rels") != null
- // || zip.getEntry("[Content_Types].xml") != null) {
// Use POI to open and investigate it for us
//Unfortunately, POI can throw a RuntimeException...so we
//have to catch that.
OPCPackage pkg = null;
- try {
- pkg = OPCPackage.open(zipEntrySource);
- } catch (SecurityException e) {
- closeQuietly(zipEntrySource);
- //TIKA-2571
- throw e;
- } catch (InvalidFormatException|RuntimeException e) {
- closeQuietly(zipEntrySource);
- return null;
- }
-
MediaType type = null;
try {
-
- // Is at an OOXML format?
+ pkg = OPCPackage.open(zipEntrySource);
type = detectOfficeOpenXML(pkg);
- if (type == null) {
- // Is it XPS format?
- type = detectXPSOPC(pkg);
- }
- if (type == null) {
- // Is it an AutoCAD format?
- type = detectAutoCADOPC(pkg);
- }
-
} catch (SecurityException e) {
closeQuietly(zipEntrySource);
+ IOUtils.closeQuietly(zipFile);
//TIKA-2571
throw e;
- } catch (RuntimeException e) {
+ } catch (InvalidFormatException|RuntimeException e) {
closeQuietly(zipEntrySource);
+ IOUtils.closeQuietly(zipFile);
return null;
}
//only set the open container if we made it here
stream.setOpenContainer(pkg);
- // We don't know what it is, sorry
return type;
}
@@ -360,7 +318,19 @@ public class ZipContainerDetector implements Detector {
if (core.size() == 0) {
core = pkg.getRelationshipsByType(VISIO_DOCUMENT);
}
-
+ if (core.size() == 0) {
+ core = pkg.getRelationshipsByType(XPS_DOCUMENT);
+ if (core.size() == 1) {
+ return MediaType.application("vnd.ms-xpsdocument");
+ }
+ }
+
+ if (core.size() == 0) {
+ core = pkg.getRelationshipsByType("http://schemas.autodesk.com/dwfx/2007/relationships/documentsequence");
+ if (core.size() == 1) {
+ return MediaType.parse("model/vnd.dwfx+xps");
+ }
+ }
// If we didn't find a single core document of any type, skip detection
if (core.size() != 1) {
// Invalid OOXML Package received
@@ -389,19 +359,7 @@ public class ZipContainerDetector implements Detector {
// Build the MediaType object and return
return MediaType.parse(docType);
}
- /**
- * Detects Open XML Paper Specification (XPS)
- */
- public static MediaType detectXPSOPC(OPCPackage pkg) {
- PackageRelationshipCollection xps =
- pkg.getRelationshipsByType("http://schemas.microsoft.com/xps/2005/06/fixedrepresentation");
- if (xps.size() == 1) {
- return MediaType.application("vnd.ms-xpsdocument");
- } else {
- // Non-XPS Package received
- return null;
- }
- }
+
/**
* Detects AutoCAD formats that live in OPC packaging
*/
@@ -534,95 +492,5 @@ public class ZipContainerDetector implements Detector {
return null;
}
- private static MediaType tryStreamingDetection(TikaInputStream stream) {
- Set<String> entryNames = new HashSet<>();
- try (InputStream is = new FileInputStream(stream.getFile())) {
- ZipArchiveInputStream zipArchiveInputStream = new ZipArchiveInputStream(is);
- ZipArchiveEntry zae = zipArchiveInputStream.getNextZipEntry();
- while (zae != null) {
- if (zae.isDirectory()) {
- zae = zipArchiveInputStream.getNextZipEntry();
- continue;
- }
- entryNames.add(zae.getName());
- //we could also parse _rel/.rels, but if
- // there isn't a valid content_types, then POI
- //will throw an exception...Better to backoff to PKG
- //than correctly identify a truncated
- if (zae.getName().equals("[Content_Types].xml")) {
- MediaType mt = parseContentTypes(zipArchiveInputStream);
- if (mt != null) {
- return mt;
- }
- return TIKA_OOXML;
- }
- zae = zipArchiveInputStream.getNextZipEntry();
- }
- } catch (SecurityException e) {
- throw e;
- } catch (Exception e) {
- //swallow
- }
- int hits = 0;
- for (String s : OOXML_HINTS) {
- if (entryNames.contains(s)) {
- hits++;
- }
- }
- if (hits > 2) {
- return TIKA_OOXML;
- }
- return MediaType.APPLICATION_ZIP;
- }
-
- private static MediaType parseContentTypes(InputStream is) {
- ContentTypeHandler contentTypeHandler = new ContentTypeHandler();
- try {
- XMLReaderUtils.parseSAX(is, contentTypeHandler, new ParseContext());
- } catch (SecurityException e) {
- throw e;
- } catch (Exception e) {
-
- }
- return contentTypeHandler.mediaType;
- }
-
-
- private static class ContentTypeHandler extends DefaultHandler {
- static Map<String, MediaType> CONTENT_TYPES = new ConcurrentHashMap<>();
- static {
- CONTENT_TYPES.put(XWPFRelation.DOCUMENT.getContentType(), DOCX);
- CONTENT_TYPES.put(XWPFRelation.MACRO_DOCUMENT.getContentType(), DOCM);
- CONTENT_TYPES.put(XWPFRelation.TEMPLATE.getContentType(), DOTX);
-
- CONTENT_TYPES.put(XSSFRelation.WORKBOOK.getContentType(), XLSX);
- CONTENT_TYPES.put(XSSFRelation.MACROS_WORKBOOK.getContentType(), XLSM);
- CONTENT_TYPES.put(XSLFRelation.PRESENTATIONML.getContentType(), PPTX);
- CONTENT_TYPES.put(XSLFRelation.PRESENTATION_MACRO.getContentType(), PPTM);
- CONTENT_TYPES.put(XSLFRelation.PRESENTATIONML_TEMPLATE.getContentType(), POTX);
- }
-
- private MediaType mediaType = null;
-
- @Override
- public void startElement(String uri, String localName,
- String name, Attributes attrs) throws SAXException {
- for (int i = 0; i < attrs.getLength(); i++) {
- String attrName = attrs.getLocalName(i);
- if (attrName.equals("ContentType")) {
- String contentType = attrs.getValue(i);
- if (CONTENT_TYPES.containsKey(contentType)) {
- mediaType = CONTENT_TYPES.get(contentType);
- throw new StoppingEarlyException();
- }
-
- }
- }
- }
- }
-
- private static class StoppingEarlyException extends SAXException {
-
- }
}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetectorBase.java b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetectorBase.java
new file mode 100644
index 0000000..1e3aa58
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/ZipContainerDetectorBase.java
@@ -0,0 +1,162 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pkg;
+
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.Set;
+import java.util.regex.Pattern;
+
+import org.apache.tika.mime.MediaType;
+
+/**
+ * Media type constants and entry-name heuristics used for zip container detection.
+ */
+abstract class ZipContainerDetectorBase {
+
+    static final MediaType TIKA_OOXML = MediaType.application("x-tika-ooxml");
+    static final MediaType DOCX =
+            MediaType.application("vnd.openxmlformats-officedocument.wordprocessingml.document");
+    static final MediaType DOCM =
+            MediaType.application("vnd.ms-word.document.macroEnabled.12");
+    // a .dotx template is macro-free, so it must not reuse DOCM's media type
+    static final MediaType DOTX =
+            MediaType.application("vnd.openxmlformats-officedocument.wordprocessingml.template");
+    static final MediaType PPTX =
+            MediaType.application("vnd.openxmlformats-officedocument.presentationml.presentation");
+    static final MediaType PPSM =
+            MediaType.application("vnd.ms-powerpoint.slideshow.macroEnabled.12");
+    static final MediaType PPSX =
+            MediaType.application("vnd.openxmlformats-officedocument.presentationml.slideshow");
+    static final MediaType PPTM =
+            MediaType.application("vnd.ms-powerpoint.presentation.macroEnabled.12");
+    static final MediaType POTM =
+            MediaType.application("vnd.ms-powerpoint.template.macroenabled.12");
+    static final MediaType POTX =
+            MediaType.application("vnd.openxmlformats-officedocument.presentationml.template");
+    static final MediaType THMX =
+            MediaType.application("vnd.openxmlformats-officedocument");
+    static final MediaType XLSB =
+            MediaType.application("vnd.ms-excel.sheet.binary.macroenabled.12");
+    static final MediaType XLSX =
+            MediaType.application("vnd.openxmlformats-officedocument.spreadsheetml.sheet");
+    static final MediaType XLSM =
+            MediaType.application("vnd.ms-excel.sheet.macroEnabled.12");
+    static final MediaType XPS =
+            MediaType.application("vnd.ms-xpsdocument");
+
+    /** Entry names used as hints that a zip may be an OOXML package. */
+    static final Set<String> OOXML_HINTS = fillSet(
+            "word/document.xml",
+            "_rels/.rels",
+            "[Content_Types].xml",
+            "ppt/presentation.xml",
+            "ppt/slides/slide1.xml",
+            "xl/workbook.xml",
+            "xl/sharedStrings.xml",
+            "xl/worksheets/sheet1.xml"
+    );
+
+    /** Returns an unmodifiable set holding the given strings. */
+    private static Set<String> fillSet(String... args) {
+        Set<String> tmp = new HashSet<>();
+        Collections.addAll(tmp, args);
+        return Collections.unmodifiableSet(tmp);
+    }
+
+    /**
+     * Detects jar-based formats (APK, WAR, EAR or a plain jar) from entry names.
+     *
+     * @param entryNames names of the entries in the archive
+     * @return the detected type, or {@code null} if nothing jar-like was found
+     */
+    static MediaType detectJar(Set<String> entryNames) {
+        // Some Android APKs miss the default Manifest, so check for an APK first
+        if (entryNames.contains("AndroidManifest.xml")) {
+            return MediaType.application("vnd.android.package-archive");
+        }
+        if (!entryNames.contains("META-INF/MANIFEST.MF")) {
+            return null;
+        }
+        // It's a Jar file, or something based on Jar: check for WAR and EAR
+        if (entryNames.contains("WEB-INF/")) {
+            return MediaType.application("x-tika-java-web-archive");
+        }
+        if (entryNames.contains("META-INF/application.xml")) {
+            return MediaType.application("x-tika-java-enterprise-archive");
+        }
+        // Looks like a regular Jar Archive
+        return MediaType.application("java-archive");
+    }
+
+    /**
+     * Detects KMZ: the only entry at the main level (a name without a slash or
+     * backslash) must be a single ".kml" file; nested entries are ignored.
+     * @param entryFileNames names of the entries in the archive
+     * @return the KMZ type, or {@code null} if the entry names do not fit
+     */
+    static MediaType detectKmz(Set<String> entryFileNames) {
+        boolean kmlFound = false;
+        for (String entryFileName : entryFileNames) {
+            if (entryFileName.indexOf('/') != -1 || entryFileName.indexOf('\\') != -1) {
+                continue; // not at the main level
+            }
+            if (kmlFound || !entryFileName.endsWith(".kml")) {
+                return null; // a second main-level entry, or one that is not kml
+            }
+            kmlFound = true;
+        }
+        return kmlFound ? MediaType.application("vnd.google-earth.kmz") : null;
+    }
+
+    /** To be considered as an IPA file, the entry names need to match all of these. */
+    private static final Pattern[] IPA_ENTRY_PATTERNS = {
+        Pattern.compile("^Payload/$"),
+        Pattern.compile("^Payload/.*\\.app/$"),
+        Pattern.compile("^Payload/.*\\.app/_CodeSignature/$"),
+        Pattern.compile("^Payload/.*\\.app/_CodeSignature/CodeResources$"),
+        Pattern.compile("^Payload/.*\\.app/Info\\.plist$"),
+        Pattern.compile("^Payload/.*\\.app/PkgInfo$")
+    };
+
+    /**
+     * Detects an IPA archive: every pattern in {@link #IPA_ENTRY_PATTERNS}
+     * must be matched by at least one entry name.
+     *
+     * @param entryNames names of the entries in the archive
+     * @return the IPA type, or {@code null} if some pattern was never matched
+     */
+    static MediaType detectIpa(Set<String> entryNames) {
+        // Note - consider generalising this logic, if another format needs many regexp matching
+        Set<Pattern> unmatched = new HashSet<>();
+        Collections.addAll(unmatched, IPA_ENTRY_PATTERNS);
+        for (String entryName : entryNames) {
+            Iterator<Pattern> ip = unmatched.iterator();
+            while (ip.hasNext()) {
+                if (ip.next().matcher(entryName).matches()) {
+                    ip.remove();
+                }
+            }
+            if (unmatched.isEmpty()) {
+                // We've found everything we need to find
+                return MediaType.application("x-itunes-ipa");
+            }
+        }
+        return null;
+    }
+}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/utils/ZipSalvager.java b/tika-parsers/src/main/java/org/apache/tika/parser/utils/ZipSalvager.java
index 20ebf1b..f7cf08a 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/utils/ZipSalvager.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/utils/ZipSalvager.java
@@ -18,9 +18,9 @@ package org.apache.tika.parser.utils;
import java.io.EOFException;
import java.io.File;
-import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
+import java.nio.file.Files;
import java.util.zip.ZipException;
import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
@@ -38,52 +38,57 @@ public class ZipSalvager {
* This streams the broken zip and rebuilds a new zip that
* is at least a valid zip file. The contents of the final stream
* may be truncated, but the result should be a valid zip file.
- *
+ * <p>
* This does nothing fancy to fix the underlying broken zip.
*
* @param brokenZip
* @param salvagedZip
*/
- public static void salvageCopy(File brokenZip, File salvagedZip) {
+ public static void salvageCopy(InputStream brokenZip, File salvagedZip) {
try (ZipArchiveOutputStream outputStream = new ZipArchiveOutputStream(salvagedZip)) {
- try (InputStream is = new FileInputStream(brokenZip)) {
- ZipArchiveInputStream zipArchiveInputStream = new ZipArchiveInputStream(is);
- ZipArchiveEntry zae = zipArchiveInputStream.getNextZipEntry();
- while (zae != null) {
- try {
- if (!zae.isDirectory() && zipArchiveInputStream.canReadEntryData(zae)) {
- //create a new ZAE and copy over only the name so that
- //if there is bad info (e.g. CRC) in brokenZip's zae, that
- //won't be propagated or cause an exception
- outputStream.putArchiveEntry(new ZipArchiveEntry(zae.getName()));
- //this will copy an incomplete stream...so there
- //could be truncation of the xml/contents, but the zip file
- //should be intact.
- boolean successfullyCopied = false;
- try {
- IOUtils.copy(zipArchiveInputStream, outputStream);
- successfullyCopied = true;
- } catch (IOException e) {
- //this can hit a "truncated ZipFile" IOException
- }
- outputStream.flush();
- outputStream.closeArchiveEntry();
- if (!successfullyCopied) {
- break;
- }
+ ZipArchiveInputStream zipArchiveInputStream = new ZipArchiveInputStream(brokenZip);
+ ZipArchiveEntry zae = zipArchiveInputStream.getNextZipEntry();
+ while (zae != null) {
+ try {
+ if (!zae.isDirectory() && zipArchiveInputStream.canReadEntryData(zae)) {
+ //create a new ZAE and copy over only the name so that
+ //if there is bad info (e.g. CRC) in brokenZip's zae, that
+ //won't be propagated or cause an exception
+ outputStream.putArchiveEntry(new ZipArchiveEntry(zae.getName()));
+ //this will copy an incomplete stream...so there
+ //could be truncation of the xml/contents, but the zip file
+ //should be intact.
+ boolean successfullyCopied = false;
+ try {
+ IOUtils.copy(zipArchiveInputStream, outputStream);
+ successfullyCopied = true;
+ } catch (IOException e) {
+ //this can hit a "truncated ZipFile" IOException
+ }
+ outputStream.flush();
+ outputStream.closeArchiveEntry();
+ if (!successfullyCopied) {
+ break;
}
- zae = zipArchiveInputStream.getNextZipEntry();
- } catch (ZipException|EOFException e) {
- break;
}
-
+ zae = zipArchiveInputStream.getNextZipEntry();
+ } catch (ZipException | EOFException e) {
+ break;
}
- outputStream.flush();
- outputStream.finish();
- outputStream.close();
+
}
+ outputStream.flush();
+ outputStream.finish();
+
+
} catch (IOException e) {
LOG.warn("problem fixing zip", e);
}
}
+
+ public static void salvageCopy(File brokenZip, File salvagedZip) throws IOException {
+ try (InputStream is = Files.newInputStream(brokenZip.toPath())) {
+ salvageCopy(is, salvagedZip);
+ }
+ }
}
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/TruncatedOOXMLTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/TruncatedOOXMLTest.java
index 1cf1874..1247cc1 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/TruncatedOOXMLTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/TruncatedOOXMLTest.java
@@ -38,6 +38,7 @@ import java.nio.file.Paths;
import java.util.List;
import java.util.Random;
+import static junit.framework.TestCase.assertTrue;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.fail;
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZipContainerDetectorTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZipContainerDetectorTest.java
index 2865442..0e3ff97 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZipContainerDetectorTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/pkg/ZipContainerDetectorTest.java
@@ -18,38 +18,191 @@
package org.apache.tika.parser.pkg;
-import org.apache.commons.compress.compressors.CompressorStreamFactory;
-import org.apache.tika.TikaTest;
-import org.apache.tika.io.TikaInputStream;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.ParseContext;
-import org.junit.BeforeClass;
-import org.junit.Test;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+import java.io.BufferedInputStream;
+import java.io.File;
+import java.io.FileInputStream;
import java.io.InputStream;
+import java.nio.file.Paths;
+import java.util.ArrayList;
import java.util.HashSet;
+import java.util.List;
import java.util.Set;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.fail;
+import org.apache.tika.TikaTest;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.detect.Detector;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.mime.MediaTypeRegistry;
+import org.apache.tika.parser.odf.ODFParserTest;
+import org.junit.Ignore;
+import org.junit.Test;
public class ZipContainerDetectorTest extends TikaTest {
+ private static MediaType ODT_TEXT = MediaType.application("vnd.oasis.opendocument.text");
+ private static MediaType TIFF = MediaType.image("tiff");
+ ZipContainerDetector zipContainerDetector = new ZipContainerDetector();
@Test
public void testTiffWorkaround() throws Exception {
//TIKA-2591
- ZipContainerDetector zipContainerDetector = new ZipContainerDetector();
Metadata metadata = new Metadata();
try (InputStream is = TikaInputStream.get(getResourceAsStream("/test-documents/testTIFF.tif"))) {
MediaType mt = zipContainerDetector.detect(is, metadata);
- assertEquals(MediaType.image("tiff"), mt);
+ assertEquals(TIFF, mt);
}
metadata = new Metadata();
try (InputStream is = TikaInputStream.get(getResourceAsStream("/test-documents/testTIFF_multipage.tif"))) {
MediaType mt = zipContainerDetector.detect(is, metadata);
- assertEquals(MediaType.image("tiff"), mt);
+ assertEquals(TIFF, mt);
+ }
+ }
+
+ @Test
+ public void testODT() throws Exception {
+ try (InputStream input = ODFParserTest.class.getResourceAsStream(
+ "/test-documents/testODFwithOOo3.odt")) {
+ Metadata metadata = new Metadata();
+ MediaType mt = zipContainerDetector.detect(input, metadata);
+ assertEquals(ODT_TEXT, mt);
}
+ }
+ @Test
+ public void testIWorks() throws Exception {
+ //have to have marklimit in ZipContainerDetector > 100000 for this to work
+ try (InputStream input = ODFParserTest.class.getResourceAsStream(
+ "/test-documents/testPages.pages")) {
+ Metadata metadata = new Metadata();
+ MediaType mt = zipContainerDetector.detect(input, metadata);
+ assertEquals("application/vnd.apple.pages", mt.toString());
+ }
+
+ InputStream is = getClass().getResourceAsStream("/org/apache/tika/parser/pkg/tika-config.xml");
+ assertNotNull(is);
+ TikaConfig tikaConfig = new TikaConfig(is);
+ try (InputStream input = ODFParserTest.class.getResourceAsStream(
+ "/test-documents/testPages.pages")) {
+ Metadata metadata = new Metadata();
+ MediaType mt = tikaConfig.getDetector().detect(input, metadata);
+ assertEquals("application/zip", mt.toString());
+ }
+ }
+
+    @Test
+    public void testXPS() throws Exception {
+        for (String file : new String[]{"testXPS_various.xps", "testPPT.xps"}) {
+            // check the streaming detector first, then detection on a TikaInputStream built from a Path
+            try (InputStream input = ODFParserTest.class.getResourceAsStream(
+                    "/test-documents/" + file)) {
+                MediaType mediaType = StreamingZipContainerDetector.detect(input);
+                assertEquals(ZipContainerDetectorBase.XPS, mediaType);
+            }
+            try (TikaInputStream input = TikaInputStream.get(Paths.get(ODFParserTest.class.getResource(
+                    "/test-documents/" + file).toURI()))) {
+                MediaType mediaType = zipContainerDetector.detect(input, new Metadata());
+                assertEquals(ZipContainerDetectorBase.XPS, mediaType);
+            }
+        }
+    }
+
+ @Ignore("for offline testing")
+ @Test
+ public void timeDetection() throws Exception {
+ TikaConfig config = TikaConfig.getDefaultConfig();
+ Detector detector = config.getDetector();
+ MediaTypeRegistry registry = config.getMediaTypeRegistry();
+ List<File> zips = getTestZipBasedFiles(detector, registry);
+
+ Set<MediaType> mediaTypeSet = new HashSet<>();
+ long nonTikaStream = 0;
+ long tikaStream = 0;
+ long tikaStreamWFile = 0;
+ for (int i = 0; i < 20; i++) {
+ for (File z : zips) {
+ long start = System.currentTimeMillis();
+ try (InputStream is = new BufferedInputStream(new FileInputStream(z))) {
+ MediaType mt = detector.detect(is, new Metadata());
+ mediaTypeSet.add(mt);
+ }
+ nonTikaStream += System.currentTimeMillis()-start;
+
+ start = System.currentTimeMillis();
+ try (InputStream is = TikaInputStream.get(
+ new BufferedInputStream(new FileInputStream(z)))) {
+ MediaType mt = detector.detect(is, new Metadata());
+ mediaTypeSet.add(mt);
+ }
+ tikaStream += System.currentTimeMillis()-start;
+
+ start = System.currentTimeMillis();
+ try (InputStream is = TikaInputStream.get(z)) {
+ MediaType mt = detector.detect(is, new Metadata());
+ mediaTypeSet.add(mt);
+ }
+ tikaStreamWFile += System.currentTimeMillis()-start;
+ }
+ }
+ System.out.println("tika stream: "+tikaStream + "\ntika stream w file: "
+ +tikaStreamWFile + "\nnon tika stream:"+nonTikaStream);
+ }
+
+ @Test
+ @Ignore("to be used for offline timing tests")
+ public void timeParsing() throws Exception {
+ TikaConfig config = TikaConfig.getDefaultConfig();
+ Detector detector = config.getDetector();
+ MediaTypeRegistry registry = config.getMediaTypeRegistry();
+
+ List<File> zips = getTestZipBasedFiles(detector, registry);
+ System.out.println("zips size: "+zips.size());
+ Set<MediaType> mediaTypeSet = new HashSet<>();
+ long nonTikaStream = 0;
+ long tikaStream = 0;
+ long tikaStreamWFile = 0;
+ for (int i = 0; i < 10; i++) {
+ for (File z : zips) {
+ long start = System.currentTimeMillis();
+ try (InputStream is = new BufferedInputStream(new FileInputStream(z))) {
+ getRecursiveMetadata(is, true);
+ }
+ nonTikaStream += System.currentTimeMillis()-start;
+ start = System.currentTimeMillis();
+ try (InputStream is = TikaInputStream.get(
+ new BufferedInputStream(new FileInputStream(z)))) {
+ getRecursiveMetadata(is, true);
+ }
+ tikaStream += System.currentTimeMillis()-start;
+ start = System.currentTimeMillis();
+ try (InputStream is = TikaInputStream.get(z)) {
+ getRecursiveMetadata(is, true);
+ }
+ tikaStreamWFile += System.currentTimeMillis()-start;
+
+ }
+ }
+ System.out.println("tika stream: "+tikaStream + "\ntika stream w file: "+tikaStreamWFile + "\nnon tika stream:"+nonTikaStream);
+ }
+
+ //TODO -- we need to find a dwfx+xps file for testing
+
+ private List<File> getTestZipBasedFiles(Detector detector, MediaTypeRegistry registry) throws Exception {
+ List<File> zips = new ArrayList<>();
+ for (File f : Paths.get(
+ this.getClass().getResource("/test-documents").toURI()).toFile().listFiles()) {
+ try (InputStream is = TikaInputStream.get(f)) {
+ MediaType mt = detector.detect(is, new Metadata());
+ if (registry.isSpecializationOf(mt, MediaType.APPLICATION_ZIP)) {
+ zips.add(f);
+ }
+ } catch (Exception e) {
+
+ }
+ }
+ return zips;
}
}
\ No newline at end of file
diff --git a/tika-parsers/src/test/resources/org/apache/tika/parser/pkg/tika-config.xml b/tika-parsers/src/test/resources/org/apache/tika/parser/pkg/tika-config.xml
new file mode 100644
index 0000000..97d7c7b
--- /dev/null
+++ b/tika-parsers/src/test/resources/org/apache/tika/parser/pkg/tika-config.xml
@@ -0,0 +1,31 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<properties>
+ <parsers/>
+ <detectors>
+ <detector class="org.apache.tika.detect.DefaultDetector">
+ <detector-exclude class="org.apache.tika.parser.pkg.ZipContainerDetector"/>
+ </detector>
+ <detector class="org.apache.tika.parser.pkg.ZipContainerDetector">
+ <params>
+ <param name="markLimit" type="int">100000</param>
+ </params>
+ </detector>
+ </detectors>
+ <translator class="org.apache.tika.language.translate.DefaultTranslator"/>
+</properties>
\ No newline at end of file