You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2024/02/20 16:56:26 UTC
(tika) branch main updated: TIKA-4201 -- add hard limit to IWorkPackageParser's detect (#1608)
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new fd44840a1 TIKA-4201 -- add hard limit to IWorkPackageParser's detect (#1608)
fd44840a1 is described below
commit fd44840a113b719872df1c453d46775efe850c60
Author: Tim Allison <ta...@apache.org>
AuthorDate: Tue Feb 20 11:56:20 2024 -0500
TIKA-4201 -- add hard limit to IWorkPackageParser's detect (#1608)
---
.../tika/parser/iwork/IWorkPackageParser.java | 39 ++++++++++++++--------
1 file changed, 25 insertions(+), 14 deletions(-)
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/iwork/IWorkPackageParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/iwork/IWorkPackageParser.java
index 490cfe47e..87074304f 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/iwork/IWorkPackageParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/iwork/IWorkPackageParser.java
@@ -29,7 +29,9 @@ import org.apache.commons.compress.archivers.zip.UnsupportedZipFeatureException;
import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
import org.apache.commons.compress.archivers.zip.ZipFile;
+import org.apache.commons.io.IOUtils;
import org.apache.commons.io.input.CloseShieldInputStream;
+import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
@@ -68,6 +70,7 @@ public class IWorkPackageParser implements Parser {
* Serial version UID
*/
private static final long serialVersionUID = -2160322853809682372L;
+ private static final int MARK_LIMIT = 1096;
/**
* This parser handles all iWorks formats.
*/
@@ -91,9 +94,9 @@ public class IWorkPackageParser implements Parser {
continue;
}
- InputStream entryStream = new BufferedInputStream(zip, 9216);
- entryStream.mark(9216);
- IWORKDocumentType type = IWORKDocumentType.detectType(entryStream);
+ InputStream entryStream = new BufferedInputStream(zip);
+ entryStream.mark(MARK_LIMIT);
+ IWORKDocumentType type = detectType(entryStream, MARK_LIMIT);
entryStream.reset(); // 4096 fails on github
if (type != null) {
@@ -132,6 +135,25 @@ public class IWorkPackageParser implements Parser {
// Don't close the zip InputStream (TIKA-1117).
}
+ private IWORKDocumentType detectType(InputStream entryStream, int markLimit) throws IOException { // Detects the iWork type from at most markLimit bytes of entryStream; returns ENCRYPTED if the zip entry cannot be read
+ byte[] bytes = new byte[markLimit]; // scratch buffer for the prefix that is handed to the detector
+ try {
+ int read = IOUtils.read(entryStream, bytes, 0, markLimit); // 'read' is used below as the length of valid data in 'bytes'
+ try (InputStream bis = UnsynchronizedByteArrayInputStream.builder().setByteArray(bytes) // detection runs on an in-memory copy, not on entryStream itself
+ .setOffset(0).setLength(read).get()) {
+ return IWORKDocumentType.detectType(bis); // delegate to the enum's detector, which only sees the bytes read above
+ }
+ } catch (UnsupportedZipFeatureException e) { // presumably raised by the entry read above, since detection works on the in-memory copy -- TODO confirm
+ // There was a problem with extracting the root type
+ // Password Protected iWorks files are funny, but we can usually
+ // spot them because they encrypt part of the zip stream
+
+ // Compression field was likely encrypted
+ return IWORKDocumentType.ENCRYPTED;
+ }
+
+ }
+
public enum IWORKDocumentType {
KEYNOTE("http://developer.apple.com/namespaces/keynote2", "presentation",
MediaType.application("vnd.apple.keynote")),
@@ -189,17 +211,6 @@ public class IWorkPackageParser implements Parser {
return type;
}
}
- } else {
- // There was a problem with extracting the root type
- // Password Protected iWorks files are funny, but we can usually
- // spot them because they encrypt part of the zip stream
- try {
- stream.read();
- } catch (UnsupportedZipFeatureException e) {
- // Compression field was likely encrypted
- return ENCRYPTED;
- } catch (Exception ignored) {
- }
}
return null;
}