You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ti...@apache.org on 2024/03/20 19:12:50 UTC
(tika) branch branch_2x updated: TIKA-4201: add hard limit to IWorkPackageParser's detect (#1608)
This is an automated email from the ASF dual-hosted git repository.
tilman pushed a commit to branch branch_2x
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/branch_2x by this push:
new 6eec8c9c7 TIKA-4201: add hard limit to IWorkPackageParser's detect (#1608)
6eec8c9c7 is described below
commit 6eec8c9c78d6198e5af75ec45faa26579726ff30
Author: Tilman Hausherr <ti...@apache.org>
AuthorDate: Wed Mar 20 20:12:40 2024 +0100
TIKA-4201: add hard limit to IWorkPackageParser's detect (#1608)
---
.../tika/parser/iwork/IWorkPackageParser.java | 38 ++++++++++++++--------
1 file changed, 24 insertions(+), 14 deletions(-)
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/iwork/IWorkPackageParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/iwork/IWorkPackageParser.java
index 637348db5..6c96eea6d 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/iwork/IWorkPackageParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/iwork/IWorkPackageParser.java
@@ -29,7 +29,9 @@ import org.apache.commons.compress.archivers.zip.UnsupportedZipFeatureException;
import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
import org.apache.commons.compress.archivers.zip.ZipFile;
+import org.apache.commons.io.IOUtils;
import org.apache.commons.io.input.CloseShieldInputStream;
+import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
@@ -68,6 +70,7 @@ public class IWorkPackageParser extends AbstractParser {
* Serial version UID
*/
private static final long serialVersionUID = -2160322853809682372L;
+ private static final int MARK_LIMIT = 1096;
/**
* This parser handles all iWorks formats.
*/
@@ -93,9 +96,9 @@ public class IWorkPackageParser extends AbstractParser {
continue;
}
- InputStream entryStream = new BufferedInputStream(zip, 9216);
- entryStream.mark(9216);
- IWORKDocumentType type = IWORKDocumentType.detectType(entryStream);
+ InputStream entryStream = new BufferedInputStream(zip);
+ entryStream.mark(MARK_LIMIT);
+ IWORKDocumentType type = detectType(entryStream, MARK_LIMIT);
entryStream.reset();
if (type != null) {
@@ -134,6 +137,24 @@ public class IWorkPackageParser extends AbstractParser {
// Don't close the zip InputStream (TIKA-1117).
}
+ private IWORKDocumentType detectType(InputStream entryStream, int markLimit) throws IOException { // detect from at most markLimit bytes
+ byte[] bytes = new byte[markLimit]; // scratch buffer sized to the hard limit
+ try {
+ int read = IOUtils.read(entryStream, bytes, 0, markLimit); // 'read' is used below as the count of valid bytes in the buffer
+ try (InputStream bis = UnsynchronizedByteArrayInputStream.builder().setByteArray(bytes) // in-memory view of the buffer
+ .setOffset(0).setLength(read).get()) { // expose only bytes [0, read)
+ return IWORKDocumentType.detectType(bis); // detection runs on the copy, never on entryStream itself
+ }
+ } catch (UnsupportedZipFeatureException e) { // can only originate from reading entryStream above
+ // There was a problem with extracting the root type
+ // Password Protected iWorks files are funny, but we can usually
+ // spot them because they encrypt part of the zip stream
+
+ // Compression field was likely encrypted
+ return IWORKDocumentType.ENCRYPTED; // report as encrypted instead of propagating the exception
+ }
+ }
+
public enum IWORKDocumentType {
KEYNOTE("http://developer.apple.com/namespaces/keynote2", "presentation",
MediaType.application("vnd.apple.keynote")),
@@ -191,17 +212,6 @@ public class IWorkPackageParser extends AbstractParser {
return type;
}
}
- } else {
- // There was a problem with extracting the root type
- // Password Protected iWorks files are funny, but we can usually
- // spot them because they encrypt part of the zip stream
- try {
- stream.read();
- } catch (UnsupportedZipFeatureException e) {
- // Compression field was likely encrypted
- return ENCRYPTED;
- } catch (Exception ignored) {
- }
}
return null;
}