You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2024/02/20 16:56:26 UTC

(tika) branch main updated: TIKA-4201 -- add hard limit to IWorkPackageParser's detect (#1608)

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new fd44840a1 TIKA-4201 -- add hard limit to IWorkPackageParser's detect (#1608)
fd44840a1 is described below

commit fd44840a113b719872df1c453d46775efe850c60
Author: Tim Allison <ta...@apache.org>
AuthorDate: Tue Feb 20 11:56:20 2024 -0500

    TIKA-4201 -- add hard limit to IWorkPackageParser's detect (#1608)
---
 .../tika/parser/iwork/IWorkPackageParser.java      | 39 ++++++++++++++--------
 1 file changed, 25 insertions(+), 14 deletions(-)

diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/iwork/IWorkPackageParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/iwork/IWorkPackageParser.java
index 490cfe47e..87074304f 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/iwork/IWorkPackageParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/iwork/IWorkPackageParser.java
@@ -29,7 +29,9 @@ import org.apache.commons.compress.archivers.zip.UnsupportedZipFeatureException;
 import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
 import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
 import org.apache.commons.compress.archivers.zip.ZipFile;
+import org.apache.commons.io.IOUtils;
 import org.apache.commons.io.input.CloseShieldInputStream;
+import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 
@@ -68,6 +70,7 @@ public class IWorkPackageParser implements Parser {
      * Serial version UID
      */
     private static final long serialVersionUID = -2160322853809682372L;
+    private static final int MARK_LIMIT = 1096;
     /**
      * This parser handles all iWorks formats.
      */
@@ -91,9 +94,9 @@ public class IWorkPackageParser implements Parser {
                 continue;
             }
 
-            InputStream entryStream = new BufferedInputStream(zip, 9216);
-            entryStream.mark(9216);
-            IWORKDocumentType type = IWORKDocumentType.detectType(entryStream);
+            InputStream entryStream = new BufferedInputStream(zip);
+            entryStream.mark(MARK_LIMIT);
+            IWORKDocumentType type = detectType(entryStream, MARK_LIMIT);
             entryStream.reset(); // 4096 fails on github
 
             if (type != null) {
@@ -132,6 +135,25 @@ public class IWorkPackageParser implements Parser {
         // Don't close the zip InputStream (TIKA-1117).
     }
 
+    private IWORKDocumentType detectType(InputStream entryStream, int markLimit) throws IOException {
+        byte[] bytes = new byte[markLimit];
+        try {
+            int read = IOUtils.read(entryStream, bytes, 0, markLimit);
+            try (InputStream bis = UnsynchronizedByteArrayInputStream.builder().setByteArray(bytes)
+                    .setOffset(0).setLength(read).get()) {
+                return IWORKDocumentType.detectType(bis);
+            }
+        } catch (UnsupportedZipFeatureException e) {
+            // There was a problem with extracting the root type
+            // Password Protected iWorks files are funny, but we can usually
+            //  spot them because they encrypt part of the zip stream
+
+            // Compression field was likely encrypted
+            return IWORKDocumentType.ENCRYPTED;
+        }
+
+    }
+
     public enum IWORKDocumentType {
         KEYNOTE("http://developer.apple.com/namespaces/keynote2", "presentation",
                 MediaType.application("vnd.apple.keynote")),
@@ -189,17 +211,6 @@ public class IWorkPackageParser implements Parser {
                         return type;
                     }
                 }
-            } else {
-                // There was a problem with extracting the root type
-                // Password Protected iWorks files are funny, but we can usually
-                //  spot them because they encrypt part of the zip stream
-                try {
-                    stream.read();
-                } catch (UnsupportedZipFeatureException e) {
-                    // Compression field was likely encrypted
-                    return ENCRYPTED;
-                } catch (Exception ignored) {
-                }
             }
             return null;
         }