You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ti...@apache.org on 2024/03/20 19:12:50 UTC

(tika) branch branch_2x updated: TIKA-4201: add hard limit to IWorkPackageParser's detect (#1608)

This is an automated email from the ASF dual-hosted git repository.

tilman pushed a commit to branch branch_2x
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/branch_2x by this push:
     new 6eec8c9c7 TIKA-4201: add hard limit to IWorkPackageParser's detect (#1608)
6eec8c9c7 is described below

commit 6eec8c9c78d6198e5af75ec45faa26579726ff30
Author: Tilman Hausherr <ti...@apache.org>
AuthorDate: Wed Mar 20 20:12:40 2024 +0100

    TIKA-4201: add hard limit to IWorkPackageParser's detect (#1608)
---
 .../tika/parser/iwork/IWorkPackageParser.java      | 38 ++++++++++++++--------
 1 file changed, 24 insertions(+), 14 deletions(-)

diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/iwork/IWorkPackageParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/iwork/IWorkPackageParser.java
index 637348db5..6c96eea6d 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/iwork/IWorkPackageParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/iwork/IWorkPackageParser.java
@@ -29,7 +29,9 @@ import org.apache.commons.compress.archivers.zip.UnsupportedZipFeatureException;
 import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
 import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
 import org.apache.commons.compress.archivers.zip.ZipFile;
+import org.apache.commons.io.IOUtils;
 import org.apache.commons.io.input.CloseShieldInputStream;
+import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 
@@ -68,6 +70,7 @@ public class IWorkPackageParser extends AbstractParser {
      * Serial version UID
      */
     private static final long serialVersionUID = -2160322853809682372L;
+    private static final int MARK_LIMIT = 1096;
     /**
      * This parser handles all iWorks formats.
      */
@@ -93,9 +96,9 @@ public class IWorkPackageParser extends AbstractParser {
                 continue;
             }
 
-            InputStream entryStream = new BufferedInputStream(zip, 9216);
-            entryStream.mark(9216);
-            IWORKDocumentType type = IWORKDocumentType.detectType(entryStream);
+            InputStream entryStream = new BufferedInputStream(zip);
+            entryStream.mark(MARK_LIMIT);
+            IWORKDocumentType type = detectType(entryStream, MARK_LIMIT);
             entryStream.reset();
 
             if (type != null) {
@@ -134,6 +137,24 @@ public class IWorkPackageParser extends AbstractParser {
         // Don't close the zip InputStream (TIKA-1117).
     }
 
+    private IWORKDocumentType detectType(InputStream entryStream, int markLimit) throws IOException {
+        byte[] bytes = new byte[markLimit]; // detection sees at most markLimit bytes
+        try {
+            int read = IOUtils.read(entryStream, bytes, 0, markLimit); // bytes actually buffered
+            try (InputStream bis = UnsynchronizedByteArrayInputStream.builder().setByteArray(bytes)
+                    .setOffset(0).setLength(read).get()) { // in-memory view of the buffered bytes only
+                return IWORKDocumentType.detectType(bis);
+            }
+        } catch (UnsupportedZipFeatureException e) {
+            // Reading the entry failed, so the root type could not be extracted.
+            // Password-protected iWork files encrypt part of the zip stream, and
+            //  that is usually how they can be spotted.

+            // The compression field was likely encrypted, so report the entry as such.
+            return IWORKDocumentType.ENCRYPTED;
+        }
+    }
+
     public enum IWORKDocumentType {
         KEYNOTE("http://developer.apple.com/namespaces/keynote2", "presentation",
                 MediaType.application("vnd.apple.keynote")),
@@ -191,17 +212,6 @@ public class IWorkPackageParser extends AbstractParser {
                         return type;
                     }
                 }
-            } else {
-                // There was a problem with extracting the root type
-                // Password Protected iWorks files are funny, but we can usually
-                //  spot them because they encrypt part of the zip stream
-                try {
-                    stream.read();
-                } catch (UnsupportedZipFeatureException e) {
-                    // Compression field was likely encrypted
-                    return ENCRYPTED;
-                } catch (Exception ignored) {
-                }
             }
             return null;
         }