You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2024/02/20 15:58:54 UTC

(tika) branch TIKA-4201 created (now e3e325169)

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a change to branch TIKA-4201
in repository https://gitbox.apache.org/repos/asf/tika.git


      at e3e325169 TIKA-4201 -- add hard limit to IWorkPackageParser's detect

This branch includes the following new commits:

     new e3e325169 TIKA-4201 -- add hard limit to IWorkPackageParser's detect

The 1 revision listed above as "new" is entirely new to this
repository and will be described in a separate email.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.



(tika) 01/01: TIKA-4201 -- add hard limit to IWorkPackageParser's detect

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-4201
in repository https://gitbox.apache.org/repos/asf/tika.git

commit e3e325169abcb130689acd13b9cf286040f66cab
Author: tallison <ta...@apache.org>
AuthorDate: Tue Feb 20 10:58:36 2024 -0500

    TIKA-4201 -- add hard limit to IWorkPackageParser's detect
---
 .../tika/parser/iwork/IWorkPackageParser.java      | 39 ++++++++++++++--------
 1 file changed, 25 insertions(+), 14 deletions(-)

diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/iwork/IWorkPackageParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/iwork/IWorkPackageParser.java
index 490cfe47e..87074304f 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/iwork/IWorkPackageParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/iwork/IWorkPackageParser.java
@@ -29,7 +29,9 @@ import org.apache.commons.compress.archivers.zip.UnsupportedZipFeatureException;
 import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
 import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
 import org.apache.commons.compress.archivers.zip.ZipFile;
+import org.apache.commons.io.IOUtils;
 import org.apache.commons.io.input.CloseShieldInputStream;
+import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 
@@ -68,6 +70,7 @@ public class IWorkPackageParser implements Parser {
      * Serial version UID
      */
     private static final long serialVersionUID = -2160322853809682372L;
+    private static final int MARK_LIMIT = 1096;
     /**
      * This parser handles all iWorks formats.
      */
@@ -91,9 +94,9 @@ public class IWorkPackageParser implements Parser {
                 continue;
             }
 
-            InputStream entryStream = new BufferedInputStream(zip, 9216);
-            entryStream.mark(9216);
-            IWORKDocumentType type = IWORKDocumentType.detectType(entryStream);
+            InputStream entryStream = new BufferedInputStream(zip);
+            entryStream.mark(MARK_LIMIT);
+            IWORKDocumentType type = detectType(entryStream, MARK_LIMIT);
             entryStream.reset(); // 4096 fails on github
 
             if (type != null) {
@@ -132,6 +135,25 @@ public class IWorkPackageParser implements Parser {
         // Don't close the zip InputStream (TIKA-1117).
     }
 
+    private IWORKDocumentType detectType(InputStream entryStream, int markLimit) throws IOException {
+        byte[] bytes = new byte[markLimit];
+        try {
+            int read = IOUtils.read(entryStream, bytes, 0, markLimit);
+            try (InputStream bis = UnsynchronizedByteArrayInputStream.builder().setByteArray(bytes)
+                    .setOffset(0).setLength(read).get()) {
+                return IWORKDocumentType.detectType(bis);
+            }
+        } catch (UnsupportedZipFeatureException e) {
+            // There was a problem with extracting the root type
+            // Password Protected iWorks files are funny, but we can usually
+            //  spot them because they encrypt part of the zip stream
+
+            // Compression field was likely encrypted
+            return IWORKDocumentType.ENCRYPTED;
+        }
+
+    }
+
     public enum IWORKDocumentType {
         KEYNOTE("http://developer.apple.com/namespaces/keynote2", "presentation",
                 MediaType.application("vnd.apple.keynote")),
@@ -189,17 +211,6 @@ public class IWorkPackageParser implements Parser {
                         return type;
                     }
                 }
-            } else {
-                // There was a problem with extracting the root type
-                // Password Protected iWorks files are funny, but we can usually
-                //  spot them because they encrypt part of the zip stream
-                try {
-                    stream.read();
-                } catch (UnsupportedZipFeatureException e) {
-                    // Compression field was likely encrypted
-                    return ENCRYPTED;
-                } catch (Exception ignored) {
-                }
             }
             return null;
         }