You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2012/04/27 19:03:24 UTC

svn commit: r1331503 - in /tika/trunk: tika-core/src/main/resources/org/apache/tika/mime/ tika-parsers/src/main/java/org/apache/tika/parser/iwork/ tika-parsers/src/test/java/org/apache/tika/parser/iwork/ tika-parsers/src/test/resources/test-documents/

Author: nick
Date: Fri Apr 27 17:03:23 2012
New Revision: 1331503

URL: http://svn.apache.org/viewvc?rev=1331503&view=rev
Log:
TIKA-903 Avoid breaking on password-protected iWorks files. We can't parse them yet, though, as we don't know how the encryption works.

Added:
    tika/trunk/tika-parsers/src/test/resources/test-documents/testPagesPwdProtected.pages   (with props)
Modified:
    tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iwork/IWorkPackageParser.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/iwork/IWorkParserTest.java

Modified: tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml?rev=1331503&r1=1331502&r2=1331503&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml (original)
+++ tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml Fri Apr 27 17:03:23 2012
@@ -641,6 +641,10 @@
     <sub-class-of type="application/vnd.apple.iwork" />
     <glob pattern="*.numbers"/>
   </mime-type>
+  <mime-type type="application/x-tika-iworks-protected">
+    <sub-class-of type="application/vnd.apple.iwork" />
+    <_comment>Password Protected iWorks File</_comment>
+  </mime-type>
 
   <mime-type type="application/vnd.arastra.swi">
     <glob pattern="*.swi"/>

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iwork/IWorkPackageParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iwork/IWorkPackageParser.java?rev=1331503&r1=1331502&r2=1331503&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iwork/IWorkPackageParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/iwork/IWorkPackageParser.java Fri Apr 27 17:03:23 2012
@@ -26,6 +26,7 @@ import java.util.Set;
 
 import javax.xml.namespace.QName;
 
+import org.apache.commons.compress.archivers.zip.UnsupportedZipFeatureException;
 import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
 import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
 import org.apache.commons.compress.archivers.zip.ZipFile;
@@ -71,7 +72,8 @@ public class IWorkPackageParser extends 
     public enum IWORKDocumentType {
        KEYNOTE("http://developer.apple.com/namespaces/keynote2", "presentation", MediaType.application("vnd.apple.keynote")),
        NUMBERS("http://developer.apple.com/namespaces/ls", "document", MediaType.application("vnd.apple.numbers")),
-       PAGES("http://developer.apple.com/namespaces/sl", "document", MediaType.application("vnd.apple.pages"));
+       PAGES("http://developer.apple.com/namespaces/sl", "document", MediaType.application("vnd.apple.pages")),
+       ENCRYPTED(null, null, MediaType.application("x-tika-iworks-protected"));
        
        private final String namespace;
        private final String part;
@@ -122,13 +124,26 @@ public class IWorkPackageParser extends 
        
        private static IWORKDocumentType detectType(InputStream stream) {
           QName qname = new XmlRootExtractor().extractRootElement(stream);
-          String uri = qname.getNamespaceURI();
-          String local = qname.getLocalPart();
-         
-          for (IWORKDocumentType type : values()) {
-             if(type.getNamespace().equals(uri) && 
-                type.getPart().equals(local)) {
-                return type;
+          if (qname != null) {
+             String uri = qname.getNamespaceURI();
+             String local = qname.getLocalPart();
+            
+             for (IWORKDocumentType type : values()) {
+                if(type.getNamespace().equals(uri) && 
+                   type.getPart().equals(local)) {
+                   return type;
+                }
+             }
+          } else {
+             // There was a problem with extracting the root type
+             // Password Protected iWorks files are funny, but we can usually
+             //  spot them because they encrypt part of the zip stream 
+             try {
+                stream.read();
+             } catch(UnsupportedZipFeatureException e) {
+                // Compression field was likely encrypted
+                return ENCRYPTED;
+             } catch(Exception ignored) {
              }
           }
           return null;
@@ -180,16 +195,22 @@ public class IWorkPackageParser extends 
                case PAGES:
                   contentHandler = new PagesContentHandler(xhtml, metadata);
                   break;
+               case ENCRYPTED:
+                   // We can't do anything for the file right now
+                   contentHandler = null;
+                   break;
                default:
                   throw new TikaException("Unhandled iWorks file " + type);
                }
 
                metadata.add(Metadata.CONTENT_TYPE, type.getType().toString());
                xhtml.startDocument();
-               context.getSAXParser().parse(
-                       new CloseShieldInputStream(entryStream),
-                       new OfflineContentHandler(contentHandler)
-               );
+               if (contentHandler != null) {
+                  context.getSAXParser().parse(
+                          new CloseShieldInputStream(entryStream),
+                          new OfflineContentHandler(contentHandler)
+                  );
+               }
                xhtml.endDocument();
             }
             

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/iwork/IWorkParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/iwork/IWorkParserTest.java?rev=1331503&r1=1331502&r2=1331503&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/iwork/IWorkParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/iwork/IWorkParserTest.java Fri Apr 27 17:03:23 2012
@@ -139,4 +139,24 @@ public class IWorkParserTest extends Tes
         assertTrue(content.contains("Try adding your own account transactions to this table."));
     }
 
+    /**
+     * We don't currently support password protected Pages files, as
+     *  we don't know how the encryption works (it's not regular Zip
+     *  Encryption). See TIKA-903 for details
+     */
+    public void testParsePagesPasswordProtected() throws Exception {
+       // Document password is "tika", but we can't use that yet...
+       InputStream input = IWorkParserTest.class.getResourceAsStream("/test-documents/testPagesPwdProtected.pages");
+       Metadata metadata = new Metadata();
+       ContentHandler handler = new BodyContentHandler();
+
+       iWorkParser.parse(input, handler, metadata, parseContext);
+
+       // Content will be empty
+       String content = handler.toString();
+       assertEquals("", content);
+       
+       // Will have been identified as encrypted
+       assertEquals("application/x-tika-iworks-protected", metadata.get(Metadata.CONTENT_TYPE));
+    }
 }

Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testPagesPwdProtected.pages
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testPagesPwdProtected.pages?rev=1331503&view=auto
==============================================================================
Binary file - no diff available.

Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testPagesPwdProtected.pages
------------------------------------------------------------------------------
    svn:mime-type = application/zip