You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2011/11/28 14:35:30 UTC

svn commit: r1207152 - in /tika/trunk: tika-core/src/main/resources/org/apache/tika/mime/ tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ tika-parsers/src/test/java/org/apache/tika/detect/

Author: nick
Date: Mon Nov 28 13:35:29 2011
New Revision: 1207152

URL: http://svn.apache.org/viewvc?rev=1207152&view=rev
Log:
TIKA-791 POIFS Container Detector support for encrypted OOXML files, plus tests and new (tika specific) mimetype

Modified:
    tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java

Modified: tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml?rev=1207152&r1=1207151&r2=1207152&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml (original)
+++ tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml Mon Nov 28 13:35:29 2011
@@ -2774,6 +2774,13 @@
     </magic>
   </mime-type>
 
+  <!-- Note - password protected OOXML files are actually stored in -->
+  <!--  an OLE2 (application/x-tika-msoffice) container -->
+  <mime-type type="application/x-tika-ooxml-protected">
+    <sub-class-of type="application/x-tika-ooxml"/>
+    <_comment>Password Protected OOXML File</_comment>
+  </mime-type>
+
   <mime-type type="application/x-ustar">
     <glob pattern="*.ustar"/>
   </mime-type>

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java?rev=1207152&r1=1207151&r2=1207152&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java Mon Nov 28 13:35:29 2011
@@ -80,7 +80,7 @@ public class OfficeParser extends Abstra
         OLE10_NATIVE("ole", MediaType.application("x-tika-msoffice")),
         WORDDOCUMENT("doc", MediaType.application("msword")),
         UNKNOWN("unknown", MediaType.application("x-tika-msoffice")),
-        ENCRYPTED("ole", MediaType.application("x-tika-msoffice")),
+        ENCRYPTED("ole", MediaType.application("x-tika-ooxml-protected")),
         POWERPOINT("ppt", MediaType.application("vnd.ms-powerpoint")),
         PUBLISHER("pub", MediaType.application("x-mspublisher")),
         PROJECT("mpp", MediaType.application("vnd.ms-project")),
@@ -242,10 +242,13 @@ public class OfficeParser extends Abstra
                     Decryptor d = Decryptor.getInstance(info);
 
                     try {
+                        // TODO Allow the user to specify the password via the ParseContext
                         if (!d.verifyPassword(Decryptor.DEFAULT_PASSWORD)) {
                             throw new EncryptedDocumentException();
                         }
 
+                        // Decrypt the OLE2 stream, and delegate the resulting OOXML
+                        //  file to the regular OOXML parser for normal handling
                         OOXMLParser parser = new OOXMLParser();
 
                         parser.parse(d.getDataStream(root), new EmbeddedContentHandler(

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java?rev=1207152&r1=1207151&r2=1207152&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java Mon Nov 28 13:35:29 2011
@@ -47,6 +47,9 @@ public class POIFSContainerDetector impl
 
     /** The OLE base file format */
     public static final MediaType OLE = application("x-tika-msoffice");
+    
+    /** The protected OOXML base file format */
+    public static final MediaType OOXML_PROTECTED = application("x-tika-ooxml-protected");
 
     /** Microsoft Excel */
     public static final MediaType XLS = application("vnd.ms-excel");
@@ -120,6 +123,15 @@ public class POIFSContainerDetector impl
         if (names != null) {
             if (names.contains("Workbook")) {
                 return XLS;
+            } else if (names.contains("EncryptedPackage") && 
+                    names.contains("EncryptionInfo") &&
+                    names.contains("\u0006DataSpaces")) {
+                // This is a protected OOXML document, which is an OLE2 file
+                //  with an Encrypted Stream which holds the OOXML data
+                // Without decrypting the stream, we can't tell what kind of
+                //  OOXML file we have. Return a general OOXML Protected type,
+                //  and hope the name based detection can guess the rest! 
+                return OOXML_PROTECTED;
             } else if (names.contains("EncryptedPackage")) {
                 return OLE;
             } else if (names.contains("WordDocument")) {

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java?rev=1207152&r1=1207151&r2=1207152&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java Mon Nov 28 13:35:29 2011
@@ -140,6 +140,47 @@ public class TestContainerAwareDetector 
         // With an incorrect filename of a different container type, data trumps filename
         assertTypeByNameAndData("testEXCEL.xlsx", "notOldExcel.xls", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet");
     }
+    
+    /**
+     * Password Protected OLE2 files are fairly straightforward to detect, as they
+     *  have the same structure as regular OLE2 files. (Core streams may be encrypted,
+     *  however.)
+     */
+    public void testDetectProtectedOLE2() throws Exception {
+        assertTypeByData("testEXCEL_protected_passtika.xls", "application/vnd.ms-excel");
+        assertTypeByData("testWORD_protected_passtika.doc", "application/msword");
+        assertTypeByData("testPPT_protected_passtika.ppt", "application/vnd.ms-powerpoint");
+        assertTypeByNameAndData("testEXCEL_protected_passtika.xls", "application/vnd.ms-excel");
+        assertTypeByNameAndData("testWORD_protected_passtika.doc", "application/msword");
+        assertTypeByNameAndData("testPPT_protected_passtika.ppt", "application/vnd.ms-powerpoint");
+    }
+
+    /**
+     * Password Protected OOXML files are much more tricky beasts to work with.
+     * They have a very different structure to regular OOXML files, and instead
+     *  of being ZIP based they are actually an OLE2 file which contains the
+     *  OOXML structure within an encrypted stream.
+     * This makes detecting them much harder...
+     */
+    public void testDetectProtectedOOXML() throws Exception {
+        // Encrypted Microsoft Office OOXML files have OLE magic but
+        //  special streams, so we can tell they're Protected OOXML
+        assertTypeByData("testEXCEL_protected_passtika.xlsx", 
+                "application/x-tika-ooxml-protected");
+        assertTypeByData("testWORD_protected_passtika.docx", 
+                "application/x-tika-ooxml-protected");
+        assertTypeByData("testPPT_protected_passtika.pptx", 
+                "application/x-tika-ooxml-protected");
+        
+        // At the moment, we can't use the name to specialise
+        // See discussions on TIKA-790 for details
+        assertTypeByNameAndData("testEXCEL_protected_passtika.xlsx", 
+                "application/x-tika-ooxml-protected");
+        assertTypeByNameAndData("testWORD_protected_passtika.docx", 
+                "application/x-tika-ooxml-protected");
+        assertTypeByNameAndData("testPPT_protected_passtika.pptx", 
+                "application/x-tika-ooxml-protected");
+    }
 
     /**
      * Check that temporary files created by Tika are removed after