You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2011/11/28 14:35:30 UTC
svn commit: r1207152 - in /tika/trunk:
tika-core/src/main/resources/org/apache/tika/mime/
tika-parsers/src/main/java/org/apache/tika/parser/microsoft/
tika-parsers/src/test/java/org/apache/tika/detect/
Author: nick
Date: Mon Nov 28 13:35:29 2011
New Revision: 1207152
URL: http://svn.apache.org/viewvc?rev=1207152&view=rev
Log:
TIKA-791 POIFS Container Detector support for encrypted OOXML files, plus tests and new (tika specific) mimetype
Modified:
tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
Modified: tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml?rev=1207152&r1=1207151&r2=1207152&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml (original)
+++ tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml Mon Nov 28 13:35:29 2011
@@ -2774,6 +2774,13 @@
</magic>
</mime-type>
+ <!-- Note - password protected OOXML files are actually stored in -->
+ <!-- an OLE2 (application/x-tika-msoffice) container -->
+ <mime-type type="application/x-tika-ooxml-protected">
+ <sub-class-of type="application/x-tika-ooxml"/>
+ <_comment>Password Protected OOXML File</_comment>
+ </mime-type>
+
<mime-type type="application/x-ustar">
<glob pattern="*.ustar"/>
</mime-type>
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java?rev=1207152&r1=1207151&r2=1207152&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java Mon Nov 28 13:35:29 2011
@@ -80,7 +80,7 @@ public class OfficeParser extends Abstra
OLE10_NATIVE("ole", MediaType.application("x-tika-msoffice")),
WORDDOCUMENT("doc", MediaType.application("msword")),
UNKNOWN("unknown", MediaType.application("x-tika-msoffice")),
- ENCRYPTED("ole", MediaType.application("x-tika-msoffice")),
+ ENCRYPTED("ole", MediaType.application("x-tika-ooxml-protected")),
POWERPOINT("ppt", MediaType.application("vnd.ms-powerpoint")),
PUBLISHER("pub", MediaType.application("x-mspublisher")),
PROJECT("mpp", MediaType.application("vnd.ms-project")),
@@ -242,10 +242,13 @@ public class OfficeParser extends Abstra
Decryptor d = Decryptor.getInstance(info);
try {
+ // TODO Allow the user to specify the password via the ParseContext
if (!d.verifyPassword(Decryptor.DEFAULT_PASSWORD)) {
throw new EncryptedDocumentException();
}
+ // Decrypt the OLE2 stream, and delegate the resulting OOXML
+ // file to the regular OOXML parser for normal handling
OOXMLParser parser = new OOXMLParser();
parser.parse(d.getDataStream(root), new EmbeddedContentHandler(
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java?rev=1207152&r1=1207151&r2=1207152&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java Mon Nov 28 13:35:29 2011
@@ -47,6 +47,9 @@ public class POIFSContainerDetector impl
/** The OLE base file format */
public static final MediaType OLE = application("x-tika-msoffice");
+
+ /** The protected OOXML base file format */
+ public static final MediaType OOXML_PROTECTED = application("x-tika-ooxml-protected");
/** Microsoft Excel */
public static final MediaType XLS = application("vnd.ms-excel");
@@ -120,6 +123,15 @@ public class POIFSContainerDetector impl
if (names != null) {
if (names.contains("Workbook")) {
return XLS;
+ } else if (names.contains("EncryptedPackage") &&
+ names.contains("EncryptionInfo") &&
+ names.contains("\u0006DataSpaces")) {
+ // This is a protected OOXML document, which is an OLE2 file
+ // with an Encrypted Stream which holds the OOXML data.
+ // Without decrypting the stream, we can't tell what kind of
+ // OOXML file we have. Return a general OOXML Protected type,
+ // and hope the name-based detection can guess the rest!
+ return OOXML_PROTECTED;
} else if (names.contains("EncryptedPackage")) {
return OLE;
} else if (names.contains("WordDocument")) {
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java?rev=1207152&r1=1207151&r2=1207152&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java Mon Nov 28 13:35:29 2011
@@ -140,6 +140,47 @@ public class TestContainerAwareDetector
// With an incorrect filename of a different container type, data trumps filename
assertTypeByNameAndData("testEXCEL.xlsx", "notOldExcel.xls", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet");
}
+
+ /**
+ * Password Protected OLE2 files are fairly straightforward to detect, as they
+ * have the same structure as regular OLE2 files. (Core streams may be encrypted,
+ * however.)
+ */
+ public void testDetectProtectedOLE2() throws Exception {
+ assertTypeByData("testEXCEL_protected_passtika.xls", "application/vnd.ms-excel");
+ assertTypeByData("testWORD_protected_passtika.doc", "application/msword");
+ assertTypeByData("testPPT_protected_passtika.ppt", "application/vnd.ms-powerpoint");
+ assertTypeByNameAndData("testEXCEL_protected_passtika.xls", "application/vnd.ms-excel");
+ assertTypeByNameAndData("testWORD_protected_passtika.doc", "application/msword");
+ assertTypeByNameAndData("testPPT_protected_passtika.ppt", "application/vnd.ms-powerpoint");
+ }
+
+ /**
+ * Password Protected OOXML files are much more tricky beasts to work with.
+ * They have a very different structure to regular OOXML files, and instead
+ * of being ZIP based they are actually an OLE2 file which contains the
+ * OOXML structure within an encrypted stream.
+ * This makes detecting them much harder...
+ */
+ public void testDetectProtectedOOXML() throws Exception {
+ // Encrypted Microsoft Office OOXML files have OLE magic but
+ // special streams, so we can tell they're Protected OOXML
+ assertTypeByData("testEXCEL_protected_passtika.xlsx",
+ "application/x-tika-ooxml-protected");
+ assertTypeByData("testWORD_protected_passtika.docx",
+ "application/x-tika-ooxml-protected");
+ assertTypeByData("testPPT_protected_passtika.pptx",
+ "application/x-tika-ooxml-protected");
+
+ // At the moment, we can't use the name to specialise
+ // See discussions on TIKA-790 for details
+ assertTypeByNameAndData("testEXCEL_protected_passtika.xlsx",
+ "application/x-tika-ooxml-protected");
+ assertTypeByNameAndData("testWORD_protected_passtika.docx",
+ "application/x-tika-ooxml-protected");
+ assertTypeByNameAndData("testPPT_protected_passtika.pptx",
+ "application/x-tika-ooxml-protected");
+ }
/**
* Check that temporary files created by Tika are removed after