You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2023/06/20 19:09:41 UTC
[tika] branch main updated: TIKA-3666 -- add detection of OLE2 drm encrypted files (#1204)
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 3471d51ef TIKA-3666 -- add detection of OLE2 drm encrypted files (#1204)
3471d51ef is described below
commit 3471d51ef77dc567c2958a873b131eca554a2882
Author: Tim Allison <ta...@apache.org>
AuthorDate: Tue Jun 20 15:09:36 2023 -0400
TIKA-3666 -- add detection of OLE2 drm encrypted files (#1204)
Thanks to Ross Spencer for reopening this issue and linking it to an existing POI issue.
---
.../detect/microsoft/POIFSContainerDetector.java | 61 ++++++++++++---------
.../tika/parser/microsoft/WordParserTest.java | 10 ++++
.../test-documents/testWORD_protected_drm.doc | Bin 0 -> 72192 bytes
3 files changed, 44 insertions(+), 27 deletions(-)
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/POIFSContainerDetector.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/POIFSContainerDetector.java
index d0571110c..6d082cc4c 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/POIFSContainerDetector.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/POIFSContainerDetector.java
@@ -209,6 +209,40 @@ public class POIFSContainerDetector implements Detector {
if (names == null || names.size() == 0) {
return OLE;
}
+ //figure out if encrypted/pw protected first
+ if (names.contains("\u0006DataSpaces")) {
+ //OLE2 drm encrypted -- TIKA-3666
+ if (findRecursively(root, "\tDRMDataSpace", 0, 10)) {
+ return DRM_ENCRYPTED;
+ }
+ }
+
+ if (names.contains("EncryptedPackage")) {
+ if (names.contains("EncryptionInfo")) {
+ // This is a protected OOXML document, which is an OLE2 file
+ // with an Encrypted Stream which holds the OOXML data
+ // Without decrypting the stream, we can't tell what kind of
+ // OOXML file we have. Return a general OOXML Protected type,
+ // and hope the name based detection can guess the rest!
+
+ // This is the standard issue method of encryption for ooxml and
+ // is supported by POI
+
+ //Until Tika 1.23, we also required: && names.contains("\u0006DataSpaces")
+ //See TIKA-2982
+ return OOXML_PROTECTED;
+ } else if (names.contains("\u0006DataSpaces")) {
+ //Try to look for the DRMEncrypted type (TIKA-3666); as of 5.2.0, this is not
+ // supported by POI, but we should still detect it.
+
+ //Do we also want to look for "DRMEncryptedTransform"?
+ if (findRecursively(root, "DRMEncryptedDataSpace", 0, 10)) {
+ return DRM_ENCRYPTED;
+ }
+ }
+ }
+
+
for (String workbookEntryName : InternalWorkbook.WORKBOOK_DIR_ENTRY_NAMES) {
if (names.contains(workbookEntryName)) {
MediaType tmp = processCompObjFormatType(root);
@@ -247,33 +281,6 @@ public class POIFSContainerDetector implements Detector {
} else if (names.contains("Book")) {
// Excel 95 or older, we won't be able to parse this....
return XLS;
- } else if (names.contains("EncryptedPackage")) {
- if (names.contains("EncryptionInfo")) {
- // This is a protected OOXML document, which is an OLE2 file
- // with an Encrypted Stream which holds the OOXML data
- // Without decrypting the stream, we can't tell what kind of
- // OOXML file we have. Return a general OOXML Protected type,
- // and hope the name based detection can guess the rest!
-
- // This is the standard issue method of encryption for ooxml and
- // is supported by POI
-
- //Until Tika 1.23, we also required: && names.contains("\u0006DataSpaces")
- //See TIKA-2982
- return OOXML_PROTECTED;
- } else if (names.contains("\u0006DataSpaces")) {
- //Try to look for the DRMEncrypted type (TIKA-3666); as of 5.2.0, this is not
- // supported by POI, but we should still detect it.
-
- //Do we also want to look for "DRMEncryptedTransform"?
- if (findRecursively(root, "DRMEncryptedDataSpace", 0, 10)) {
- return DRM_ENCRYPTED;
- } else {
- return OLE;
- }
- } else {
- return OLE;
- }
} else if (names.contains("WordDocument")) {
return DOC;
} else if (names.contains("Quill")) {
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
index 51d574e82..67f0d4e95 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
@@ -18,6 +18,7 @@ package org.apache.tika.parser.microsoft;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertThrows;
import static org.junit.jupiter.api.Assertions.assertTrue;
import static org.junit.jupiter.api.Assertions.fail;
@@ -36,6 +37,7 @@ import org.xml.sax.ContentHandler;
import org.apache.tika.TikaTest;
import org.apache.tika.config.TikaConfig;
+import org.apache.tika.exception.EncryptedDocumentException;
import org.apache.tika.metadata.DublinCore;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Office;
@@ -666,4 +668,12 @@ public class WordParserTest extends TikaTest {
assertContains("Paragraph one", getXML(
"testWORD_specialControlCharacter1415.doc").xml);
}
+
+ @Test
+ public void testEncryptedDRM() throws Exception {
+ assertThrows(EncryptedDocumentException.class, () -> {
+ //test file from: https://bz.apache.org/bugzilla/show_bug.cgi?id=62848
+ getRecursiveMetadata("testWORD_protected_drm.doc");
+ });
+ }
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testWORD_protected_drm.doc b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testWORD_protected_drm.doc
new file mode 100644
index 000000000..b1983b53b
Binary files /dev/null and b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testWORD_protected_drm.doc differ