You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2023/06/20 19:09:41 UTC

[tika] branch main updated: TIKA-3666 -- add detection of OLE2 drm encrypted files (#1204)

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new 3471d51ef TIKA-3666 -- add detection of OLE2 drm encrypted files (#1204)
3471d51ef is described below

commit 3471d51ef77dc567c2958a873b131eca554a2882
Author: Tim Allison <ta...@apache.org>
AuthorDate: Tue Jun 20 15:09:36 2023 -0400

    TIKA-3666 -- add detection of OLE2 drm encrypted files (#1204)
    
    Thanks to Ross Spencer for reopening this issue and linking it to an existing POI issue.
---
 .../detect/microsoft/POIFSContainerDetector.java   |  61 ++++++++++++---------
 .../tika/parser/microsoft/WordParserTest.java      |  10 ++++
 .../test-documents/testWORD_protected_drm.doc      | Bin 0 -> 72192 bytes
 3 files changed, 44 insertions(+), 27 deletions(-)

diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/POIFSContainerDetector.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/POIFSContainerDetector.java
index d0571110c..6d082cc4c 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/POIFSContainerDetector.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/POIFSContainerDetector.java
@@ -209,6 +209,40 @@ public class POIFSContainerDetector implements Detector {
         if (names == null || names.size() == 0) {
             return OLE;
         }
+        //check for encrypted/password-protected containers first, before other detection
+        if (names.contains("\u0006DataSpaces")) {
+            //OLE2 drm encrypted -- TIKA-3666
+            if (findRecursively(root, "\tDRMDataSpace", 0, 10)) {
+                return DRM_ENCRYPTED;
+            }
+        }
+
+        if (names.contains("EncryptedPackage")) {
+            if (names.contains("EncryptionInfo")) {
+                // This is a protected OOXML document, which is an OLE2 file
+                //  with an Encrypted Stream which holds the OOXML data
+                // Without decrypting the stream, we can't tell what kind of
+                //  OOXML file we have. Return a general OOXML Protected type,
+                //  and hope the name based detection can guess the rest!
+
+                // This is the standard issue method of encryption for ooxml and
+                // is supported by POI
+
+                //Until Tika 1.23, we also required: && names.contains("\u0006DataSpaces")
+                //See TIKA-2982
+                return OOXML_PROTECTED;
+            } else if (names.contains("\u0006DataSpaces")) {
+                //Try to look for the DRMEncrypted type (TIKA-3666); as of 5.2.0, this is not
+                // supported by POI, but we should still detect it.
+
+                //Do we also want to look for "DRMEncryptedTransform"?
+                if (findRecursively(root, "DRMEncryptedDataSpace", 0, 10)) {
+                    return DRM_ENCRYPTED;
+                }
+            }
+        }
+
+
         for (String workbookEntryName : InternalWorkbook.WORKBOOK_DIR_ENTRY_NAMES) {
             if (names.contains(workbookEntryName)) {
                 MediaType tmp = processCompObjFormatType(root);
@@ -247,33 +281,6 @@ public class POIFSContainerDetector implements Detector {
         } else if (names.contains("Book")) {
             // Excel 95 or older, we won't be able to parse this....
             return XLS;
-        } else if (names.contains("EncryptedPackage")) {
-            if (names.contains("EncryptionInfo")) {
-                // This is a protected OOXML document, which is an OLE2 file
-                //  with an Encrypted Stream which holds the OOXML data
-                // Without decrypting the stream, we can't tell what kind of
-                //  OOXML file we have. Return a general OOXML Protected type,
-                //  and hope the name based detection can guess the rest!
-
-                // This is the standard issue method of encryption for ooxml and
-                // is supported by POI
-
-                //Until Tika 1.23, we also required: && names.contains("\u0006DataSpaces")
-                //See TIKA-2982
-                return OOXML_PROTECTED;
-            } else if (names.contains("\u0006DataSpaces")) {
-                //Try to look for the DRMEncrypted type (TIKA-3666); as of 5.2.0, this is not
-                // supported by POI, but we should still detect it.
-
-                //Do we also want to look for "DRMEncryptedTransform"?
-                if (findRecursively(root, "DRMEncryptedDataSpace", 0, 10)) {
-                    return DRM_ENCRYPTED;
-                } else {
-                    return OLE;
-                }
-            } else {
-                return OLE;
-            }
         } else if (names.contains("WordDocument")) {
             return DOC;
         } else if (names.contains("Quill")) {
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
index 51d574e82..67f0d4e95 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
@@ -18,6 +18,7 @@ package org.apache.tika.parser.microsoft;
 
 import static org.junit.jupiter.api.Assertions.assertEquals;
 import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertThrows;
 import static org.junit.jupiter.api.Assertions.assertTrue;
 import static org.junit.jupiter.api.Assertions.fail;
 
@@ -36,6 +37,7 @@ import org.xml.sax.ContentHandler;
 
 import org.apache.tika.TikaTest;
 import org.apache.tika.config.TikaConfig;
+import org.apache.tika.exception.EncryptedDocumentException;
 import org.apache.tika.metadata.DublinCore;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.Office;
@@ -666,4 +668,12 @@ public class WordParserTest extends TikaTest {
         assertContains("Paragraph one", getXML(
                 "testWORD_specialControlCharacter1415.doc").xml);
     }
+
+    @Test
+    public void testEncryptedDRM() throws Exception {
+        assertThrows(EncryptedDocumentException.class, () -> {
+            //test file from: https://bz.apache.org/bugzilla/show_bug.cgi?id=62848
+            getRecursiveMetadata("testWORD_protected_drm.doc");
+        });
+    }
 }
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testWORD_protected_drm.doc b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testWORD_protected_drm.doc
new file mode 100644
index 000000000..b1983b53b
Binary files /dev/null and b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testWORD_protected_drm.doc differ