You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2023/06/20 18:37:29 UTC

[tika] branch TIKA-3666 created (now 3d0d026ab)

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a change to branch TIKA-3666
in repository https://gitbox.apache.org/repos/asf/tika.git


      at 3d0d026ab TIKA-3666 -- add detection of OLE2 drm encrypted files

This branch includes the following new commits:

     new 3d0d026ab TIKA-3666 -- add detection of OLE2 drm encrypted files

The 1 revision listed above as "new" is entirely new to this
repository and will be described in a separate email.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.



[tika] 01/01: TIKA-3666 -- add detection of OLE2 drm encrypted files

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-3666
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 3d0d026ab9516c77d6321e3d7e7851a82fc97e05
Author: tallison <ta...@apache.org>
AuthorDate: Tue Jun 20 14:37:14 2023 -0400

    TIKA-3666 -- add detection of OLE2 drm encrypted files
---
 .../detect/microsoft/POIFSContainerDetector.java   |  61 ++++++++++++---------
 .../tika/parser/microsoft/WordParserTest.java      |  10 ++++
 .../test-documents/testWORD_protected_drm.doc      | Bin 0 -> 72192 bytes
 3 files changed, 44 insertions(+), 27 deletions(-)

diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/POIFSContainerDetector.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/POIFSContainerDetector.java
index d0571110c..6d082cc4c 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/POIFSContainerDetector.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/POIFSContainerDetector.java
@@ -209,6 +209,40 @@ public class POIFSContainerDetector implements Detector {
         if (names == null || names.size() == 0) {
             return OLE;
         }
+        //figure out if encrypted/pw protected first
+        if (names.contains("\u0006DataSpaces")) {
+            //OLE2 drm encrypted -- TIKA-3666
+            if (findRecursively(root, "\tDRMDataSpace", 0, 10)) {
+                return DRM_ENCRYPTED;
+            }
+        }
+
+        if (names.contains("EncryptedPackage")) {
+            if (names.contains("EncryptionInfo")) {
+                // This is a protected OOXML document, which is an OLE2 file
+                //  with an Encrypted Stream which holds the OOXML data
+                // Without decrypting the stream, we can't tell what kind of
+                //  OOXML file we have. Return a general OOXML Protected type,
+                //  and hope the name based detection can guess the rest!
+
+                // This is the standard issue method of encryption for ooxml and
+                // is supported by POI
+
+                //Until Tika 1.23, we also required: && names.contains("\u0006DataSpaces")
+                //See TIKA-2982
+                return OOXML_PROTECTED;
+            } else if (names.contains("\u0006DataSpaces")) {
+                //Try to look for the DRMEncrypted type (TIKA-3666); as of 5.2.0, this is not
+                // supported by POI, but we should still detect it.
+
+                //Do we also want to look for "DRMEncryptedTransform"?
+                if (findRecursively(root, "DRMEncryptedDataSpace", 0, 10)) {
+                    return DRM_ENCRYPTED;
+                }
+            }
+        }
+
+
         for (String workbookEntryName : InternalWorkbook.WORKBOOK_DIR_ENTRY_NAMES) {
             if (names.contains(workbookEntryName)) {
                 MediaType tmp = processCompObjFormatType(root);
@@ -247,33 +281,6 @@ public class POIFSContainerDetector implements Detector {
         } else if (names.contains("Book")) {
             // Excel 95 or older, we won't be able to parse this....
             return XLS;
-        } else if (names.contains("EncryptedPackage")) {
-            if (names.contains("EncryptionInfo")) {
-                // This is a protected OOXML document, which is an OLE2 file
-                //  with an Encrypted Stream which holds the OOXML data
-                // Without decrypting the stream, we can't tell what kind of
-                //  OOXML file we have. Return a general OOXML Protected type,
-                //  and hope the name based detection can guess the rest!
-
-                // This is the standard issue method of encryption for ooxml and
-                // is supported by POI
-
-                //Until Tika 1.23, we also required: && names.contains("\u0006DataSpaces")
-                //See TIKA-2982
-                return OOXML_PROTECTED;
-            } else if (names.contains("\u0006DataSpaces")) {
-                //Try to look for the DRMEncrypted type (TIKA-3666); as of 5.2.0, this is not
-                // supported by POI, but we should still detect it.
-
-                //Do we also want to look for "DRMEncryptedTransform"?
-                if (findRecursively(root, "DRMEncryptedDataSpace", 0, 10)) {
-                    return DRM_ENCRYPTED;
-                } else {
-                    return OLE;
-                }
-            } else {
-                return OLE;
-            }
         } else if (names.contains("WordDocument")) {
             return DOC;
         } else if (names.contains("Quill")) {
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
index 51d574e82..67f0d4e95 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
@@ -18,6 +18,7 @@ package org.apache.tika.parser.microsoft;
 
 import static org.junit.jupiter.api.Assertions.assertEquals;
 import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertThrows;
 import static org.junit.jupiter.api.Assertions.assertTrue;
 import static org.junit.jupiter.api.Assertions.fail;
 
@@ -36,6 +37,7 @@ import org.xml.sax.ContentHandler;
 
 import org.apache.tika.TikaTest;
 import org.apache.tika.config.TikaConfig;
+import org.apache.tika.exception.EncryptedDocumentException;
 import org.apache.tika.metadata.DublinCore;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.Office;
@@ -666,4 +668,12 @@ public class WordParserTest extends TikaTest {
         assertContains("Paragraph one", getXML(
                 "testWORD_specialControlCharacter1415.doc").xml);
     }
+
+    @Test
+    public void testEncryptedDRM() throws Exception {
+        assertThrows(EncryptedDocumentException.class, () -> {
+            //test file from: https://bz.apache.org/bugzilla/show_bug.cgi?id=62848
+            getRecursiveMetadata("testWORD_protected_drm.doc");
+        });
+    }
 }
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testWORD_protected_drm.doc b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testWORD_protected_drm.doc
new file mode 100644
index 000000000..b1983b53b
Binary files /dev/null and b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testWORD_protected_drm.doc differ