You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2022/04/18 16:57:37 UTC
[tika] branch main updated: TIKA-3666 -- add detection for files protected by Microsoft's Rights Management Service (RMS).

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new 62af5295a TIKA-3666 -- add detection for files protected by Microsoft's Rights Management Service (RMS).
62af5295a is described below

commit 62af5295a1246470d90d5f4bf528b1c48e20adcf
Author: tallison <ta...@apache.org>
AuthorDate: Mon Apr 18 12:57:19 2022 -0400

    TIKA-3666 -- add detection for files protected by Microsoft's Rights Management Service (RMS).
---
 CHANGES.txt                                        |  3 +
 .../detect/microsoft/POIFSContainerDetector.java   | 66 ++++++++++++++++++----
 .../apache/tika/parser/microsoft/OfficeParser.java | 20 +++++--
 3 files changed, 74 insertions(+), 15 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index 21203d8e4..98be3ad79 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -21,6 +21,9 @@ Release 2.4.0 - ???
    * Allow configurability of the EmbeddedDocumentExtractor used
      by the AutoDetectParser (TIKA-3711).
 
+   * Add detection for files encrypted by Microsoft's Rights Management Service
+     (TIKA-3666).
+
    * Fixed regression in 2.3.0 that led to more embedded filenames
      than appropriate being written to the content (TIKA-3711).
 
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/POIFSContainerDetector.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/POIFSContainerDetector.java
index cd5bcf905..4117a03a3 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/POIFSContainerDetector.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/POIFSContainerDetector.java
@@ -24,6 +24,7 @@ import java.nio.charset.StandardCharsets;
 import java.nio.file.Path;
 import java.util.Collections;
 import java.util.HashSet;
+import java.util.Iterator;
 import java.util.Set;
 import java.util.regex.Pattern;
 
@@ -58,6 +59,12 @@ public class POIFSContainerDetector implements Detector {
      * The protected OOXML base file format
      */
     public static final MediaType OOXML_PROTECTED = application("x-tika-ooxml-protected");
+
+    /**
+     * TIKA-3666 MSOffice or other file encrypted with DRM in an OLE container
+     */
+    public static final MediaType DRM_ENCRYPTED = application("x-tika-ole-drm-encrypted");
+
     /**
      * General embedded document type within an OLE2 container
      */
@@ -238,18 +245,33 @@ public class POIFSContainerDetector implements Detector {
         } else if (names.contains("Book")) {
             // Excel 95 or older, we won't be able to parse this....
             return XLS;
-        } else if (names.contains("EncryptedPackage") && names.contains("EncryptionInfo")) {
-            // This is a protected OOXML document, which is an OLE2 file
-            //  with an Encrypted Stream which holds the OOXML data
-            // Without decrypting the stream, we can't tell what kind of
-            //  OOXML file we have. Return a general OOXML Protected type,
-            //  and hope the name based detection can guess the rest!
-
-            //Until Tika 1.23, we also required: && names.contains("\u0006DataSpaces")
-            //See TIKA-2982
-            return OOXML_PROTECTED;
         } else if (names.contains("EncryptedPackage")) {
-            return OLE;
+            if (names.contains("EncryptionInfo")) {
+                // This is a protected OOXML document, which is an OLE2 file
+                //  with an Encrypted Stream which holds the OOXML data
+                // Without decrypting the stream, we can't tell what kind of
+                //  OOXML file we have. Return a general OOXML Protected type,
+                //  and hope the name based detection can guess the rest!
+
+                // This is the standard issue method of encryption for ooxml and
+                // is supported by POI
+
+                //Until Tika 1.23, we also required: && names.contains("\u0006DataSpaces")
+                //See TIKA-2982
+                return OOXML_PROTECTED;
+            } else if (names.contains("\u0006DataSpaces")) {
+                //Look for the DRMEncrypted type (TIKA-3666); as of POI 5.2.0, decrypting
+                // this is not supported by POI, but we should still detect it.
+
+                //Do we also want to look for "DRMEncryptedTransform"?
+                if (findRecursively(root, "DRMEncryptedDataSpace", 0, 10)) {
+                    return DRM_ENCRYPTED;
+                } else {
+                    return OLE;
+                }
+            } else {
+                return OLE;
+            }
         } else if (names.contains("WordDocument")) {
             return DOC;
         } else if (names.contains("Quill")) {
@@ -317,6 +339,28 @@ public class POIFSContainerDetector implements Detector {
         return OLE;
     }
 
+    private static boolean findRecursively(Entry entry, String targetName, int depth,
+                                           int maxDepth) { // depth-limited, depth-first search for an entry named targetName
+        if (entry == null) { // nothing to inspect
+            return false;
+        }
+        if (entry.getName().equals(targetName)) { // compares this entry's own name via equals()
+            return true;
+        }
+        if (depth >= maxDepth) { // tested after the name check, so an entry at maxDepth can still match but is not descended into
+            return false;
+        }
+        if (entry instanceof DirectoryEntry) { // only DirectoryEntry nodes are descended into
+            for (Iterator<Entry> it = ((DirectoryEntry)entry).getEntries(); it.hasNext(); ) {
+                Entry child = it.next();
+                if (findRecursively(child, targetName, depth + 1, maxDepth)) { // children are one level deeper
+                    return true; // stop at the first match
+                }
+            }
+        }
+        return false; // no match in this subtree within the depth limit
+    }
+
     /**
      * Is this one of the kinds of formats which uses CompObj to
      * store all of their data, eg Star Draw, Star Impress or
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
index 3bd22daca..2f0c9245d 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
@@ -23,6 +23,7 @@ import java.nio.charset.StandardCharsets;
 import java.security.GeneralSecurityException;
 import java.util.Arrays;
 import java.util.Collections;
+import java.util.HashMap;
 import java.util.HashSet;
 import java.util.Locale;
 import java.util.Map;
@@ -73,6 +74,7 @@ public class OfficeParser extends AbstractOfficeParser {
             new HashSet<>(Arrays.asList(POIFSDocumentType.WORKBOOK.type,
                     POIFSDocumentType.OLE10_NATIVE.type, POIFSDocumentType.WORDDOCUMENT.type,
                     POIFSDocumentType.UNKNOWN.type, POIFSDocumentType.ENCRYPTED.type,
+                    POIFSDocumentType.DRMENCRYPTED.type,
                     POIFSDocumentType.POWERPOINT.type, POIFSDocumentType.PUBLISHER.type,
                     POIFSDocumentType.PROJECT.type, POIFSDocumentType.VISIO.type,
                     // Works isn't supported
@@ -270,6 +272,10 @@ public class OfficeParser extends AbstractOfficeParser {
                 } catch (GeneralSecurityException ex) {
                     throw new EncryptedDocumentException(ex);
                 }
+                break;
+            case DRMENCRYPTED:
+                throw new EncryptedDocumentException("DRM encrypted document is not yet supported" +
+                        " by Apache POI");
             default:
                 // For unsupported / unhandled types, just the metadata
                 //  is extracted, which happened above
@@ -287,6 +293,7 @@ public class OfficeParser extends AbstractOfficeParser {
         COMP_OBJ("ole", POIFSContainerDetector.COMP_OBJ),
         WORDDOCUMENT("doc", MediaType.application("msword")),
         UNKNOWN("unknown", MediaType.application("x-tika-msoffice")),
+        DRMENCRYPTED("ole", MediaType.application("x-tika-ole-drm-encrypted")),
         ENCRYPTED("ole", MediaType.application("x-tika-ooxml-protected")),
         POWERPOINT("ppt", MediaType.application("vnd.ms-powerpoint")),
         PUBLISHER("pub", MediaType.application("x-mspublisher")),
@@ -300,6 +307,13 @@ public class OfficeParser extends AbstractOfficeParser {
         SOLIDWORKS_DRAWING("slddrw", MediaType.application("sldworks")),
         GRAPH("", MediaType.application("vnd.ms-graph"));
 
+        static Map<MediaType, POIFSDocumentType> TYPE_MAP = new HashMap<>();
+
+        static {
+            for (POIFSDocumentType t : values()) {
+                TYPE_MAP.put(t.type, t);
+            }
+        }
         private final String extension;
         private final MediaType type;
 
@@ -318,10 +332,8 @@ public class OfficeParser extends AbstractOfficeParser {
                 names.add(entry.getName());
             }
             MediaType type = POIFSContainerDetector.detect(names, node);
-            for (POIFSDocumentType poifsType : values()) {
-                if (type.equals(poifsType.type)) {
-                    return poifsType;
-                }
+            if (TYPE_MAP.containsKey(type)) {
+                return TYPE_MAP.get(type);
             }
             return UNKNOWN;
         }