You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2022/04/18 16:57:37 UTC
[tika] branch main updated: TIKA-3666 -- add detection for files protected by Microsoft's Rights Management Service (RMS).
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 62af5295a TIKA-3666 -- add detection for files protected by Microsoft's Rights Management Service (RMS).
62af5295a is described below
commit 62af5295a1246470d90d5f4bf528b1c48e20adcf
Author: tallison <ta...@apache.org>
AuthorDate: Mon Apr 18 12:57:19 2022 -0400
TIKA-3666 -- add detection for files protected by Microsoft's Rights Management Service (RMS).
---
CHANGES.txt | 3 +
.../detect/microsoft/POIFSContainerDetector.java | 66 ++++++++++++++++++----
.../apache/tika/parser/microsoft/OfficeParser.java | 20 +++++--
3 files changed, 74 insertions(+), 15 deletions(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index 21203d8e4..98be3ad79 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -21,6 +21,9 @@ Release 2.4.0 - ???
* Allow configurability of the EmbeddedDocumentExtractor used
by the AutoDetectParser (TIKA-3711).
+ * Add detection for files encrypted by Microsoft's Rights Management Service
+ (TIKA-3666).
+
* Fixed regression in 2.3.0 that led to more embedded filenames
than appropriate being written to the content (TIKA-3711).
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/POIFSContainerDetector.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/POIFSContainerDetector.java
index cd5bcf905..4117a03a3 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/POIFSContainerDetector.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/POIFSContainerDetector.java
@@ -24,6 +24,7 @@ import java.nio.charset.StandardCharsets;
import java.nio.file.Path;
import java.util.Collections;
import java.util.HashSet;
+import java.util.Iterator;
import java.util.Set;
import java.util.regex.Pattern;
@@ -58,6 +59,12 @@ public class POIFSContainerDetector implements Detector {
* The protected OOXML base file format
*/
public static final MediaType OOXML_PROTECTED = application("x-tika-ooxml-protected");
+
+ /**
+ * An MS Office or other file encrypted with DRM inside an OLE container (TIKA-3666)
+ */
+ public static final MediaType DRM_ENCRYPTED = application("x-tika-ole-drm-encrypted");
+
/**
* General embedded document type within an OLE2 container
*/
@@ -238,18 +245,33 @@ public class POIFSContainerDetector implements Detector {
} else if (names.contains("Book")) {
// Excel 95 or older, we won't be able to parse this....
return XLS;
- } else if (names.contains("EncryptedPackage") && names.contains("EncryptionInfo")) {
- // This is a protected OOXML document, which is an OLE2 file
- // with an Encrypted Stream which holds the OOXML data
- // Without decrypting the stream, we can't tell what kind of
- // OOXML file we have. Return a general OOXML Protected type,
- // and hope the name based detection can guess the rest!
-
- //Until Tika 1.23, we also required: && names.contains("\u0006DataSpaces")
- //See TIKA-2982
- return OOXML_PROTECTED;
} else if (names.contains("EncryptedPackage")) {
- return OLE;
+ if (names.contains("EncryptionInfo")) {
+ // This is a protected OOXML document, which is an OLE2 file
+ // with an Encrypted Stream which holds the OOXML data
+ // Without decrypting the stream, we can't tell what kind of
+ // OOXML file we have. Return a general OOXML Protected type,
+ // and hope the name based detection can guess the rest!
+
+ // This is the standard-issue method of encryption for OOXML and
+ // is supported by POI
+
+ //Until Tika 1.23, we also required: && names.contains("\u0006DataSpaces")
+ //See TIKA-2982
+ return OOXML_PROTECTED;
+ } else if (names.contains("\u0006DataSpaces")) {
+ //Try to look for the DRMEncrypted type (TIKA-3666); as of version 5.2.0, this is not
+ // supported by POI, but we should still detect it.
+
+ //Do we also want to look for "DRMEncryptedTransform"?
+ if (findRecursively(root, "DRMEncryptedDataSpace", 0, 10)) {
+ return DRM_ENCRYPTED;
+ } else {
+ return OLE;
+ }
+ } else {
+ return OLE;
+ }
} else if (names.contains("WordDocument")) {
return DOC;
} else if (names.contains("Quill")) {
@@ -317,6 +339,28 @@ public class POIFSContainerDetector implements Detector {
return OLE;
}
+ private static boolean findRecursively(Entry entry, String targetName, int depth,
+ int maxDepth) {
+ if (entry == null) {
+ return false;
+ }
+ if (entry.getName().equals(targetName)) {
+ return true;
+ }
+ if (depth >= maxDepth) {
+ return false;
+ }
+ if (entry instanceof DirectoryEntry) {
+ for (Iterator<Entry> it = ((DirectoryEntry)entry).getEntries(); it.hasNext(); ) {
+ Entry child = it.next();
+ if (findRecursively(child, targetName, depth + 1, maxDepth)) {
+ return true;
+ }
+ }
+ }
+ return false;
+ }
+
/**
* Is this one of the kinds of formats which uses CompObj to
* store all of their data, eg Star Draw, Star Impress or
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
index 3bd22daca..2f0c9245d 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
@@ -23,6 +23,7 @@ import java.nio.charset.StandardCharsets;
import java.security.GeneralSecurityException;
import java.util.Arrays;
import java.util.Collections;
+import java.util.HashMap;
import java.util.HashSet;
import java.util.Locale;
import java.util.Map;
@@ -73,6 +74,7 @@ public class OfficeParser extends AbstractOfficeParser {
new HashSet<>(Arrays.asList(POIFSDocumentType.WORKBOOK.type,
POIFSDocumentType.OLE10_NATIVE.type, POIFSDocumentType.WORDDOCUMENT.type,
POIFSDocumentType.UNKNOWN.type, POIFSDocumentType.ENCRYPTED.type,
+ POIFSDocumentType.DRMENCRYPTED.type,
POIFSDocumentType.POWERPOINT.type, POIFSDocumentType.PUBLISHER.type,
POIFSDocumentType.PROJECT.type, POIFSDocumentType.VISIO.type,
// Works isn't supported
@@ -270,6 +272,10 @@ public class OfficeParser extends AbstractOfficeParser {
} catch (GeneralSecurityException ex) {
throw new EncryptedDocumentException(ex);
}
+ break;
+ case DRMENCRYPTED:
+ throw new EncryptedDocumentException("DRM encrypted document is not yet supported" +
+ " by Apache POI");
default:
// For unsupported / unhandled types, just the metadata
// is extracted, which happened above
@@ -287,6 +293,7 @@ public class OfficeParser extends AbstractOfficeParser {
COMP_OBJ("ole", POIFSContainerDetector.COMP_OBJ),
WORDDOCUMENT("doc", MediaType.application("msword")),
UNKNOWN("unknown", MediaType.application("x-tika-msoffice")),
+ DRMENCRYPTED("ole", MediaType.application("x-tika-ole-drm-encrypted")),
ENCRYPTED("ole", MediaType.application("x-tika-ooxml-protected")),
POWERPOINT("ppt", MediaType.application("vnd.ms-powerpoint")),
PUBLISHER("pub", MediaType.application("x-mspublisher")),
@@ -300,6 +307,13 @@ public class OfficeParser extends AbstractOfficeParser {
SOLIDWORKS_DRAWING("slddrw", MediaType.application("sldworks")),
GRAPH("", MediaType.application("vnd.ms-graph"));
+ static Map<MediaType, POIFSDocumentType> TYPE_MAP = new HashMap<>();
+
+ static {
+ for (POIFSDocumentType t : values()) {
+ TYPE_MAP.put(t.type, t);
+ }
+ }
private final String extension;
private final MediaType type;
@@ -318,10 +332,8 @@ public class OfficeParser extends AbstractOfficeParser {
names.add(entry.getName());
}
MediaType type = POIFSContainerDetector.detect(names, node);
- for (POIFSDocumentType poifsType : values()) {
- if (type.equals(poifsType.type)) {
- return poifsType;
- }
+ if (TYPE_MAP.containsKey(type)) {
+ return TYPE_MAP.get(type);
}
return UNKNOWN;
}