You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2020/04/13 14:55:48 UTC
[tika] 02/02: TIKA-3090 -- extract doc security from ooxml
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git
commit ca6bf65b26ab52c19208457510e16fd5db1ba440
Author: tallison <ta...@apache.org>
AuthorDate: Mon Apr 13 09:33:19 2020 -0400
TIKA-3090 -- extract doc security from ooxml
# Conflicts:
# tika-core/src/main/java/org/apache/tika/metadata/OfficeOpenXMLExtended.java
---
.../tika/metadata/OfficeOpenXMLExtended.java | 18 ++++++++++++++--
.../src/test/java/org/apache/tika/TikaTest.java | 16 ++++++++++++++
.../parser/microsoft/ooxml/MetadataExtractor.java | 24 ++++++++++++++++++++-
.../parser/microsoft/ooxml/OOXMLParserTest.java | 11 ++++++++++
.../test-documents/testWORD_docSecurity.docx | Bin 0 -> 12861 bytes
5 files changed, 66 insertions(+), 3 deletions(-)
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/OfficeOpenXMLExtended.java b/tika-core/src/main/java/org/apache/tika/metadata/OfficeOpenXMLExtended.java
index 5829339..da1f484 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/OfficeOpenXMLExtended.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/OfficeOpenXMLExtended.java
@@ -34,6 +34,12 @@ public interface OfficeOpenXMLExtended
String WORD_PROCESSING_NAMESPACE_URI = "http://schemas.openxmlformats.org/wordprocessingml/2006/main";
String PREFIX = "extended-properties";
String WORD_PROCESSING_PREFIX = "w";
+ String SECURITY_NONE = "None";
+ String SECURITY_PASSWORD_PROTECTED = "PasswordProtected";
+ String SECURITY_READ_ONLY_RECOMMENDED = "ReadOnlyRecommended";
+ String SECURITY_READ_ONLY_ENFORCED = "ReadOnlyEnforced";
+ String SECURITY_LOCKED_FOR_ANNOTATIONS = "LockedForAnnotations";
+ String SECURITY_UNKNOWN = "Unknown";
Property TEMPLATE = Property.externalText(
PREFIX + Metadata.NAMESPACE_PREFIX_DELIMITER + "Template");
@@ -60,10 +66,18 @@ public interface OfficeOpenXMLExtended
Property APP_VERSION = Property.externalText(
PREFIX + Metadata.NAMESPACE_PREFIX_DELIMITER + "AppVersion");
-
+
+ //Integer flag
Property DOC_SECURITY = Property.externalInteger(
PREFIX + Metadata.NAMESPACE_PREFIX_DELIMITER + "DocSecurity");
-
+
+ //Human readable string explaining doc security flag
+ Property DOC_SECURITY_STRING = Property.externalClosedChoise(
+ PREFIX + Metadata.NAMESPACE_PREFIX_DELIMITER +
+ "DocSecurityString", SECURITY_NONE, SECURITY_PASSWORD_PROTECTED,
+ SECURITY_READ_ONLY_RECOMMENDED, SECURITY_READ_ONLY_ENFORCED,
+ SECURITY_LOCKED_FOR_ANNOTATIONS, SECURITY_UNKNOWN);
+
Property COMMENTS = Property.externalTextBag(
WORD_PROCESSING_PREFIX + Metadata.NAMESPACE_PREFIX_DELIMITER + "comments");
}
\ No newline at end of file
diff --git a/tika-core/src/test/java/org/apache/tika/TikaTest.java b/tika-core/src/test/java/org/apache/tika/TikaTest.java
index efb93b7..5c50ea3 100644
--- a/tika-core/src/test/java/org/apache/tika/TikaTest.java
+++ b/tika-core/src/test/java/org/apache/tika/TikaTest.java
@@ -29,6 +29,7 @@ import java.io.InputStream;
import java.net.URISyntaxException;
import java.net.URL;
import java.nio.file.Path;
+import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
@@ -490,4 +491,19 @@ public abstract class TikaTest {
return null;
}
+ public List<Path> getAllTestFiles() {
+ //for now, just get main files
+ //TODO: fix this to be recursive
+ try {
+ File[] pathArray = Paths.get(this.getClass().getResource("/test-documents")
+ .toURI()).toFile().listFiles();
+ List<Path> paths = new ArrayList<>();
+ for (File f : pathArray) {
+ paths.add(f.toPath());
+ }
+ return paths;
+ } catch (URISyntaxException e) {
+ throw new RuntimeException(e);
+ }
+ }
}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java
index e5da8ce..9fb8224 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java
@@ -137,7 +137,10 @@ public class MetadataExtractor {
setProperty(metadata, OfficeOpenXMLExtended.PRESENTATION_FORMAT, propsHolder.getPresentationFormat());
setProperty(metadata, OfficeOpenXMLExtended.TEMPLATE, propsHolder.getTemplate());
setProperty(metadata, OfficeOpenXMLExtended.TOTAL_TIME, totalTime);
-
+ int docSecurityFlag = propsHolder.getDocSecurity();
+ setProperty(metadata, OfficeOpenXMLExtended.DOC_SECURITY, docSecurityFlag);
+ setProperty(metadata, OfficeOpenXMLExtended.DOC_SECURITY_STRING,
+ getDocSecurityString(docSecurityFlag));
if (propsHolder.getPages() > 0) {
metadata.set(PagedText.N_PAGES, propsHolder.getPages());
} else if (propsHolder.getSlides() > 0) {
@@ -171,6 +174,25 @@ public class MetadataExtractor {
setProperty(metadata, MSOffice.CHARACTER_COUNT_WITH_SPACES, propsHolder.getCharactersWithSpaces());
}
+    /**
+     * Maps the integer DocSecurity flag to the matching
+     * <code>OfficeOpenXMLExtended.SECURITY_*</code> string constant.
+     *
+     * @param docSecurityFlag integer flag read from the extended properties
+     * @return the constant for 0, 1, 2, 4 or 8; SECURITY_UNKNOWN for any other value
+     */
+    private String getDocSecurityString(int docSecurityFlag) {
+        //value-to-name mappings are taken from: https://exiftool.org/TagNames/OOXML.html and
+        //https://docs.microsoft.com/en-us/dotnet/api/documentformat.openxml.extendedproperties.documentsecurity?view=openxml-2.8.1
+        if (docSecurityFlag == 0) {
+            return OfficeOpenXMLExtended.SECURITY_NONE;
+        }
+        if (docSecurityFlag == 1) {
+            return OfficeOpenXMLExtended.SECURITY_PASSWORD_PROTECTED;
+        }
+        if (docSecurityFlag == 2) {
+            return OfficeOpenXMLExtended.SECURITY_READ_ONLY_RECOMMENDED;
+        }
+        if (docSecurityFlag == 4) {
+            return OfficeOpenXMLExtended.SECURITY_READ_ONLY_ENFORCED;
+        }
+        if (docSecurityFlag == 8) {
+            return OfficeOpenXMLExtended.SECURITY_LOCKED_FOR_ANNOTATIONS;
+        }
+        //anything else is not one of the recognised values
+        return OfficeOpenXMLExtended.SECURITY_UNKNOWN;
+    }
+
private void extractMetadata(POIXMLProperties.CustomProperties properties,
Metadata metadata) {
org.openxmlformats.schemas.officeDocument.x2006.customProperties.CTProperties
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
index b48ddae..bdbc9e4 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
@@ -31,6 +31,7 @@ import java.io.File;
import java.io.InputStream;
import java.io.PrintStream;
import java.io.StringWriter;
+import java.nio.file.Path;
import java.text.DecimalFormatSymbols;
import java.util.Arrays;
import java.util.HashMap;
@@ -1786,6 +1787,16 @@ public class OOXMLParserTest extends TikaTest {
assertContains("2018-09-20", xml);
assertContains("1996-08-10", xml);
}
+
+    /**
+     * Checks the DOC_SECURITY_STRING metadata value reported for two test
+     * documents: protectedFile.xlsx and testWORD_docSecurity.docx.
+     */
+    @Test
+    public void testDocSecurity() throws Exception {
+        String xlsxSecurity = getRecursiveMetadata("protectedFile.xlsx")
+                .get(0).get(OfficeOpenXMLExtended.DOC_SECURITY_STRING);
+        assertEquals(OfficeOpenXMLExtended.SECURITY_PASSWORD_PROTECTED, xlsxSecurity);
+
+        String docxSecurity = getRecursiveMetadata("testWORD_docSecurity.docx")
+                .get(0).get(OfficeOpenXMLExtended.DOC_SECURITY_STRING);
+        assertEquals(OfficeOpenXMLExtended.SECURITY_READ_ONLY_ENFORCED, docxSecurity);
+    }
}
diff --git a/tika-parsers/src/test/resources/test-documents/testWORD_docSecurity.docx b/tika-parsers/src/test/resources/test-documents/testWORD_docSecurity.docx
new file mode 100644
index 0000000..14a8196
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testWORD_docSecurity.docx differ