You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2020/04/13 14:55:48 UTC

[tika] 02/02: TIKA-3090 -- extract doc security from ooxml

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git

commit ca6bf65b26ab52c19208457510e16fd5db1ba440
Author: tallison <ta...@apache.org>
AuthorDate: Mon Apr 13 09:33:19 2020 -0400

    TIKA-3090 -- extract doc security from ooxml
    
    # Conflicts:
    #	tika-core/src/main/java/org/apache/tika/metadata/OfficeOpenXMLExtended.java
---
 .../tika/metadata/OfficeOpenXMLExtended.java       |  18 ++++++++++++++--
 .../src/test/java/org/apache/tika/TikaTest.java    |  16 ++++++++++++++
 .../parser/microsoft/ooxml/MetadataExtractor.java  |  24 ++++++++++++++++++++-
 .../parser/microsoft/ooxml/OOXMLParserTest.java    |  11 ++++++++++
 .../test-documents/testWORD_docSecurity.docx       | Bin 0 -> 12861 bytes
 5 files changed, 66 insertions(+), 3 deletions(-)

diff --git a/tika-core/src/main/java/org/apache/tika/metadata/OfficeOpenXMLExtended.java b/tika-core/src/main/java/org/apache/tika/metadata/OfficeOpenXMLExtended.java
index 5829339..da1f484 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/OfficeOpenXMLExtended.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/OfficeOpenXMLExtended.java
@@ -34,6 +34,12 @@ public interface OfficeOpenXMLExtended
     String WORD_PROCESSING_NAMESPACE_URI = "http://schemas.openxmlformats.org/wordprocessingml/2006/main";
     String PREFIX = "extended-properties";
     String WORD_PROCESSING_PREFIX = "w";
+    String SECURITY_NONE = "None";
+    String SECURITY_PASSWORD_PROTECTED = "PasswordProtected";
+    String SECURITY_READ_ONLY_RECOMMENDED = "ReadOnlyRecommended";
+    String SECURITY_READ_ONLY_ENFORCED = "ReadOnlyEnforced";
+    String SECURITY_LOCKED_FOR_ANNOTATIONS = "LockedForAnnotations";
+    String SECURITY_UNKNOWN = "Unknown";
 
     Property TEMPLATE = Property.externalText(
     		PREFIX + Metadata.NAMESPACE_PREFIX_DELIMITER + "Template");
@@ -60,10 +66,18 @@ public interface OfficeOpenXMLExtended
     
     Property APP_VERSION = Property.externalText(
     		PREFIX + Metadata.NAMESPACE_PREFIX_DELIMITER + "AppVersion");
-    
+
+    //Integer flag
     Property DOC_SECURITY = Property.externalInteger(
     		PREFIX + Metadata.NAMESPACE_PREFIX_DELIMITER + "DocSecurity");
-    
+
+    //Human-readable label for the DOC_SECURITY flag, declared as a closed choice over the SECURITY_* constants listed here
+    Property DOC_SECURITY_STRING = Property.externalClosedChoise(
+            PREFIX + Metadata.NAMESPACE_PREFIX_DELIMITER +
+            "DocSecurityString", SECURITY_NONE, SECURITY_PASSWORD_PROTECTED,
+            SECURITY_READ_ONLY_RECOMMENDED, SECURITY_READ_ONLY_ENFORCED,
+            SECURITY_LOCKED_FOR_ANNOTATIONS, SECURITY_UNKNOWN);
+
     Property COMMENTS = Property.externalTextBag(
             WORD_PROCESSING_PREFIX + Metadata.NAMESPACE_PREFIX_DELIMITER + "comments");
 }
\ No newline at end of file
diff --git a/tika-core/src/test/java/org/apache/tika/TikaTest.java b/tika-core/src/test/java/org/apache/tika/TikaTest.java
index efb93b7..5c50ea3 100644
--- a/tika-core/src/test/java/org/apache/tika/TikaTest.java
+++ b/tika-core/src/test/java/org/apache/tika/TikaTest.java
@@ -29,6 +29,7 @@ import java.io.InputStream;
 import java.net.URISyntaxException;
 import java.net.URL;
 import java.nio.file.Path;
+import java.nio.file.Paths;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collection;
@@ -490,4 +491,19 @@ public abstract class TikaTest {
         return null;
     }
 
+    /** Returns the files located directly inside the classpath directory "/test-documents". */
+    public List<Path> getAllTestFiles() {
+        //TODO: fix this to be recursive; for now, just get main files
+        List<Path> testFiles = new ArrayList<>();
+        try {
+            URL testDocs = this.getClass().getResource("/test-documents");
+            File testDocDir = Paths.get(testDocs.toURI()).toFile();
+            for (File child : testDocDir.listFiles()) {
+                testFiles.add(child.toPath());
+            }
+        } catch (URISyntaxException e) {
+            throw new RuntimeException(e);
+        }
+        return testFiles;
+    }
 }
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java
index e5da8ce..9fb8224 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/MetadataExtractor.java
@@ -137,7 +137,10 @@ public class MetadataExtractor {
         setProperty(metadata, OfficeOpenXMLExtended.PRESENTATION_FORMAT, propsHolder.getPresentationFormat());
         setProperty(metadata, OfficeOpenXMLExtended.TEMPLATE, propsHolder.getTemplate());
         setProperty(metadata, OfficeOpenXMLExtended.TOTAL_TIME, totalTime);
-
+        int docSecurityFlag = propsHolder.getDocSecurity();
+        setProperty(metadata, OfficeOpenXMLExtended.DOC_SECURITY, docSecurityFlag);
+        setProperty(metadata, OfficeOpenXMLExtended.DOC_SECURITY_STRING,
+                getDocSecurityString(docSecurityFlag));
         if (propsHolder.getPages() > 0) {
             metadata.set(PagedText.N_PAGES, propsHolder.getPages());
         } else if (propsHolder.getSlides() > 0) {
@@ -171,6 +174,25 @@ public class MetadataExtractor {
         setProperty(metadata, MSOffice.CHARACTER_COUNT_WITH_SPACES, propsHolder.getCharactersWithSpaces());
     }
 
+    /** Translates the numeric DocSecurity flag into its SECURITY_* label (SECURITY_UNKNOWN if unmapped). */
+    private String getDocSecurityString(int docSecurityFlag) {
+        //mappings from: https://exiftool.org/TagNames/OOXML.html and
+        //https://docs.microsoft.com/en-us/dotnet/api/documentformat.openxml.extendedproperties.documentsecurity?view=openxml-2.8.1
+        if (docSecurityFlag == 0) {
+            return OfficeOpenXMLExtended.SECURITY_NONE;
+        } else if (docSecurityFlag == 1) {
+            return OfficeOpenXMLExtended.SECURITY_PASSWORD_PROTECTED;
+        } else if (docSecurityFlag == 2) {
+            return OfficeOpenXMLExtended.SECURITY_READ_ONLY_RECOMMENDED;
+        } else if (docSecurityFlag == 4) {
+            return OfficeOpenXMLExtended.SECURITY_READ_ONLY_ENFORCED;
+        } else if (docSecurityFlag == 8) {
+            return OfficeOpenXMLExtended.SECURITY_LOCKED_FOR_ANNOTATIONS;
+        } else {
+            return OfficeOpenXMLExtended.SECURITY_UNKNOWN;
+        }
+    }
+
     private void extractMetadata(POIXMLProperties.CustomProperties properties,
                                  Metadata metadata) {
         org.openxmlformats.schemas.officeDocument.x2006.customProperties.CTProperties
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
index b48ddae..bdbc9e4 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
@@ -31,6 +31,7 @@ import java.io.File;
 import java.io.InputStream;
 import java.io.PrintStream;
 import java.io.StringWriter;
+import java.nio.file.Path;
 import java.text.DecimalFormatSymbols;
 import java.util.Arrays;
 import java.util.HashMap;
@@ -1786,6 +1787,16 @@ public class OOXMLParserTest extends TikaTest {
         assertContains("2018-09-20", xml);
         assertContains("1996-08-10", xml);
     }
+
+    @Test
+    public void testDocSecurity() throws Exception {
+        String xlsxSecurity = getRecursiveMetadata("protectedFile.xlsx").get(0)
+                .get(OfficeOpenXMLExtended.DOC_SECURITY_STRING);
+        assertEquals(OfficeOpenXMLExtended.SECURITY_PASSWORD_PROTECTED, xlsxSecurity);
+        String docxSecurity = getRecursiveMetadata("testWORD_docSecurity.docx").get(0)
+                .get(OfficeOpenXMLExtended.DOC_SECURITY_STRING);
+        assertEquals(OfficeOpenXMLExtended.SECURITY_READ_ONLY_ENFORCED, docxSecurity);
+    }
 }
 
 
diff --git a/tika-parsers/src/test/resources/test-documents/testWORD_docSecurity.docx b/tika-parsers/src/test/resources/test-documents/testWORD_docSecurity.docx
new file mode 100644
index 0000000..14a8196
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testWORD_docSecurity.docx differ