You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2023/06/15 19:35:51 UTC

[tika] branch TIKA-4082 created (now 4fd6fcd9f)

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a change to branch TIKA-4082
in repository https://gitbox.apache.org/repos/asf/tika.git


      at 4fd6fcd9f TIKA-4082 -- allow users to choose to have the PDFParser throw an EncryptedDocumentException for PDFs that have an AssociatedFile relationship of EncryptedPayload

This branch includes the following new commits:

     new 4fd6fcd9f TIKA-4082 -- allow users to choose to have the PDFParser throw an EncryptedDocumentException for PDFs that have an AssociatedFile relationship of EncryptedPayload

The 1 revision listed above as "new" is entirely new to this
repository and will be described in a separate email.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.



[tika] 01/01: TIKA-4082 -- allow users to choose to have the PDFParser throw an EncryptedDocumentException for PDFs that have an AssociatedFile relationship of EncryptedPayload

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-4082
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 4fd6fcd9f53637c1f5e34dc528b6801a27daf37e
Author: tballison <ta...@apache.org>
AuthorDate: Thu Jun 15 15:35:39 2023 -0400

    TIKA-4082 -- allow users to choose to have the PDFParser throw an EncryptedDocumentException for PDFs that have an AssociatedFile relationship of EncryptedPayload
---
 .../java/org/apache/tika/parser/pdf/PDFParser.java |  56 ++++++++++++++++++++-
 .../apache/tika/parser/pdf/PDFParserConfig.java    |  11 ++++
 .../org/apache/tika/parser/pdf/PDFParserTest.java  |  12 +++++
 .../test-documents/testMicrosoftIRMServices.pdf    | Bin 0 -> 290327 bytes
 4 files changed, 78 insertions(+), 1 deletion(-)

diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
index eb2bd4664..c51a5152d 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
@@ -34,6 +34,8 @@ import org.apache.pdfbox.cos.COSArray;
 import org.apache.pdfbox.cos.COSBase;
 import org.apache.pdfbox.cos.COSDictionary;
 import org.apache.pdfbox.cos.COSName;
+import org.apache.pdfbox.cos.COSObject;
+import org.apache.pdfbox.cos.COSString;
 import org.apache.pdfbox.io.MemoryUsageSetting;
 import org.apache.pdfbox.io.RandomAccessBufferedFileInputStream;
 import org.apache.pdfbox.io.RandomAccessRead;
@@ -130,6 +132,10 @@ public class PDFParser extends AbstractParser implements RenderingParser, Initia
      */
     private static final long serialVersionUID = -752276948656079347L;
     private static final Set<MediaType> SUPPORTED_TYPES = Collections.singleton(MEDIA_TYPE);
+
+    private static COSName AF_RELATIONSHIP = COSName.getPDFName("AFRelationship");
+
+    private static COSName ENCRYPTED_PAYLOAD = COSName.getPDFName("EncryptedPayload");
     private PDFParserConfig defaultConfig = new PDFParserConfig();
 
     public Set<MediaType> getSupportedTypes(ParseContext context) {
@@ -188,7 +194,7 @@ public class PDFParser extends AbstractParser implements RenderingParser, Initia
             if (tstream != null) {
                 tstream.setOpenContainer(pdfDocument);
             }
-
+            checkEncryptedPayload(pdfDocument, localConfig);
             boolean hasXFA = hasXFA(pdfDocument, metadata);
             boolean hasMarkedContent = hasMarkedContent(pdfDocument, metadata);
             extractMetadata(pdfDocument, metadata, context);
@@ -238,6 +244,32 @@ public class PDFParser extends AbstractParser implements RenderingParser, Initia
         }
     }
 
+    private void checkEncryptedPayload(PDDocument pdfDocument, PDFParserConfig localConfig)
+            throws IOException, EncryptedDocumentException {
+        if (! localConfig.isThrowOnEncryptedPayload()) {
+            return;
+        }
+        List<COSObject> fileSpecs = pdfDocument.getDocument().getObjectsByType(COSName.FILESPEC);
+        //Do we want to also check that this is a portfolio PDF/contains a "collection"?
+        for (COSObject obj : fileSpecs) {
+            if (obj.getObject() instanceof COSDictionary) {
+                COSBase relationship = obj.getDictionaryObject(AF_RELATIONSHIP);
+                if (relationship != null && relationship.equals(ENCRYPTED_PAYLOAD)) {
+                    String name = "";
+                    COSBase uf = obj.getDictionaryObject(COSName.UF);
+                    COSBase f = obj.getDictionaryObject(COSName.F);
+                    if (uf != null && uf instanceof COSString) {
+                        name = ((COSString)uf).getString();
+                    } else if (f != null && f instanceof COSString) {
+                        name = ((COSString)f).getString();
+                    }
+                    throw new EncryptedDocumentException("PDF file contains an encrypted " +
+                                    "payload: '" + name + "'");
+                }
+            }
+        }
+    }
+
     private void scanXRefOffsets(PDFParserConfig localConfig,
                                  TikaInputStream tikaInputStream,
                                  Metadata metadata,
@@ -986,6 +1018,28 @@ public class PDFParser extends AbstractParser implements RenderingParser, Initia
         return defaultConfig.getMaxIncrementalUpdates();
     }
 
+    /**
+     * If the file contains a file specification whose 'AFRelationship' (associated
+     * file relationship) is 'EncryptedPayload', throw an {@link EncryptedDocumentException}.
+     * <p>
+     * Microsoft IRM v2 wraps the encrypted document inside a container PDF.
+     * See TIKA-4082.
+     * <p>
+     * The goal of this is to make the user experience the same for
+     * traditionally encrypted files and PDFs that are containers
+     * for {@code EncryptedPayload}s.
+     * <p>
+     * The default value is <code>false</code>.
+     *
+     * @param throwOnEncryptedPayload whether to throw when such a payload is found
+     */
+    public void setThrowOnEncryptedPayload(boolean throwOnEncryptedPayload) {
+        defaultConfig.setThrowOnEncryptedPayload(throwOnEncryptedPayload);
+    }
+
+    public boolean isThrowOnEncryptedPayload() {
+        return defaultConfig.isThrowOnEncryptedPayload();
+    }
     /**
      * This is a no-op.  There is no need to initialize multiple fields.
      * The regular field loading should happen without this.
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
index d401b9608..0ee4b274b 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
@@ -148,6 +148,8 @@ public class PDFParserConfig implements Serializable {
 
     int maxIncrementalUpdates = 10;
 
+    private boolean throwOnEncryptedPayload = false;
+
     /**
      * @return whether or not to extract only inline image metadata and not render the images
      */
@@ -924,6 +926,15 @@ public class PDFParserConfig implements Serializable {
         userConfigured.add("maxIncrementalUpdates");
     }
 
+    public void setThrowOnEncryptedPayload(boolean throwOnEncryptedPayload) {
+        this.throwOnEncryptedPayload = throwOnEncryptedPayload;
+        userConfigured.add("throwOnEncryptedPayload");
+    }
+
+    public boolean isThrowOnEncryptedPayload() {
+        return throwOnEncryptedPayload;
+    }
+
     public enum OCR_STRATEGY {
         AUTO, NO_OCR, OCR_ONLY, OCR_AND_TEXT_EXTRACTION;
 
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index ffa05f393..9b499407b 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -21,6 +21,7 @@ import static org.junit.jupiter.api.Assertions.assertEquals;
 import static org.junit.jupiter.api.Assertions.assertFalse;
 import static org.junit.jupiter.api.Assertions.assertNotNull;
 import static org.junit.jupiter.api.Assertions.assertNull;
+import static org.junit.jupiter.api.Assertions.assertThrows;
 import static org.junit.jupiter.api.Assertions.assertTrue;
 import static org.junit.jupiter.api.Assertions.fail;
 
@@ -1402,6 +1403,17 @@ public class PDFParserTest extends TikaTest {
         //components we're looking for.
     }
 
+    @Test
+    public void testThrowOnEncryptedPayload() throws Exception {
+        PDFParserConfig pdfParserConfig = new PDFParserConfig();
+        pdfParserConfig.setThrowOnEncryptedPayload(true);
+        ParseContext parseContext = new ParseContext();
+        parseContext.set(PDFParserConfig.class, pdfParserConfig);
+        assertThrows(EncryptedDocumentException.class, () -> {
+            getRecursiveMetadata("testMicrosoftIRMServices.pdf", parseContext);
+        });
+    }
+
     /**
      * TODO -- need to test signature extraction
      */
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/test-documents/testMicrosoftIRMServices.pdf b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/test-documents/testMicrosoftIRMServices.pdf
new file mode 100644
index 000000000..6d827d0db
Binary files /dev/null and b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/test-documents/testMicrosoftIRMServices.pdf differ