You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2023/06/16 14:35:21 UTC

[tika] branch main updated: TIKA-4082 (#1196)

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new ba7711380 TIKA-4082 (#1196)
ba7711380 is described below

commit ba77113802157ad81c93a231fae562fcadd1140f
Author: Tim Allison <ta...@apache.org>
AuthorDate: Fri Jun 16 10:35:15 2023 -0400

    TIKA-4082 (#1196)
    
    * TIKA-4082 -- allow users to choose to have the PDFParser throw an EncryptedDocumentException for PDFs that have an AssociatedFile relationship of EncryptedPayload; throw EncryptedDocumentException if PDFBox can't find the security handler
---
 CHANGES.txt                                        |   5 +
 .../apache/tika/parser/pdf/AbstractPDF2XHTML.java  |   5 +-
 .../java/org/apache/tika/parser/pdf/PDFParser.java | 113 ++++++++++++++++++---
 .../apache/tika/parser/pdf/PDFParserConfig.java    |  11 ++
 .../org/apache/tika/parser/pdf/PDFParserTest.java  |  21 ++++
 .../test-documents/testMicrosoftIRMServices.pdf    | Bin 0 -> 290327 bytes
 6 files changed, 139 insertions(+), 16 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index 0a94b2b7f..23a77a66d 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,10 @@
 Release 2.8.1 - ???
 
+   * With user configuration, the PDFParser can now throw an EncryptedDocumentException
+     for Microsoft IRM PDF containers with encrypted payloads. Separately,
+     the PDFParser now throws an EncryptedDocumentException instead of an IOException
+     if the security handler cannot be found (TIKA-4082).
+
    * Changed default decompressConcatenated to true in CompressorParser.
      Users may revert to legacy behavior via tika-config.xml (TIKA-4048).
 
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
index 3bd1a90a8..51fd4b63c 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
@@ -152,7 +152,6 @@ class AbstractPDF2XHTML extends PDFTextStripper {
     private static final MediaType XFA_MEDIA_TYPE = MediaType.application("vnd.adobe.xdp+xml");
     private static final MediaType XMP_MEDIA_TYPE = MediaType.application("rdf+xml");
 
-    private static final COSName AF_RELATIONSHIP = COSName.getPDFName("AFRelationship");
     final List<IOException> exceptions = new ArrayList<>();
     final PDDocument pdDocument;
     final XHTMLContentHandler xhtml;
@@ -478,9 +477,9 @@ class AbstractPDF2XHTML extends PDFTextStripper {
         if (!StringUtils.isBlank(spec.getFileDescription())) {
             embeddedMetadata.set(PDF.EMBEDDED_FILE_DESCRIPTION, spec.getFileDescription());
         }
-        String afRelationship = spec.getCOSObject().getNameAsString(AF_RELATIONSHIP);
+        String afRelationship = spec.getCOSObject().getNameAsString(PDFParser.AF_RELATIONSHIP);
         if (StringUtils.isBlank(afRelationship)) {
-            afRelationship = spec.getCOSObject().getString(AF_RELATIONSHIP);
+            afRelationship = spec.getCOSObject().getString(PDFParser.AF_RELATIONSHIP);
         }
         if (!StringUtils.isBlank(afRelationship)) {
             embeddedMetadata.set(PDF.ASSOCIATED_FILE_RELATIONSHIP, afRelationship);
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
index eb2bd4664..c83e33024 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
@@ -34,6 +34,8 @@ import org.apache.pdfbox.cos.COSArray;
 import org.apache.pdfbox.cos.COSBase;
 import org.apache.pdfbox.cos.COSDictionary;
 import org.apache.pdfbox.cos.COSName;
+import org.apache.pdfbox.cos.COSObject;
+import org.apache.pdfbox.cos.COSString;
 import org.apache.pdfbox.io.MemoryUsageSetting;
 import org.apache.pdfbox.io.RandomAccessBufferedFileInputStream;
 import org.apache.pdfbox.io.RandomAccessRead;
@@ -130,6 +132,10 @@ public class PDFParser extends AbstractParser implements RenderingParser, Initia
      */
     private static final long serialVersionUID = -752276948656079347L;
     private static final Set<MediaType> SUPPORTED_TYPES = Collections.singleton(MEDIA_TYPE);
+
+    static final COSName AF_RELATIONSHIP = COSName.getPDFName("AFRelationship");
+    //value of AF_RELATIONSHIP that checkEncryptedPayload() treats as an encrypted payload
+    private static final COSName ENCRYPTED_PAYLOAD = COSName.getPDFName("EncryptedPayload");
     private PDFParserConfig defaultConfig = new PDFParserConfig();
 
     public Set<MediaType> getSupportedTypes(ParseContext context) {
@@ -168,27 +174,27 @@ public class PDFParser extends AbstractParser implements RenderingParser, Initia
             } else {
                 tstream = TikaInputStream.cast(stream);
             }
+
+
             scanXRefOffsets(localConfig, tstream, metadata, context);
 
             password = getPassword(metadata, context);
-            MemoryUsageSetting memoryUsageSetting = MemoryUsageSetting.setupMainMemoryOnly();
+            MemoryUsageSetting memoryUsageSetting = null;
             if (localConfig.getMaxMainMemoryBytes() >= 0) {
                 memoryUsageSetting =
                         MemoryUsageSetting.setupMixed(localConfig.getMaxMainMemoryBytes());
-            }
-            if (tstream != null && tstream.hasFile()) {
-                // File based -- send file directly to PDFBox
-                pdfDocument =
-                        getPDDocument(tstream.getPath(), password,
-                                memoryUsageSetting, metadata, context);
             } else {
-                pdfDocument = getPDDocument(CloseShieldInputStream.wrap(stream), password,
-                        memoryUsageSetting, metadata, context);
-            }
-            if (tstream != null) {
-                tstream.setOpenContainer(pdfDocument);
+                memoryUsageSetting = MemoryUsageSetting.setupMainMemoryOnly();
             }
 
+            pdfDocument = getPDDocument(stream, tstream, password, memoryUsageSetting, metadata,
+                    context);
+
+
+            boolean hasCollection = hasCollection(pdfDocument, metadata);
+
+            checkEncryptedPayload(pdfDocument, hasCollection, localConfig);
+
             boolean hasXFA = hasXFA(pdfDocument, metadata);
             boolean hasMarkedContent = hasMarkedContent(pdfDocument, metadata);
             extractMetadata(pdfDocument, metadata, context);
@@ -238,6 +244,38 @@ public class PDFParser extends AbstractParser implements RenderingParser, Initia
         }
     }
 
+    private void checkEncryptedPayload(PDDocument pdfDocument,
+                                       boolean hasCollection, PDFParserConfig localConfig)
+            throws IOException, EncryptedDocumentException {
+        //Opt-in check: do nothing unless the user enabled it and the PDF is a
+        //collection. We could also require View=H(idden), as the spec suggests
+        //for wrapped encrypted files (7.6.7).
+        if (!localConfig.isThrowOnEncryptedPayload() || !hasCollection) {
+            return;
+        }
+        for (COSObject fileSpec :
+                pdfDocument.getDocument().getObjectsByType(COSName.FILESPEC)) {
+            if (!(fileSpec.getObject() instanceof COSDictionary)) {
+                continue;
+            }
+            //only a file spec whose AFRelationship is EncryptedPayload triggers the exception
+            if (!ENCRYPTED_PAYLOAD.equals(fileSpec.getDictionaryObject(AF_RELATIONSHIP))) {
+                continue;
+            }
+            //report the UF entry as the name if it is a string, then F, else empty
+            COSBase ufEntry = fileSpec.getDictionaryObject(COSName.UF);
+            COSBase fEntry = fileSpec.getDictionaryObject(COSName.F);
+            String payloadName = "";
+            if (ufEntry instanceof COSString) {
+                payloadName = ((COSString) ufEntry).getString();
+            } else if (fEntry instanceof COSString) {
+                payloadName = ((COSString) fEntry).getString();
+            }
+            throw new EncryptedDocumentException(
+                    "PDF file contains an encrypted payload: '" + payloadName + "'");
+        }
+    }
+
     private void scanXRefOffsets(PDFParserConfig localConfig,
                                  TikaInputStream tikaInputStream,
                                  Metadata metadata,
@@ -414,6 +452,33 @@ public class PDFParser extends AbstractParser implements RenderingParser, Initia
                 tstream, metadata, parseContext, PageRangeRequest.RENDER_ALL);
     }
 
+    protected PDDocument getPDDocument(InputStream stream, TikaInputStream tstream, String password,
+                                       MemoryUsageSetting memoryUsageSetting, Metadata metadata,
+                                       ParseContext context)
+            throws IOException, EncryptedDocumentException {
+        //Open from the backing file when tstream has one, else from the close-shielded stream.
+        //An IOException about a missing security handler is rethrown as encrypted-document.
+        try {
+            final PDDocument loaded;
+            if (tstream == null || !tstream.hasFile()) {
+                loaded = getPDDocument(CloseShieldInputStream.wrap(stream), password,
+                        memoryUsageSetting, metadata, context);
+            } else {
+                loaded = getPDDocument(tstream.getPath(), password, memoryUsageSetting,
+                        metadata, context);
+            }
+            if (tstream != null) {
+                tstream.setOpenContainer(loaded);
+            }
+            return loaded;
+        } catch (IOException ioe) {
+            String reason = ioe.getMessage();
+            if (reason == null || !reason.contains("No security handler for filter")) {
+                throw ioe;
+            }
+            throw new EncryptedDocumentException(ioe);
+        }
+    }
 
     protected PDDocument getPDDocument(InputStream inputStream, String password,
                                        MemoryUsageSetting memoryUsageSetting, Metadata metadata,
@@ -509,7 +574,6 @@ public class PDFParser extends AbstractParser implements RenderingParser, Initia
                 Boolean.toString(ap.canModifyAnnotations()));
         metadata.set(AccessPermissions.CAN_PRINT, Boolean.toString(ap.canPrint()));
         metadata.set(AccessPermissions.CAN_PRINT_DEGRADED, Boolean.toString(ap.canPrintDegraded()));
-        hasCollection(document, metadata);
         metadata.set(PDF.IS_ENCRYPTED, Boolean.toString(document.isEncrypted()));
 
         if (document.getDocumentCatalog().getLanguage() != null) {
@@ -986,6 +1050,29 @@ public class PDFParser extends AbstractParser implements RenderingParser, Initia
         return defaultConfig.getMaxIncrementalUpdates();
     }
 
+    /**
+     * Sets, on this parser's default config, whether to throw an
+     * {@link EncryptedDocumentException} when the file is a 'Collection' and
+     * contains an embedded file whose 'AFRelationship' is 'EncryptedPayload'.
+     * Microsoft IRM v2 wraps the encrypted document inside a container PDF.
+     * See TIKA-4082.
+     * <p>
+     * The goal of this is to make the user experience the same for
+     * traditionally encrypted files and PDFs that are containers
+     * for {@code EncryptedPayload}s. The default value is <code>false</code>.
+     *
+     * @param throwOnEncryptedPayload <code>true</code> to throw for such PDFs
+     */
+    public void setThrowOnEncryptedPayload(boolean throwOnEncryptedPayload) {
+        defaultConfig.setThrowOnEncryptedPayload(throwOnEncryptedPayload);
+    }
+
+    /**
+     * @return the default config's current throwOnEncryptedPayload setting
+     */
+    public boolean isThrowOnEncryptedPayload() {
+        return defaultConfig.isThrowOnEncryptedPayload();
+    }
     /**
      * This is a no-op.  There is no need to initialize multiple fields.
      * The regular field loading should happen without this.
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
index d401b9608..0ee4b274b 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
@@ -148,6 +148,8 @@ public class PDFParserConfig implements Serializable {
 
     int maxIncrementalUpdates = 10;
 
+    private boolean throwOnEncryptedPayload = false;
+
     /**
      * @return whether or not to extract only inline image metadata and not render the images
      */
@@ -924,6 +926,15 @@ public class PDFParserConfig implements Serializable {
         userConfigured.add("maxIncrementalUpdates");
     }
 
+    public void setThrowOnEncryptedPayload(boolean throwOnEncryptedPayload) {
+        this.throwOnEncryptedPayload = throwOnEncryptedPayload;
+        userConfigured.add("throwOnEncryptedPayload"); //presumably flags it as user-set; confirm
+    }
+
+    public boolean isThrowOnEncryptedPayload() {
+        return throwOnEncryptedPayload;
+    }
+
     public enum OCR_STRATEGY {
         AUTO, NO_OCR, OCR_ONLY, OCR_AND_TEXT_EXTRACTION;
 
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index ffa05f393..cb37992bc 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -21,6 +21,7 @@ import static org.junit.jupiter.api.Assertions.assertEquals;
 import static org.junit.jupiter.api.Assertions.assertFalse;
 import static org.junit.jupiter.api.Assertions.assertNotNull;
 import static org.junit.jupiter.api.Assertions.assertNull;
+import static org.junit.jupiter.api.Assertions.assertThrows;
 import static org.junit.jupiter.api.Assertions.assertTrue;
 import static org.junit.jupiter.api.Assertions.fail;
 
@@ -1402,6 +1403,26 @@ public class PDFParserTest extends TikaTest {
         //components we're looking for.
     }
 
+    @Test
+    public void testThrowOnEncryptedPayload() throws Exception {
+        //with the opt-in enabled, parsing the IRM container must throw
+        PDFParserConfig config = new PDFParserConfig();
+        config.setThrowOnEncryptedPayload(true);
+        ParseContext context = new ParseContext();
+        context.set(PDFParserConfig.class, config);
+        assertThrows(EncryptedDocumentException.class,
+                () -> getRecursiveMetadata("testMicrosoftIRMServices.pdf", context));
+    }
+
+    @Test
+    public void testAFRelationshipAndException() throws Exception {
+        //without the opt-in the container still parses; the payload is the second entry
+        List<Metadata> parsed = getRecursiveMetadata("testMicrosoftIRMServices.pdf");
+        assertEquals(2, parsed.size());
+        assertEquals("EncryptedPayload", parsed.get(1).get(PDF.ASSOCIATED_FILE_RELATIONSHIP));
+        assertContains("EncryptedDocumentException",
+                parsed.get(1).get(TikaCoreProperties.EMBEDDED_EXCEPTION));
+    }
     /**
      * TODO -- need to test signature extraction
      */
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/test-documents/testMicrosoftIRMServices.pdf b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/test-documents/testMicrosoftIRMServices.pdf
new file mode 100644
index 000000000..6d827d0db
Binary files /dev/null and b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/test-documents/testMicrosoftIRMServices.pdf differ