You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2023/06/16 14:35:21 UTC
[tika] branch main updated: TIKA-4082 (#1196)
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new ba7711380 TIKA-4082 (#1196)
ba7711380 is described below
commit ba77113802157ad81c93a231fae562fcadd1140f
Author: Tim Allison <ta...@apache.org>
AuthorDate: Fri Jun 16 10:35:15 2023 -0400
TIKA-4082 (#1196)
* TIKA-4082 -- allow users to choose to have the PDFParser throw an EncryptedDocumentException for PDFs that have an AssociatedFile relationship of EncryptedPayload; throw EncryptedDocumentException if PDFBox can't find the security handler
---
CHANGES.txt | 5 +
.../apache/tika/parser/pdf/AbstractPDF2XHTML.java | 5 +-
.../java/org/apache/tika/parser/pdf/PDFParser.java | 113 ++++++++++++++++++---
.../apache/tika/parser/pdf/PDFParserConfig.java | 11 ++
.../org/apache/tika/parser/pdf/PDFParserTest.java | 21 ++++
.../test-documents/testMicrosoftIRMServices.pdf | Bin 0 -> 290327 bytes
6 files changed, 139 insertions(+), 16 deletions(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index 0a94b2b7f..23a77a66d 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,10 @@
Release 2.8.1 - ???
+ * With user configuration, the PDFParser can now throw an EncryptedDocumentException
+ for Microsoft IRM PDF containers with encrypted payloads. Separately,
+ the PDFParser now throws an EncryptedDocumentException instead of an IOException
+ if the security handler cannot be found (TIKA-4082).
+
* Changed default decompressConcatenated to true in CompressorParser.
Users may revert to legacy behavior via tika-config.xml (TIKA-4048).
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
index 3bd1a90a8..51fd4b63c 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
@@ -152,7 +152,6 @@ class AbstractPDF2XHTML extends PDFTextStripper {
private static final MediaType XFA_MEDIA_TYPE = MediaType.application("vnd.adobe.xdp+xml");
private static final MediaType XMP_MEDIA_TYPE = MediaType.application("rdf+xml");
- private static final COSName AF_RELATIONSHIP = COSName.getPDFName("AFRelationship");
final List<IOException> exceptions = new ArrayList<>();
final PDDocument pdDocument;
final XHTMLContentHandler xhtml;
@@ -478,9 +477,9 @@ class AbstractPDF2XHTML extends PDFTextStripper {
if (!StringUtils.isBlank(spec.getFileDescription())) {
embeddedMetadata.set(PDF.EMBEDDED_FILE_DESCRIPTION, spec.getFileDescription());
}
- String afRelationship = spec.getCOSObject().getNameAsString(AF_RELATIONSHIP);
+ String afRelationship = spec.getCOSObject().getNameAsString(PDFParser.AF_RELATIONSHIP);
if (StringUtils.isBlank(afRelationship)) {
- afRelationship = spec.getCOSObject().getString(AF_RELATIONSHIP);
+ afRelationship = spec.getCOSObject().getString(PDFParser.AF_RELATIONSHIP);
}
if (!StringUtils.isBlank(afRelationship)) {
embeddedMetadata.set(PDF.ASSOCIATED_FILE_RELATIONSHIP, afRelationship);
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
index eb2bd4664..c83e33024 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
@@ -34,6 +34,8 @@ import org.apache.pdfbox.cos.COSArray;
import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.cos.COSDictionary;
import org.apache.pdfbox.cos.COSName;
+import org.apache.pdfbox.cos.COSObject;
+import org.apache.pdfbox.cos.COSString;
import org.apache.pdfbox.io.MemoryUsageSetting;
import org.apache.pdfbox.io.RandomAccessBufferedFileInputStream;
import org.apache.pdfbox.io.RandomAccessRead;
@@ -130,6 +132,10 @@ public class PDFParser extends AbstractParser implements RenderingParser, Initia
*/
private static final long serialVersionUID = -752276948656079347L;
private static final Set<MediaType> SUPPORTED_TYPES = Collections.singleton(MEDIA_TYPE);
+
+ static COSName AF_RELATIONSHIP = COSName.getPDFName("AFRelationship");
+
+ private static COSName ENCRYPTED_PAYLOAD = COSName.getPDFName("EncryptedPayload");
private PDFParserConfig defaultConfig = new PDFParserConfig();
public Set<MediaType> getSupportedTypes(ParseContext context) {
@@ -168,27 +174,27 @@ public class PDFParser extends AbstractParser implements RenderingParser, Initia
} else {
tstream = TikaInputStream.cast(stream);
}
+
+
scanXRefOffsets(localConfig, tstream, metadata, context);
password = getPassword(metadata, context);
- MemoryUsageSetting memoryUsageSetting = MemoryUsageSetting.setupMainMemoryOnly();
+ MemoryUsageSetting memoryUsageSetting = null;
if (localConfig.getMaxMainMemoryBytes() >= 0) {
memoryUsageSetting =
MemoryUsageSetting.setupMixed(localConfig.getMaxMainMemoryBytes());
- }
- if (tstream != null && tstream.hasFile()) {
- // File based -- send file directly to PDFBox
- pdfDocument =
- getPDDocument(tstream.getPath(), password,
- memoryUsageSetting, metadata, context);
} else {
- pdfDocument = getPDDocument(CloseShieldInputStream.wrap(stream), password,
- memoryUsageSetting, metadata, context);
- }
- if (tstream != null) {
- tstream.setOpenContainer(pdfDocument);
+ memoryUsageSetting = MemoryUsageSetting.setupMainMemoryOnly();
}
+ pdfDocument = getPDDocument(stream, tstream, password, memoryUsageSetting, metadata,
+ context);
+
+
+ boolean hasCollection = hasCollection(pdfDocument, metadata);
+
+ checkEncryptedPayload(pdfDocument, hasCollection, localConfig);
+
boolean hasXFA = hasXFA(pdfDocument, metadata);
boolean hasMarkedContent = hasMarkedContent(pdfDocument, metadata);
extractMetadata(pdfDocument, metadata, context);
@@ -238,6 +244,38 @@ public class PDFParser extends AbstractParser implements RenderingParser, Initia
}
}
+ private void checkEncryptedPayload(PDDocument pdfDocument,
+ boolean hasCollection, PDFParserConfig localConfig)
+ throws IOException, EncryptedDocumentException {
+ if (! localConfig.isThrowOnEncryptedPayload()) {
+ return;
+ }
+ //We require a collection. We could also require that it have View=H(idden)
+ //as the spec suggests for Wrapped encrypted files (7.6.7).
+ if (! hasCollection) {
+ return;
+ }
+ List<COSObject> fileSpecs = pdfDocument.getDocument().getObjectsByType(COSName.FILESPEC);
+ //The collection requirement is already enforced above; scan every file specification.
+ for (COSObject obj : fileSpecs) {
+ if (obj.getObject() instanceof COSDictionary) {
+ COSBase relationship = obj.getDictionaryObject(AF_RELATIONSHIP);
+ if (relationship != null && relationship.equals(ENCRYPTED_PAYLOAD)) {
+ String name = "";
+ COSBase uf = obj.getDictionaryObject(COSName.UF);
+ COSBase f = obj.getDictionaryObject(COSName.F);
+ if (uf != null && uf instanceof COSString) {
+ name = ((COSString)uf).getString();
+ } else if (f != null && f instanceof COSString) {
+ name = ((COSString)f).getString();
+ }
+ throw new EncryptedDocumentException("PDF file contains an encrypted " +
+ "payload: '" + name + "'");
+ }
+ }
+ }
+ }
+
private void scanXRefOffsets(PDFParserConfig localConfig,
TikaInputStream tikaInputStream,
Metadata metadata,
@@ -414,6 +452,33 @@ public class PDFParser extends AbstractParser implements RenderingParser, Initia
tstream, metadata, parseContext, PageRangeRequest.RENDER_ALL);
}
+ protected PDDocument getPDDocument(InputStream stream, TikaInputStream tstream, String password,
+ MemoryUsageSetting memoryUsageSetting, Metadata metadata,
+ ParseContext context)
+ throws IOException, EncryptedDocumentException {
+ try {
+ PDDocument pdDocument = null;
+ if (tstream != null && tstream.hasFile()) {
+ // File based -- send file directly to PDFBox
+ pdDocument =
+ getPDDocument(tstream.getPath(), password, memoryUsageSetting, metadata,
+ context);
+ } else {
+ pdDocument = getPDDocument(CloseShieldInputStream.wrap(stream), password,
+ memoryUsageSetting, metadata, context);
+ }
+ if (tstream != null) {
+ tstream.setOpenContainer(pdDocument);
+ }
+ return pdDocument;
+ } catch (IOException e) {
+ if (e.getMessage() != null &&
+ e.getMessage().contains("No security handler for filter")) {
+ throw new EncryptedDocumentException(e);
+ }
+ throw e;
+ }
+ }
protected PDDocument getPDDocument(InputStream inputStream, String password,
MemoryUsageSetting memoryUsageSetting, Metadata metadata,
@@ -509,7 +574,6 @@ public class PDFParser extends AbstractParser implements RenderingParser, Initia
Boolean.toString(ap.canModifyAnnotations()));
metadata.set(AccessPermissions.CAN_PRINT, Boolean.toString(ap.canPrint()));
metadata.set(AccessPermissions.CAN_PRINT_DEGRADED, Boolean.toString(ap.canPrintDegraded()));
- hasCollection(document, metadata);
metadata.set(PDF.IS_ENCRYPTED, Boolean.toString(document.isEncrypted()));
if (document.getDocumentCatalog().getLanguage() != null) {
@@ -986,6 +1050,29 @@ public class PDFParser extends AbstractParser implements RenderingParser, Initia
return defaultConfig.getMaxIncrementalUpdates();
}
+ /**
+ * If set to <code>true</code> and the file is a 'Collection' that contains a
+ * file specification whose 'AFRelationship' value is 'EncryptedPayload', then throw an
+ * {@link EncryptedDocumentException}.
+ * <p>
+ * Microsoft IRM v2 wraps the encrypted document inside a container PDF.
+ * See TIKA-4082.
+ * <p>
+ * The goal of this is to make the user experience the same for
+ * traditionally encrypted files and PDFs that are containers
+ * for `EncryptedPayload`s.
+ * <p>
+ * The default value is <code>false</code>.
+ *
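+ * @param throwOnEncryptedPayload whether to throw an {@link EncryptedDocumentException} for PDFs with an 'EncryptedPayload' file specification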
+ */
+ public void setThrowOnEncryptedPayload(boolean throwOnEncryptedPayload) {
+ defaultConfig.setThrowOnEncryptedPayload(throwOnEncryptedPayload);
+ }
+
+ public boolean isThrowOnEncryptedPayload() {
+ return defaultConfig.isThrowOnEncryptedPayload();
+ }
/**
* This is a no-op. There is no need to initialize multiple fields.
* The regular field loading should happen without this.
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
index d401b9608..0ee4b274b 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
@@ -148,6 +148,8 @@ public class PDFParserConfig implements Serializable {
int maxIncrementalUpdates = 10;
+ private boolean throwOnEncryptedPayload = false;
+
/**
* @return whether or not to extract only inline image metadata and not render the images
*/
@@ -924,6 +926,15 @@ public class PDFParserConfig implements Serializable {
userConfigured.add("maxIncrementalUpdates");
}
+ public void setThrowOnEncryptedPayload(boolean throwOnEncryptedPayload) {
+ this.throwOnEncryptedPayload = throwOnEncryptedPayload;
+ userConfigured.add("throwOnEncryptedPayload");
+ }
+
+ public boolean isThrowOnEncryptedPayload() {
+ return throwOnEncryptedPayload;
+ }
+
public enum OCR_STRATEGY {
AUTO, NO_OCR, OCR_ONLY, OCR_AND_TEXT_EXTRACTION;
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index ffa05f393..cb37992bc 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -21,6 +21,7 @@ import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertNull;
+import static org.junit.jupiter.api.Assertions.assertThrows;
import static org.junit.jupiter.api.Assertions.assertTrue;
import static org.junit.jupiter.api.Assertions.fail;
@@ -1402,6 +1403,26 @@ public class PDFParserTest extends TikaTest {
//components we're looking for.
}
+ @Test
+ public void testThrowOnEncryptedPayload() throws Exception {
+ PDFParserConfig pdfParserConfig = new PDFParserConfig();
+ pdfParserConfig.setThrowOnEncryptedPayload(true);
+ ParseContext parseContext = new ParseContext();
+ parseContext.set(PDFParserConfig.class, pdfParserConfig);
+ assertThrows(EncryptedDocumentException.class, () -> {
+ getRecursiveMetadata("testMicrosoftIRMServices.pdf", parseContext);
+ });
+ }
+
+ @Test
+ public void testAFRelationshipAndException() throws Exception {
+ List<Metadata> metadataList = getRecursiveMetadata("testMicrosoftIRMServices.pdf");
+ assertEquals(2, metadataList.size());
+ assertEquals("EncryptedPayload", metadataList.get(1).get(PDF.ASSOCIATED_FILE_RELATIONSHIP));
+ assertContains("EncryptedDocumentException",
+ metadataList.get(1).get(TikaCoreProperties.EMBEDDED_EXCEPTION));
+
+ }
/**
* TODO -- need to test signature extraction
*/
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/test-documents/testMicrosoftIRMServices.pdf b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/test-documents/testMicrosoftIRMServices.pdf
new file mode 100644
index 000000000..6d827d0db
Binary files /dev/null and b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/test-documents/testMicrosoftIRMServices.pdf differ