You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2021/03/22 18:59:29 UTC
[tika] 02/02: TIKA-3332 -- recursively search embedded file tree for attachments
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
commit 29ef4b5919375d0915c260b6d632ca706ed7e46d
Author: tallison <ta...@apache.org>
AuthorDate: Mon Mar 22 14:59:17 2021 -0400
TIKA-3332 -- recursively search embedded file tree for attachments
---
CHANGES.txt | 3 ++
.../apache/tika/parser/pdf/AbstractPDF2XHTML.java | 54 +++++++++++++--------
.../org/apache/tika/parser/pdf/PDFParserTest.java | 9 ++++
.../testPDF_deeplyEmbeddedAttachments.pdf | Bin 0 -> 122221 bytes
4 files changed, 47 insertions(+), 19 deletions(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index 8b83391..e1038e2 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -50,6 +50,9 @@ Release 1.26 - ??/??/????
endpoint in tika-server (TIKA-3325); it no longer functions only
per container or embedded document.
+ * Extract more embedded files in PDFs by recursively processing the
+ embedded file tree (TIKA-3332).
+
* Allow for case insensitive headers for configuration of the PDFParser
and the TesseractOCRParser in tika-server via Subhajit Das (TIKA-3320).
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
index c97aed1..bb874b9 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
@@ -36,6 +36,7 @@ import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Collections;
+import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.ListIterator;
@@ -115,10 +116,11 @@ class AbstractPDF2XHTML extends PDFTextStripper {
public static final String XMP_PAGE_LOCATION_PREFIX = "page ";
/**
- * Maximum recursive depth during AcroForm processing.
- * Prevents theoretical AcroForm recursion bomb.
+ * Maximum recursive depth to prevent cycles/recursion bombs.
+ * This applies to AcroForm processing and processing
+ * the embedded document tree.
*/
- private final static int MAX_ACROFORM_RECURSIONS = 10;
+ private final static int MAX_RECURSION_DEPTH = 100;
private static final MediaType XFA_MEDIA_TYPE = MediaType.application("vnd.adobe.xdp+xml");
private static final MediaType XMP_MEDIA_TYPE = MediaType.application("rdf+xml");
final List<IOException> exceptions = new ArrayList<>();
@@ -297,23 +299,37 @@ class AbstractPDF2XHTML extends PDFTextStripper {
return;
}
- Map<String, PDComplexFileSpecification> embeddedFileNames = efTree.getNames();
- //For now, try to get the embeddedFileNames out of embeddedFiles or its kids.
- //This code follows: pdfbox/examples/pdmodel/ExtractEmbeddedFiles.java
- //If there is a need we could add a fully recursive search to find a non-null
- //Map<String, COSObjectable> that contains the doc info.
- if (embeddedFileNames != null) {
- processEmbeddedDocNames(embeddedFileNames);
- } else {
- List<PDNameTreeNode<PDComplexFileSpecification>> kids = efTree.getKids();
- if (kids == null) {
- return;
+ Map<String, PDComplexFileSpecification> embeddedFileNames = new HashMap<>();
+ int depth = 0;
+ //recursively find embedded files
+ extractFilesfromEFTree(efTree, embeddedFileNames, depth);
+ processEmbeddedDocNames(embeddedFileNames);
+
+ }
+
+ private void extractFilesfromEFTree(PDNameTreeNode efTree, Map<String,
+ PDComplexFileSpecification> embeddedFileNames, int depth) throws IOException {
+ if (depth > MAX_RECURSION_DEPTH) {
+ throw new IOException("Hit max recursion depth");
+ }
+ Map<String, PDComplexFileSpecification> names = null;
+ try {
+ names = efTree.getNames();
+ } catch (IOException e) {
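+ //TODO: log this. Swallowed on purpose (best effort): if this node's
+ //names can't be read, skip them but still search its kids below.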
+ }
+ if (names != null) {
+ for (Map.Entry<String, PDComplexFileSpecification> e : names.entrySet()) {
+ embeddedFileNames.put(e.getKey(), e.getValue());
}
+ }
+
+ List<PDNameTreeNode<PDComplexFileSpecification>> kids = efTree.getKids();
+ if (kids == null) {
+ return;
+ } else {
for (PDNameTreeNode<PDComplexFileSpecification> node : kids) {
- embeddedFileNames = node.getNames();
- if (embeddedFileNames != null) {
- processEmbeddedDocNames(embeddedFileNames);
- }
+ extractFilesfromEFTree(node, embeddedFileNames, depth+1);
}
}
}
@@ -843,7 +859,7 @@ class AbstractPDF2XHTML extends PDFTextStripper {
private void processAcroField(PDField field, final int currentRecursiveDepth)
throws SAXException, IOException, TikaException {
- if (currentRecursiveDepth >= MAX_ACROFORM_RECURSIONS) {
+ if (currentRecursiveDepth >= MAX_RECURSION_DEPTH) {
return;
}
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index 251a4b7..b0990a3 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -1334,4 +1334,13 @@ public class PDFParserTest extends TikaTest {
return true;
}
}
+
+ @Test
+ public void testDeeplyEmbeddedAttachments() throws Exception {
+ //test file comes from pdfcpu issue #201: https://github.com/pdfcpu/pdfcpu/issues/201
+ //in our regression corpus: pdfcpu-201-0.zip-0.pdf
+ List<Metadata> metadataList = getRecursiveMetadata(
+ "testPDF_deeplyEmbeddedAttachments.pdf");
+ assertEquals(21, metadataList.size());
+ }
}
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/test/resources/test-documents/testPDF_deeplyEmbeddedAttachments.pdf b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/test/resources/test-documents/testPDF_deeplyEmbeddedAttachments.pdf
new file mode 100644
index 0000000..7df6d14
Binary files /dev/null and b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/test/resources/test-documents/testPDF_deeplyEmbeddedAttachments.pdf differ