You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2021/03/22 18:17:54 UTC
[tika] 01/02: TIKA-3332 -- recursively process the embedded file
tree in PDFs.
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git
commit 8bf65c01e174e1ba872e813089b076f21ddb4410
Author: tballison <ta...@apache.org>
AuthorDate: Mon Mar 22 14:17:35 2021 -0400
TIKA-3332 -- recursively process the embedded file tree in PDFs.
---
CHANGES.txt | 3 +
.../src/test/java/org/apache/tika/TikaTest.java | 15 +++--
.../apache/tika/parser/pdf/AbstractPDF2XHTML.java | 70 +++++++++++++--------
.../org/apache/tika/parser/pdf/PDFParserTest.java | 11 ++++
.../testPDF_deeplyEmbeddedAttachments.pdf | Bin 0 -> 122221 bytes
5 files changed, 67 insertions(+), 32 deletions(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index 2b9089c..bcc7c5d 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,8 @@
Release 1.26 - 03/09/2021
+ * Extract more embedded files in PDFs by recursively processing the
+ embedded file tree (TIKA-3332).
+
* Allow for case insensitive headers for configuration of the PDFParser
and the TesseractOCRParser in tika-server via Subhajit Das (TIKA-3320).
diff --git a/tika-core/src/test/java/org/apache/tika/TikaTest.java b/tika-core/src/test/java/org/apache/tika/TikaTest.java
index e21f752..2d0083c 100644
--- a/tika-core/src/test/java/org/apache/tika/TikaTest.java
+++ b/tika-core/src/test/java/org/apache/tika/TikaTest.java
@@ -247,19 +247,22 @@ public abstract class TikaTest {
}
protected List<Metadata> getRecursiveMetadata(Path path, ParseContext context, boolean suppressException) throws Exception {
- try (TikaInputStream tis = TikaInputStream.get(path)) {
- return getRecursiveMetadata(tis, AUTO_DETECT_PARSER, context, new Metadata(), suppressException);
+ Metadata metadata = new Metadata();
+ try (TikaInputStream tis = TikaInputStream.get(path, metadata)) {
+ return getRecursiveMetadata(tis, AUTO_DETECT_PARSER, context, metadata, suppressException);
}
}
protected List<Metadata> getRecursiveMetadata(Path path, Parser parser, boolean suppressException) throws Exception {
- try (TikaInputStream tis = TikaInputStream.get(path)) {
- return getRecursiveMetadata(tis, parser, new ParseContext(), new Metadata(), suppressException);
+ Metadata metadata = new Metadata();
+ try (TikaInputStream tis = TikaInputStream.get(path, metadata)) {
+ return getRecursiveMetadata(tis, parser, new ParseContext(), metadata, suppressException);
}
}
protected List<Metadata> getRecursiveMetadata(Path p, boolean suppressException) throws Exception {
- try (TikaInputStream tis = TikaInputStream.get(p)) {
- return getRecursiveMetadata(tis, new ParseContext(), new Metadata(), suppressException);
+ Metadata metadata = new Metadata();
+ try (TikaInputStream tis = TikaInputStream.get(p, metadata)) {
+ return getRecursiveMetadata(tis, new ParseContext(), metadata, suppressException);
}
}
protected List<Metadata> getRecursiveMetadata(Path filePath) throws Exception {
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
index 43526ef..f930c61 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
@@ -34,6 +34,7 @@ import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Collections;
+import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.ListIterator;
@@ -136,10 +137,11 @@ class AbstractPDF2XHTML extends PDFTextStripper {
};
/**
- * Maximum recursive depth during AcroForm processing.
- * Prevents theoretical AcroForm recursion bomb.
+ * Maximum recursive depth to prevent cycles/recursion bombs.
+ * This applies to AcroForm processing and processing
+ * the embedded document tree.
*/
- private final static int MAX_ACROFORM_RECURSIONS = 10;
+ private final static int MAX_RECURSION_DEPTH = 100;
private final static TesseractOCRConfig DEFAULT_TESSERACT_CONFIG = new TesseractOCRConfig();
@@ -287,32 +289,48 @@ class AbstractPDF2XHTML extends PDFTextStripper {
private void extractEmbeddedDocuments(PDDocument document)
throws IOException, SAXException, TikaException {
- PDDocumentNameDictionary namesDictionary =
- new PDDocumentNameDictionary(document.getDocumentCatalog());
- PDEmbeddedFilesNameTreeNode efTree = namesDictionary.getEmbeddedFiles();
- if (efTree == null) {
- return;
+ PDDocumentNameDictionary namesDictionary =
+ new PDDocumentNameDictionary(document.getDocumentCatalog());
+ PDEmbeddedFilesNameTreeNode efTree = namesDictionary.getEmbeddedFiles();
+
+ if (efTree == null) {
+ return;
+ }
+
+ //Set<COSObjectKey> seen = new HashSet<>();
+
+ Map<String, PDComplexFileSpecification> embeddedFileNames = new HashMap<>();
+ int depth = 0;
+ //recursively find embedded files
+ extractFilesfromEFTree(efTree, embeddedFileNames, depth);
+ processEmbeddedDocNames(embeddedFileNames);
+ }
+
+ private void extractFilesfromEFTree(PDNameTreeNode efTree, Map<String,
+ PDComplexFileSpecification> embeddedFileNames, int depth) throws IOException {
+ if (depth > MAX_RECURSION_DEPTH) {
+ throw new IOException("Hit max recursion depth");
+ }
+ Map<String, PDComplexFileSpecification> names = null;
+ try {
+ names = efTree.getNames();
+ } catch (IOException e) {
+ //TODO: log this; on failure this node's names are skipped but its kids are still processed
+ }
+ if (names != null) {
+ for (Map.Entry<String, PDComplexFileSpecification> e : names.entrySet()) {
+ embeddedFileNames.put(e.getKey(), e.getValue());
}
+ }
- Map<String, PDComplexFileSpecification> embeddedFileNames = efTree.getNames();
- //For now, try to get the embeddedFileNames out of embeddedFiles or its kids.
- //This code follows: pdfbox/examples/pdmodel/ExtractEmbeddedFiles.java
- //If there is a need we could add a fully recursive search to find a non-null
- //Map<String, COSObjectable> that contains the doc info.
- if (embeddedFileNames != null) {
- processEmbeddedDocNames(embeddedFileNames);
+ List<PDNameTreeNode<PDComplexFileSpecification>> kids = efTree.getKids();
+ if (kids == null) {
+ return;
} else {
- List<PDNameTreeNode<PDComplexFileSpecification>> kids = efTree.getKids();
- if (kids == null) {
- return;
- }
for (PDNameTreeNode<PDComplexFileSpecification> node : kids) {
- embeddedFileNames = node.getNames();
- if (embeddedFileNames != null) {
- processEmbeddedDocNames(embeddedFileNames);
- }
+ extractFilesfromEFTree(node, embeddedFileNames, depth+1);
}
- }
+ }
}
private void processDoc(String name, PDFileSpecification spec, AttributesImpl attributes) throws TikaException, SAXException, IOException {
@@ -803,8 +821,8 @@ class AbstractPDF2XHTML extends PDFTextStripper {
private void processAcroField(PDField field, final int currentRecursiveDepth)
throws SAXException, IOException, TikaException {
- if (currentRecursiveDepth >= MAX_ACROFORM_RECURSIONS) {
- return;
+ if (currentRecursiveDepth >= MAX_RECURSION_DEPTH) {
+ throw new IOException("Hit max recursion depth.");
}
PDFormFieldAdditionalActions pdFormFieldAdditionalActions = field.getActions();
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index 4ad2b12..c009f06 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -25,8 +25,10 @@ import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;
import static org.junit.Assume.assumeTrue;
+import java.io.File;
import java.io.InputStream;
import java.nio.file.Path;
+import java.nio.file.Paths;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
@@ -1649,4 +1651,13 @@ public class PDFParserTest extends TikaTest {
return true;
}
}
+
+ @Test
+ public void testDeeplyEmbeddedAttachments() throws Exception {
+ //test file comes from pdfcpu issue #201: https://github.com/pdfcpu/pdfcpu/issues/201
+ //in our regression corpus: pdfcpu-201-0.zip-0.pdf
+ List<Metadata> metadataList = getRecursiveMetadata(
+ "testPDF_deeplyEmbeddedAttachments.pdf");
+ assertEquals(21, metadataList.size());
+ }
}
diff --git a/tika-parsers/src/test/resources/test-documents/testPDF_deeplyEmbeddedAttachments.pdf b/tika-parsers/src/test/resources/test-documents/testPDF_deeplyEmbeddedAttachments.pdf
new file mode 100644
index 0000000..7df6d14
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testPDF_deeplyEmbeddedAttachments.pdf differ