You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2021/03/22 18:17:54 UTC
[tika] 01/02: TIKA-3332 -- recursively process the embedded file tree in PDFs.

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 8bf65c01e174e1ba872e813089b076f21ddb4410
Author: tballison <ta...@apache.org>
AuthorDate: Mon Mar 22 14:17:35 2021 -0400

    TIKA-3332 -- recursively process the embedded file tree in PDFs.
---
 CHANGES.txt                                        |   3 +
 .../src/test/java/org/apache/tika/TikaTest.java    |  15 +++--
 .../apache/tika/parser/pdf/AbstractPDF2XHTML.java  |  70 +++++++++++++--------
 .../org/apache/tika/parser/pdf/PDFParserTest.java  |  11 ++++
 .../testPDF_deeplyEmbeddedAttachments.pdf          | Bin 0 -> 122221 bytes
 5 files changed, 67 insertions(+), 32 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index 2b9089c..bcc7c5d 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,8 @@
 Release 1.26 - 03/09/2021
 
+   * Extract more embedded files in PDFs by recursively processing the
+     embedded file tree (TIKA-3332).
+
    * Allow for case insensitive headers for configuration of the PDFParser
      and the TesseractOCRParser in tika-server via Subhajit Das (TIKA-3320).
 
diff --git a/tika-core/src/test/java/org/apache/tika/TikaTest.java b/tika-core/src/test/java/org/apache/tika/TikaTest.java
index e21f752..2d0083c 100644
--- a/tika-core/src/test/java/org/apache/tika/TikaTest.java
+++ b/tika-core/src/test/java/org/apache/tika/TikaTest.java
@@ -247,19 +247,22 @@ public abstract class TikaTest {
     }
 
     protected List<Metadata> getRecursiveMetadata(Path path, ParseContext context, boolean suppressException) throws Exception {
-        try (TikaInputStream tis = TikaInputStream.get(path)) {
-            return getRecursiveMetadata(tis, AUTO_DETECT_PARSER, context, new Metadata(), suppressException);
+        Metadata metadata = new Metadata();
+        try (TikaInputStream tis = TikaInputStream.get(path, metadata)) {
+            return getRecursiveMetadata(tis, AUTO_DETECT_PARSER, context, metadata, suppressException);
         }
     }
     protected List<Metadata> getRecursiveMetadata(Path path, Parser parser, boolean suppressException) throws Exception {
-        try (TikaInputStream tis = TikaInputStream.get(path)) {
-            return getRecursiveMetadata(tis, parser, new ParseContext(), new Metadata(), suppressException);
+        Metadata metadata = new Metadata();
+        try (TikaInputStream tis = TikaInputStream.get(path, metadata)) {
+            return getRecursiveMetadata(tis, parser, new ParseContext(), metadata, suppressException);
         }
     }
 
     protected List<Metadata> getRecursiveMetadata(Path p, boolean suppressException) throws Exception {
-        try (TikaInputStream tis = TikaInputStream.get(p)) {
-            return getRecursiveMetadata(tis, new ParseContext(), new Metadata(), suppressException);
+        Metadata metadata = new Metadata();
+        try (TikaInputStream tis = TikaInputStream.get(p, metadata)) {
+            return getRecursiveMetadata(tis, new ParseContext(), metadata, suppressException);
         }
     }
     protected List<Metadata> getRecursiveMetadata(Path filePath) throws Exception {
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
index 43526ef..f930c61 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
@@ -34,6 +34,7 @@ import java.text.SimpleDateFormat;
 import java.util.ArrayList;
 import java.util.Calendar;
 import java.util.Collections;
+import java.util.HashMap;
 import java.util.HashSet;
 import java.util.List;
 import java.util.ListIterator;
@@ -136,10 +137,11 @@ class AbstractPDF2XHTML extends PDFTextStripper {
     };
 
     /**
-     * Maximum recursive depth during AcroForm processing.
-     * Prevents theoretical AcroForm recursion bomb.
+     * Maximum recursive depth to prevent cycles/recursion bombs.
+     * This applies to AcroForm processing and processing
+     * the embedded document tree.
      */
-    private final static int MAX_ACROFORM_RECURSIONS = 10;
+    private final static int MAX_RECURSION_DEPTH = 100;
 
     private final static TesseractOCRConfig DEFAULT_TESSERACT_CONFIG = new TesseractOCRConfig();
 
@@ -287,32 +289,48 @@ class AbstractPDF2XHTML extends PDFTextStripper {
 
     private void extractEmbeddedDocuments(PDDocument document)
             throws IOException, SAXException, TikaException {
-            PDDocumentNameDictionary namesDictionary =
-                    new PDDocumentNameDictionary(document.getDocumentCatalog());
-            PDEmbeddedFilesNameTreeNode efTree = namesDictionary.getEmbeddedFiles();
-            if (efTree == null) {
-                return;
+        PDDocumentNameDictionary namesDictionary =
+                new PDDocumentNameDictionary(document.getDocumentCatalog());
+        PDEmbeddedFilesNameTreeNode efTree = namesDictionary.getEmbeddedFiles();
+
+        if (efTree == null) {
+            return;
+        }
+
+        //Set<COSObjectKey> seen = new HashSet<>();
+
+        Map<String, PDComplexFileSpecification> embeddedFileNames = new HashMap<>();
+        int depth = 0;
+        //recursively find embedded files
+        extractFilesfromEFTree(efTree, embeddedFileNames, depth);
+        processEmbeddedDocNames(embeddedFileNames);
+    }
+
+    private void extractFilesfromEFTree(PDNameTreeNode efTree, Map<String,
+            PDComplexFileSpecification> embeddedFileNames, int depth) throws IOException {
+        if (depth > MAX_RECURSION_DEPTH) {
+            throw new IOException("Hit max recursion depth");
+        }
+        Map<String, PDComplexFileSpecification> names = null;
+        try {
+            names = efTree.getNames();
+        } catch (IOException e) {
+            //LOG?
+        }
+        if (names != null) {
+            for (Map.Entry<String, PDComplexFileSpecification> e : names.entrySet()) {
+                embeddedFileNames.put(e.getKey(), e.getValue());
             }
+        }
 
-        Map<String, PDComplexFileSpecification> embeddedFileNames = efTree.getNames();
-        //For now, try to get the embeddedFileNames out of embeddedFiles or its kids.
-        //This code follows: pdfbox/examples/pdmodel/ExtractEmbeddedFiles.java
-        //If there is a need we could add a fully recursive search to find a non-null
-        //Map<String, COSObjectable> that contains the doc info.
-        if (embeddedFileNames != null) {
-            processEmbeddedDocNames(embeddedFileNames);
+        List<PDNameTreeNode<PDComplexFileSpecification>> kids = efTree.getKids();
+        if (kids == null) {
+            return;
         } else {
-            List<PDNameTreeNode<PDComplexFileSpecification>> kids = efTree.getKids();
-            if (kids == null) {
-                return;
-            }
             for (PDNameTreeNode<PDComplexFileSpecification> node : kids) {
-                embeddedFileNames = node.getNames();
-                if (embeddedFileNames != null) {
-                    processEmbeddedDocNames(embeddedFileNames);
-                }
+                extractFilesfromEFTree(node, embeddedFileNames, depth+1);
             }
-        }
+       }
     }
 
     private void processDoc(String name, PDFileSpecification spec, AttributesImpl attributes) throws TikaException, SAXException, IOException {
@@ -803,8 +821,8 @@ class AbstractPDF2XHTML extends PDFTextStripper {
     private void processAcroField(PDField field, final int currentRecursiveDepth)
             throws SAXException, IOException, TikaException {
 
-        if (currentRecursiveDepth >= MAX_ACROFORM_RECURSIONS) {
-            return;
+        if (currentRecursiveDepth >= MAX_RECURSION_DEPTH) {
+            throw new IOException("Hit max recursion depth.");
         }
 
         PDFormFieldAdditionalActions pdFormFieldAdditionalActions = field.getActions();
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index 4ad2b12..c009f06 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -25,8 +25,10 @@ import static org.junit.Assert.assertTrue;
 import static org.junit.Assert.fail;
 import static org.junit.Assume.assumeTrue;
 
+import java.io.File;
 import java.io.InputStream;
 import java.nio.file.Path;
+import java.nio.file.Paths;
 import java.util.Arrays;
 import java.util.HashMap;
 import java.util.HashSet;
@@ -1649,4 +1651,13 @@ public class PDFParserTest extends TikaTest {
             return true;
         }
     }
+
+    @Test
+    public void testDeeplyEmbeddedAttachments() throws Exception {
+        //test file comes from pdfcpu issue #201: https://github.com/pdfcpu/pdfcpu/issues/201
+        //in our regression corpus: pdfcpu-201-0.zip-0.pdf
+        List<Metadata> metadataList = getRecursiveMetadata(
+                "testPDF_deeplyEmbeddedAttachments.pdf");
+        assertEquals(21, metadataList.size());
+    }
 }
diff --git a/tika-parsers/src/test/resources/test-documents/testPDF_deeplyEmbeddedAttachments.pdf b/tika-parsers/src/test/resources/test-documents/testPDF_deeplyEmbeddedAttachments.pdf
new file mode 100644
index 0000000..7df6d14
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testPDF_deeplyEmbeddedAttachments.pdf differ