You are viewing a plain text version of this content. The canonical link for it is here.

Posted to commits@tika.apache.org by ta...@apache.org on 2021/03/22 18:59:27 UTC

[tika] branch main updated (1766166 -> 29ef4b5)

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a change to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git.


    from 1766166  TIKA-3322 -- upgrade PDFBox to 2.0.23
     new 33a4f42  clean up dependencies
     new 29ef4b5  TIKA-3332 -- recursively search embedded file tree for attachments

The 2 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 CHANGES.txt                                        |   3 ++
 tika-parent/pom.xml                                |   4 +-
 .../apache/tika/parser/pdf/AbstractPDF2XHTML.java  |  54 +++++++++++++--------
 .../org/apache/tika/parser/pdf/PDFParserTest.java  |   9 ++++
 .../testPDF_deeplyEmbeddedAttachments.pdf          | Bin 0 -> 122221 bytes
 5 files changed, 50 insertions(+), 20 deletions(-)
 create mode 100644 tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/test/resources/test-documents/testPDF_deeplyEmbeddedAttachments.pdf

[tika] 02/02: TIKA-3332 -- recursively search embedded file tree for attachments

Posted by ta...@apache.org.

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 29ef4b5919375d0915c260b6d632ca706ed7e46d
Author: tallison <ta...@apache.org>
AuthorDate: Mon Mar 22 14:59:17 2021 -0400

    TIKA-3332 -- recursively search embedded file tree for attachments
---
 CHANGES.txt                                        |   3 ++
 .../apache/tika/parser/pdf/AbstractPDF2XHTML.java  |  54 +++++++++++++--------
 .../org/apache/tika/parser/pdf/PDFParserTest.java  |   9 ++++
 .../testPDF_deeplyEmbeddedAttachments.pdf          | Bin 0 -> 122221 bytes
 4 files changed, 47 insertions(+), 19 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index 8b83391..e1038e2 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -50,6 +50,9 @@ Release 1.26 - ??/??/????
      endpoint in tika-server (TIKA-3325); it no longer functions only
      per container or embedded document.
 
+   * Extract more embedded files in PDFs by recursively processing the
+     embedded file tree (TIKA-3332).
+
    * Allow for case insensitive headers for configuration of the PDFParser
      and the TesseractOCRParser in tika-server via Subhajit Das (TIKA-3320).
 
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
index c97aed1..bb874b9 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
@@ -36,6 +36,7 @@ import java.text.SimpleDateFormat;
 import java.util.ArrayList;
 import java.util.Calendar;
 import java.util.Collections;
+import java.util.HashMap;
 import java.util.HashSet;
 import java.util.List;
 import java.util.ListIterator;
@@ -115,10 +116,11 @@ class AbstractPDF2XHTML extends PDFTextStripper {
 
     public static final String XMP_PAGE_LOCATION_PREFIX = "page ";
     /**
-     * Maximum recursive depth during AcroForm processing.
-     * Prevents theoretical AcroForm recursion bomb.
+     * Maximum recursive depth to prevent cycles/recursion bombs.
+     * This applies to AcroForm processing and processing
+     * the embedded document tree.
      */
-    private final static int MAX_ACROFORM_RECURSIONS = 10;
+    private final static int MAX_RECURSION_DEPTH = 100;
     private static final MediaType XFA_MEDIA_TYPE = MediaType.application("vnd.adobe.xdp+xml");
     private static final MediaType XMP_MEDIA_TYPE = MediaType.application("rdf+xml");
     final List<IOException> exceptions = new ArrayList<>();
@@ -297,23 +299,37 @@ class AbstractPDF2XHTML extends PDFTextStripper {
             return;
         }
 
-        Map<String, PDComplexFileSpecification> embeddedFileNames = efTree.getNames();
-        //For now, try to get the embeddedFileNames out of embeddedFiles or its kids.
-        //This code follows: pdfbox/examples/pdmodel/ExtractEmbeddedFiles.java
-        //If there is a need we could add a fully recursive search to find a non-null
-        //Map<String, COSObjectable> that contains the doc info.
-        if (embeddedFileNames != null) {
-            processEmbeddedDocNames(embeddedFileNames);
-        } else {
-            List<PDNameTreeNode<PDComplexFileSpecification>> kids = efTree.getKids();
-            if (kids == null) {
-                return;
+        Map<String, PDComplexFileSpecification> embeddedFileNames = new HashMap<>();
+        int depth = 0;
+        //recursively find embedded files
+        extractFilesfromEFTree(efTree, embeddedFileNames, depth);
+        processEmbeddedDocNames(embeddedFileNames);
+
+    }
+
+    private void extractFilesfromEFTree(PDNameTreeNode efTree, Map<String,
+            PDComplexFileSpecification> embeddedFileNames, int depth) throws IOException {
+        if (depth > MAX_RECURSION_DEPTH) {
+            throw new IOException("Hit max recursion depth");
+        }
+        Map<String, PDComplexFileSpecification> names = null;
+        try {
+            names = efTree.getNames();
+        } catch (IOException e) {
+            //LOG?
+        }
+        if (names != null) {
+            for (Map.Entry<String, PDComplexFileSpecification> e : names.entrySet()) {
+                embeddedFileNames.put(e.getKey(), e.getValue());
             }
+        }
+
+        List<PDNameTreeNode<PDComplexFileSpecification>> kids = efTree.getKids();
+        if (kids == null) {
+            return;
+        } else {
             for (PDNameTreeNode<PDComplexFileSpecification> node : kids) {
-                embeddedFileNames = node.getNames();
-                if (embeddedFileNames != null) {
-                    processEmbeddedDocNames(embeddedFileNames);
-                }
+                extractFilesfromEFTree(node, embeddedFileNames, depth+1);
             }
         }
     }
@@ -843,7 +859,7 @@ class AbstractPDF2XHTML extends PDFTextStripper {
     private void processAcroField(PDField field, final int currentRecursiveDepth)
             throws SAXException, IOException, TikaException {
 
-        if (currentRecursiveDepth >= MAX_ACROFORM_RECURSIONS) {
+        if (currentRecursiveDepth >= MAX_RECURSION_DEPTH) {
             return;
         }
 
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index 251a4b7..b0990a3 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -1334,4 +1334,13 @@ public class PDFParserTest extends TikaTest {
             return true;
         }
     }
+
+    @Test
+    public void testDeeplyEmbeddedAttachments() throws Exception {
+        //test file comes from pdfcpu issue #120: https://github.com/pdfcpu/pdfcpu/issues/201
+        //in our regression corpus: pdfcpu-201-0.zip-0.pdf");
+        List<Metadata> metadataList = getRecursiveMetadata(
+                "testPDF_deeplyEmbeddedAttachments.pdf");
+        assertEquals(21, metadataList.size());
+    }
 }
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/test/resources/test-documents/testPDF_deeplyEmbeddedAttachments.pdf b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/test/resources/test-documents/testPDF_deeplyEmbeddedAttachments.pdf
new file mode 100644
index 0000000..7df6d14
Binary files /dev/null and b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/test/resources/test-documents/testPDF_deeplyEmbeddedAttachments.pdf differ

[tika] 01/02: clean up dependencies

Posted by ta...@apache.org.

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 33a4f4214e4d033ef78ed238c4ca7a6ab4dbcdff
Author: tallison <ta...@apache.org>
AuthorDate: Mon Mar 22 14:52:32 2021 -0400

    clean up dependencies
---
 tika-parent/pom.xml | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tika-parent/pom.xml b/tika-parent/pom.xml
index ee5de60..474a277 100644
--- a/tika-parent/pom.xml
+++ b/tika-parent/pom.xml
@@ -267,6 +267,7 @@
     <maven.antrun.version>1.8</maven.antrun.version>
     <maven.assembly.version>3.3.0</maven.assembly.version>
     <maven.bundle.version>5.1.1</maven.bundle.version>
+    <maven.compiler.plugin.version>3.8.1</maven.compiler.plugin.version>
     <maven.failsafe.version>2.22.2</maven.failsafe.version>
     <maven.javadoc.version>3.1.1</maven.javadoc.version>
     <maven.scr.version>1.26.4</maven.scr.version>
@@ -423,8 +424,9 @@
   <build>
     <plugins>
       <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
         <artifactId>maven-compiler-plugin</artifactId>
-        <version>3.8.0</version>
+        <version>${maven.compiler.plugin.version}</version>
         <configuration>
           <source>${maven.compiler.source}</source>
           <target>${maven.compiler.target}</target>