You are viewing a plain text version of this content. The canonical link for it is here.

Posted to commits@tika.apache.org by ta...@apache.org on 2021/03/22 18:17:53 UTC

[tika] branch branch_1x updated (6a27f3e -> b63072c)

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a change to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git.


    from 6a27f3e  TIKA-3244: update spring
     new 8bf65c0  TIKA-3332 -- recursively process the embedded file tree in PDFs.
     new b63072c  Merge remote-tracking branch 'origin/branch_1x' into branch_1x

The 2 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 CHANGES.txt                                        |   3 +
 .../src/test/java/org/apache/tika/TikaTest.java    |  15 +++--
 .../apache/tika/parser/pdf/AbstractPDF2XHTML.java  |  70 +++++++++++++--------
 .../org/apache/tika/parser/pdf/PDFParserTest.java  |  11 ++++
 .../testPDF_deeplyEmbeddedAttachments.pdf          | Bin 0 -> 122221 bytes
 5 files changed, 67 insertions(+), 32 deletions(-)
 create mode 100644 tika-parsers/src/test/resources/test-documents/testPDF_deeplyEmbeddedAttachments.pdf

[tika] 01/02: TIKA-3332 -- recursively process the embedded file tree in PDFs.

Posted by ta...@apache.org.

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 8bf65c01e174e1ba872e813089b076f21ddb4410
Author: tballison <ta...@apache.org>
AuthorDate: Mon Mar 22 14:17:35 2021 -0400

    TIKA-3332 -- recursively process the embedded file tree in PDFs.
---
 CHANGES.txt                                        |   3 +
 .../src/test/java/org/apache/tika/TikaTest.java    |  15 +++--
 .../apache/tika/parser/pdf/AbstractPDF2XHTML.java  |  70 +++++++++++++--------
 .../org/apache/tika/parser/pdf/PDFParserTest.java  |  11 ++++
 .../testPDF_deeplyEmbeddedAttachments.pdf          | Bin 0 -> 122221 bytes
 5 files changed, 67 insertions(+), 32 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index 2b9089c..bcc7c5d 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,8 @@
 Release 1.26 - 03/09/2021
 
+   * Extract more embedded files in PDFs by recursively processing the
+     embedded file tree (TIKA-3332).
+
    * Allow for case insensitive headers for configuration of the PDFParser
      and the TesseractOCRParser in tika-server via Subhajit Das (TIKA-3320).
 
diff --git a/tika-core/src/test/java/org/apache/tika/TikaTest.java b/tika-core/src/test/java/org/apache/tika/TikaTest.java
index e21f752..2d0083c 100644
--- a/tika-core/src/test/java/org/apache/tika/TikaTest.java
+++ b/tika-core/src/test/java/org/apache/tika/TikaTest.java
@@ -247,19 +247,22 @@ public abstract class TikaTest {
     }
 
     protected List<Metadata> getRecursiveMetadata(Path path, ParseContext context, boolean suppressException) throws Exception {
-        try (TikaInputStream tis = TikaInputStream.get(path)) {
-            return getRecursiveMetadata(tis, AUTO_DETECT_PARSER, context, new Metadata(), suppressException);
+        Metadata metadata = new Metadata();
+        try (TikaInputStream tis = TikaInputStream.get(path, metadata)) {
+            return getRecursiveMetadata(tis, AUTO_DETECT_PARSER, context, metadata, suppressException);
         }
     }
     protected List<Metadata> getRecursiveMetadata(Path path, Parser parser, boolean suppressException) throws Exception {
-        try (TikaInputStream tis = TikaInputStream.get(path)) {
-            return getRecursiveMetadata(tis, parser, new ParseContext(), new Metadata(), suppressException);
+        Metadata metadata = new Metadata();
+        try (TikaInputStream tis = TikaInputStream.get(path, metadata)) {
+            return getRecursiveMetadata(tis, parser, new ParseContext(), metadata, suppressException);
         }
     }
 
     protected List<Metadata> getRecursiveMetadata(Path p, boolean suppressException) throws Exception {
-        try (TikaInputStream tis = TikaInputStream.get(p)) {
-            return getRecursiveMetadata(tis, new ParseContext(), new Metadata(), suppressException);
+        Metadata metadata = new Metadata();
+        try (TikaInputStream tis = TikaInputStream.get(p, metadata)) {
+            return getRecursiveMetadata(tis, new ParseContext(), metadata, suppressException);
         }
     }
     protected List<Metadata> getRecursiveMetadata(Path filePath) throws Exception {
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
index 43526ef..f930c61 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
@@ -34,6 +34,7 @@ import java.text.SimpleDateFormat;
 import java.util.ArrayList;
 import java.util.Calendar;
 import java.util.Collections;
+import java.util.HashMap;
 import java.util.HashSet;
 import java.util.List;
 import java.util.ListIterator;
@@ -136,10 +137,11 @@ class AbstractPDF2XHTML extends PDFTextStripper {
     };
 
     /**
-     * Maximum recursive depth during AcroForm processing.
-     * Prevents theoretical AcroForm recursion bomb.
+     * Maximum recursive depth to prevent cycles/recursion bombs.
+     * This applies to AcroForm processing and processing
+     * the embedded document tree.
      */
-    private final static int MAX_ACROFORM_RECURSIONS = 10;
+    private final static int MAX_RECURSION_DEPTH = 100;
 
     private final static TesseractOCRConfig DEFAULT_TESSERACT_CONFIG = new TesseractOCRConfig();
 
@@ -287,32 +289,48 @@ class AbstractPDF2XHTML extends PDFTextStripper {
 
     private void extractEmbeddedDocuments(PDDocument document)
             throws IOException, SAXException, TikaException {
-            PDDocumentNameDictionary namesDictionary =
-                    new PDDocumentNameDictionary(document.getDocumentCatalog());
-            PDEmbeddedFilesNameTreeNode efTree = namesDictionary.getEmbeddedFiles();
-            if (efTree == null) {
-                return;
+        PDDocumentNameDictionary namesDictionary =
+                new PDDocumentNameDictionary(document.getDocumentCatalog());
+        PDEmbeddedFilesNameTreeNode efTree = namesDictionary.getEmbeddedFiles();
+
+        if (efTree == null) {
+            return;
+        }
+
+        //Set<COSObjectKey> seen = new HashSet<>();
+
+        Map<String, PDComplexFileSpecification> embeddedFileNames = new HashMap<>();
+        int depth = 0;
+        //recursively find embedded files
+        extractFilesfromEFTree(efTree, embeddedFileNames, depth);
+        processEmbeddedDocNames(embeddedFileNames);
+    }
+
+    private void extractFilesfromEFTree(PDNameTreeNode efTree, Map<String,
+            PDComplexFileSpecification> embeddedFileNames, int depth) throws IOException {
+        if (depth > MAX_RECURSION_DEPTH) {
+            throw new IOException("Hit max recursion depth");
+        }
+        Map<String, PDComplexFileSpecification> names = null;
+        try {
+            names = efTree.getNames();
+        } catch (IOException e) {
+            //LOG?
+        }
+        if (names != null) {
+            for (Map.Entry<String, PDComplexFileSpecification> e : names.entrySet()) {
+                embeddedFileNames.put(e.getKey(), e.getValue());
             }
+        }
 
-        Map<String, PDComplexFileSpecification> embeddedFileNames = efTree.getNames();
-        //For now, try to get the embeddedFileNames out of embeddedFiles or its kids.
-        //This code follows: pdfbox/examples/pdmodel/ExtractEmbeddedFiles.java
-        //If there is a need we could add a fully recursive search to find a non-null
-        //Map<String, COSObjectable> that contains the doc info.
-        if (embeddedFileNames != null) {
-            processEmbeddedDocNames(embeddedFileNames);
+        List<PDNameTreeNode<PDComplexFileSpecification>> kids = efTree.getKids();
+        if (kids == null) {
+            return;
         } else {
-            List<PDNameTreeNode<PDComplexFileSpecification>> kids = efTree.getKids();
-            if (kids == null) {
-                return;
-            }
             for (PDNameTreeNode<PDComplexFileSpecification> node : kids) {
-                embeddedFileNames = node.getNames();
-                if (embeddedFileNames != null) {
-                    processEmbeddedDocNames(embeddedFileNames);
-                }
+                extractFilesfromEFTree(node, embeddedFileNames, depth+1);
             }
-        }
+       }
     }
 
     private void processDoc(String name, PDFileSpecification spec, AttributesImpl attributes) throws TikaException, SAXException, IOException {
@@ -803,8 +821,8 @@ class AbstractPDF2XHTML extends PDFTextStripper {
     private void processAcroField(PDField field, final int currentRecursiveDepth)
             throws SAXException, IOException, TikaException {
 
-        if (currentRecursiveDepth >= MAX_ACROFORM_RECURSIONS) {
-            return;
+        if (currentRecursiveDepth >= MAX_RECURSION_DEPTH) {
+            throw new IOException("Hit max recursion depth.");
         }
 
         PDFormFieldAdditionalActions pdFormFieldAdditionalActions = field.getActions();
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index 4ad2b12..c009f06 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -25,8 +25,10 @@ import static org.junit.Assert.assertTrue;
 import static org.junit.Assert.fail;
 import static org.junit.Assume.assumeTrue;
 
+import java.io.File;
 import java.io.InputStream;
 import java.nio.file.Path;
+import java.nio.file.Paths;
 import java.util.Arrays;
 import java.util.HashMap;
 import java.util.HashSet;
@@ -1649,4 +1651,13 @@ public class PDFParserTest extends TikaTest {
             return true;
         }
     }
+
+    @Test
+    public void testDeeplyEmbeddedAttachments() throws Exception {
+        //test file comes from pdfcpu issue #120: https://github.com/pdfcpu/pdfcpu/issues/201
+        //in our regression corpus: pdfcpu-201-0.zip-0.pdf");
+        List<Metadata> metadataList = getRecursiveMetadata(
+                "testPDF_deeplyEmbeddedAttachments.pdf");
+        assertEquals(21, metadataList.size());
+    }
 }
diff --git a/tika-parsers/src/test/resources/test-documents/testPDF_deeplyEmbeddedAttachments.pdf b/tika-parsers/src/test/resources/test-documents/testPDF_deeplyEmbeddedAttachments.pdf
new file mode 100644
index 0000000..7df6d14
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testPDF_deeplyEmbeddedAttachments.pdf differ

[tika] 02/02: Merge remote-tracking branch 'origin/branch_1x' into branch_1x

Posted by ta...@apache.org.

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git

commit b63072c540e19a68fd2c01823c48bce225e3e619
Merge: 8bf65c0 6a27f3e
Author: tballison <ta...@apache.org>
AuthorDate: Mon Mar 22 14:17:41 2021 -0400

    Merge remote-tracking branch 'origin/branch_1x' into branch_1x

 CHANGES.txt                                        |   7 +-
 .../src/main/java/org/apache/tika/gui/TikaGUI.java |   3 -
 .../apache/tika/parser/RecursiveParserWrapper.java |  79 ++++++++++++++++----
 .../tika/sax/BasicContentHandlerFactory.java       |   3 +
 .../org/apache/tika/sax/SecureContentHandler.java  |   2 +-
 tika-eval/pom.xml                                  |  35 ++++++++-
 .../java/org/apache/tika/eval/FileProfiler.java    |   2 +
 .../resources/tika-eval-file-profiler-config.xml   |   9 ++-
 tika-example/pom.xml                               |   2 +-
 tika-parsers/pom.xml                               |   2 +-
 .../parser/odf/OpenDocumentManifestHandler.java    |  46 ++++++++++++
 .../apache/tika/parser/odf/OpenDocumentParser.java |  82 ++++++++++++++++-----
 .../org/apache/tika/parser/odf/ODFParserTest.java  |  29 ++++++++
 .../resources/test-documents/testODTEncrypted.odt  | Bin 0 -> 12714 bytes
 .../tika/server/RecursiveMetadataResourceTest.java |  24 +++---
 15 files changed, 262 insertions(+), 63 deletions(-)

diff --cc CHANGES.txt
index bcc7c5d,84884ed..37dcabe
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@@ -1,8 -1,10 +1,13 @@@
- Release 1.26 - 03/09/2021
+ Release 1.26 - ??/??/????
+ 
+    * The "writeLimit" header now pertains to the combined characters
+      written per container document (and embedded documents) in the /rmeta
+      endpoint in tika-server (TIKA-3325); it no longer functions only
+      per container or embedded document.
  
 +   * Extract more embedded files in PDFs by recursively processing the
 +     embedded file tree (TIKA-3332).
 +
     * Allow for case insensitive headers for configuration of the PDFParser
       and the TesseractOCRParser in tika-server via Subhajit Das (TIKA-3320).