You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2021/04/20 14:37:16 UTC

[tika] branch main updated: TIKA-3359 -- extract rich media from PDFs -- broadened the search for /EF

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new e995fe2  TIKA-3359 -- extract rich media from PDFs -- broadened the search for /EF
e995fe2 is described below

commit e995fe2b60e5d837d6c41be6e7c98bffedc521e7
Author: tallison <ta...@apache.org>
AuthorDate: Tue Apr 20 10:36:47 2021 -0400

    TIKA-3359 -- extract rich media from PDFs -- broadened the search for /EF
---
 .../apache/tika/parser/pdf/AbstractPDF2XHTML.java  | 35 +++++----
 .../org/apache/tika/parser/pdf/PDFDOMUtil.java     | 82 ++++++++++++++++++++++
 .../org/apache/tika/parser/pdf/PDFParserTest.java  |  2 +
 3 files changed, 105 insertions(+), 14 deletions(-)

diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
index 65ce5e7..5a6d9ab 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
@@ -47,7 +47,6 @@ import java.util.TreeMap;
 import javax.xml.stream.XMLStreamException;
 
 import org.apache.commons.io.IOUtils;
-import org.apache.pdfbox.cos.COSArray;
 import org.apache.pdfbox.cos.COSDictionary;
 import org.apache.pdfbox.cos.COSName;
 import org.apache.pdfbox.pdmodel.PDDocument;
@@ -522,20 +521,17 @@ class AbstractPDF2XHTML extends PDFTextStripper {
                     }
                 } else if (annotation instanceof PDAnnotationWidget) {
                     handleWidget((PDAnnotationWidget) annotation);
-                } else if ("RichMedia".equals(annotation.getSubtype())) {
-                    COSArray array = (COSArray) annotation.getCOSObject().getObjectFromPath(
-                            "RichMediaContent/Assets/Names/");
-                    if (array == null || array.size() < 2) {
-                        //should log
-                        continue;
+                } else {
+                    String annotationType = annotation.getSubtype();
+                    if (annotationType == null) {
+                        annotationType = "unknown";
+                    }
+                    for (COSDictionary fileSpec :
+                            findFileSpecs(annotation.getCOSObject())) {
+                        PDComplexFileSpecification cfs = new PDComplexFileSpecification(fileSpec);
+                        handlePDComplexFileSpec(cfs.getFilename(),
+                                annotationType, cfs);
                     }
-                    String name = array.getString(0);
-                    COSDictionary filespec = (COSDictionary) array.getObject(1);
-                    PDComplexFileSpecification cfs = new PDComplexFileSpecification(filespec);
-                    //TODO: do we want to tag this as a rich media type attachment
-                    //in the embedded file's metadata at some point?
-                    handlePDComplexFileSpec(name,
-                            "annotationRichMedia", cfs);
                 }
                 // TODO: remove once PDFBOX-1143 is fixed:
                 if (config.isExtractAnnotationText()) {
@@ -621,6 +617,12 @@ class AbstractPDF2XHTML extends PDFTextStripper {
         }
     }
 
+    private List<COSDictionary> findFileSpecs(COSDictionary cosDict) {
+        Set<COSName> types = new HashSet<>(); // the only type searched for is FILESPEC
+        types.add(COSName.FILESPEC);
+        return PDFDOMUtil.findType(cosDict, types, MAX_RECURSION_DEPTH); // depth-limited search
+    }
+
     private void handlePDComplexFileSpec(String attachmentName,
                                          String annotationType,
                                          PDComplexFileSpecification fileSpec) throws IOException {
@@ -733,6 +735,11 @@ class AbstractPDF2XHTML extends PDFTextStripper {
             addNonNullAttribute("subtype", jsAction.getSubType(), attributes);
             xhtml.startElement("div", attributes);
             xhtml.endElement("div");
+        /*} else if (action instanceof PDActionSubmitForm) {
+            PDActionSubmitForm submitForm = (PDActionSubmitForm) action;
+            //these are typically urls, not actual file specification
+            PDFileSpecification fileSpecification = submitForm.getFile();
+            processDoc("", fileSpecification, new AttributesImpl());*/
         } else {
             xhtml.startElement("div", attributes);
             xhtml.endElement("div");
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFDOMUtil.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFDOMUtil.java
new file mode 100644
index 0000000..e8b9b2d
--- /dev/null
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFDOMUtil.java
@@ -0,0 +1,82 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pdf;
+
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.pdfbox.cos.COSArray;
+import org.apache.pdfbox.cos.COSBase;
+import org.apache.pdfbox.cos.COSDictionary;
+import org.apache.pdfbox.cos.COSName;
+import org.apache.pdfbox.cos.COSObject;
+
+class PDFDOMUtil {
+
+    /**
+     * Recursively searches cosBase for COSDictionary objects whose COSName.TYPE value is
+     * in the types set; a matching dictionary is collected and not searched further.
+     * Dictionaries typed COSName.P, COSName.PAGE or COSName.PARENT are intentionally skipped.
+     *
+     * @param cosBase root of the search; COSObject, COSDictionary and COSArray are traversed
+     * @param types type names to collect
+     * @param maxDepth recursion limit; anything at or beyond this depth is ignored (root is 0)
+     * @return the matching dictionaries in the order they were found; never null
+     */
+    static List<COSDictionary> findType(COSBase cosBase, Set<COSName> types, int maxDepth) {
+        List<COSDictionary> found = new ArrayList<>();
+        Set<COSBase> seen = new HashSet<>(); // objects already visited; each is walked only once
+        find(cosBase, types, 0, maxDepth, seen, found); // the root is depth 0
+        return found;
+    }
+
+    private static void find(COSBase cosBase, Set<COSName> types, int depth, int maxDepth,
+                             Set<COSBase> seen, List<COSDictionary> found) {
+        if (seen.contains(cosBase)) {
+            return;
+        }
+        if (depth >= maxDepth) {
+            return;
+        }
+        seen.add(cosBase);
+        if (cosBase instanceof COSObject) {
+            COSBase dereferencedBase = ((COSObject)cosBase).getObject();
+            find(dereferencedBase, types, depth + 1, maxDepth, seen, found); // costs one level
+        } else if (cosBase instanceof COSDictionary) {
+            COSDictionary dict = (COSDictionary)cosBase;
+            COSName value = dict.getCOSName(COSName.TYPE);
+            if (value != null && types.contains(value)) {
+                found.add(dict); // a match is collected and its entries are not descended
+            } else if (value != null && (value.equals(COSName.P) || value.equals(COSName.PAGE)
+                    || value.equals(COSName.PARENT))) {
+                //skip page, p, parent; NOTE(review): tests the type value, not the key -- confirm
+                return;
+            } else {
+                for (Map.Entry<COSName, COSBase> e : dict.entrySet()) {
+                    find(e.getValue(), types, depth + 1, maxDepth, seen, found);
+                }
+            }
+        } else if (cosBase instanceof COSArray) {
+            for (COSBase item : ((COSArray)cosBase)) {
+                find(item, types, depth + 1, maxDepth, seen, found);
+            }
+        }
+    }
+}
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index 18265cc..e83083f 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -1350,6 +1350,8 @@ public class PDFParserTest extends TikaTest {
         List<Metadata> metadata = getRecursiveMetadata("testFlashInPDF.pdf");
         assertEquals(2, metadata.size());
         assertEquals("application/x-shockwave-flash", metadata.get(1).get(Metadata.CONTENT_TYPE));
+        assertEquals("TestMovie02.swf", metadata.get(1).get(TikaCoreProperties.RESOURCE_NAME_KEY));
+        assertEquals("15036", metadata.get(1).get(Metadata.CONTENT_LENGTH));
     }
 
 }