You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2021/04/20 14:37:16 UTC
[tika] branch main updated: TIKA-3359 -- extract rich media from
PDFs -- broadened the search for /EF
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new e995fe2 TIKA-3359 -- extract rich media from PDFs -- broadened the search for /EF
e995fe2 is described below
commit e995fe2b60e5d837d6c41be6e7c98bffedc521e7
Author: tallison <ta...@apache.org>
AuthorDate: Tue Apr 20 10:36:47 2021 -0400
TIKA-3359 -- extract rich media from PDFs -- broadened the search for /EF
---
.../apache/tika/parser/pdf/AbstractPDF2XHTML.java | 35 +++++----
.../org/apache/tika/parser/pdf/PDFDOMUtil.java | 82 ++++++++++++++++++++++
.../org/apache/tika/parser/pdf/PDFParserTest.java | 2 +
3 files changed, 105 insertions(+), 14 deletions(-)
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
index 65ce5e7..5a6d9ab 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
@@ -47,7 +47,6 @@ import java.util.TreeMap;
import javax.xml.stream.XMLStreamException;
import org.apache.commons.io.IOUtils;
-import org.apache.pdfbox.cos.COSArray;
import org.apache.pdfbox.cos.COSDictionary;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDDocument;
@@ -522,20 +521,17 @@ class AbstractPDF2XHTML extends PDFTextStripper {
}
} else if (annotation instanceof PDAnnotationWidget) {
handleWidget((PDAnnotationWidget) annotation);
- } else if ("RichMedia".equals(annotation.getSubtype())) {
- COSArray array = (COSArray) annotation.getCOSObject().getObjectFromPath(
- "RichMediaContent/Assets/Names/");
- if (array == null || array.size() < 2) {
- //should log
- continue;
+ } else {
+ String annotationType = annotation.getSubtype();
+ if (annotationType == null) {
+ annotationType = "unknown";
+ }
+ for (COSDictionary fileSpec :
+ findFileSpecs(annotation.getCOSObject())) {
+ PDComplexFileSpecification cfs = new PDComplexFileSpecification(fileSpec);
+ handlePDComplexFileSpec(cfs.getFilename(),
+ annotationType, cfs);
}
- String name = array.getString(0);
- COSDictionary filespec = (COSDictionary) array.getObject(1);
- PDComplexFileSpecification cfs = new PDComplexFileSpecification(filespec);
- //TODO: do we want to tag this as a rich media type attachment
- //in the embedded file's metadata at some point?
- handlePDComplexFileSpec(name,
- "annotationRichMedia", cfs);
}
// TODO: remove once PDFBOX-1143 is fixed:
if (config.isExtractAnnotationText()) {
@@ -621,6 +617,12 @@ class AbstractPDF2XHTML extends PDFTextStripper {
}
}
+ private List<COSDictionary> findFileSpecs(COSDictionary cosDict) { //collects file specs reachable from cosDict
+ Set<COSName> types = new HashSet<>(); //dictionary types the search should collect
+ types.add(COSName.FILESPEC); //only dictionaries typed COSName.FILESPEC are wanted
+ return PDFDOMUtil.findType(cosDict, types, MAX_RECURSION_DEPTH); //search is bounded by MAX_RECURSION_DEPTH
+ }
+
private void handlePDComplexFileSpec(String attachmentName,
String annotationType,
PDComplexFileSpecification fileSpec) throws IOException {
@@ -733,6 +735,11 @@ class AbstractPDF2XHTML extends PDFTextStripper {
addNonNullAttribute("subtype", jsAction.getSubType(), attributes);
xhtml.startElement("div", attributes);
xhtml.endElement("div");
+ /*} else if (action instanceof PDActionSubmitForm) {
+ PDActionSubmitForm submitForm = (PDActionSubmitForm) action;
+ //these are typically urls, not actual file specification
+ PDFileSpecification fileSpecification = submitForm.getFile();
+ processDoc("", fileSpecification, new AttributesImpl());*/
} else {
xhtml.startElement("div", attributes);
xhtml.endElement("div");
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFDOMUtil.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFDOMUtil.java
new file mode 100644
index 0000000..e8b9b2d
--- /dev/null
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFDOMUtil.java
@@ -0,0 +1,82 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pdf;
+
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.pdfbox.cos.COSArray;
+import org.apache.pdfbox.cos.COSBase;
+import org.apache.pdfbox.cos.COSDictionary;
+import org.apache.pdfbox.cos.COSName;
+import org.apache.pdfbox.cos.COSObject;
+
+class PDFDOMUtil {
+
+ /**
+ * Recursively searches cosBase for COSDictionary instances whose COSName.TYPE entry is
+ * in the types set. A matching dictionary is collected and not searched further; a
+ * dictionary whose type is COSName.P, COSName.PAGE or COSName.PARENT is not descended.
+ *
+ * @param cosBase object at which the search starts (depth 0)
+ * @param types type names that identify the dictionaries to collect
+ * @param maxDepth depth limit; an object reached at this depth or deeper is not visited
+ * @return the matching dictionaries in the order they were encountered; never null
+ */
+ static List<COSDictionary> findType(COSBase cosBase, Set<COSName> types, int maxDepth) {
+ List<COSDictionary> found = new ArrayList<>();
+ Set<COSBase> seen = new HashSet<>(); //objects already visited during this search
+ find(cosBase, types, 0, maxDepth, seen, found);
+ return found;
+ }
+
+ private static void find(COSBase cosBase, Set<COSName> types, int depth, int maxDepth,
+ Set<COSBase> seen, List<COSDictionary> found) {
+ if (seen.contains(cosBase)) { //already visited -- never process the same object twice
+ return;
+ }
+ if (depth >= maxDepth) { //depth budget exhausted
+ return;
+ }
+ seen.add(cosBase);
+ if (cosBase instanceof COSObject) {
+ COSBase dereferencedBase = ((COSObject)cosBase).getObject();
+ find(dereferencedBase, types, depth + 1, maxDepth, seen, found); //the dereference consumes one depth level
+ } else if (cosBase instanceof COSDictionary) {
+ COSDictionary dict = (COSDictionary)cosBase;
+ COSName value = dict.getCOSName(COSName.TYPE);
+ if (value != null && types.contains(value)) {
+ found.add(dict); //collected; its entries are not searched
+ } else if (value != null && (value.equals(COSName.P) || value.equals(COSName.PAGE)
+ || value.equals(COSName.PARENT))) {
+ //skip dictionaries whose type is P, PAGE or PARENT (the check is on the type value, not the entry key)
+ return;
+ } else {
+ for (Map.Entry<COSName, COSBase> e : dict.entrySet()) {
+ find(e.getValue(), types, depth + 1, maxDepth, seen, found);
+ }
+ }
+ } else if (cosBase instanceof COSArray) {
+ for (COSBase item : ((COSArray)cosBase)) {
+ find(item, types, depth + 1, maxDepth, seen, found);
+ }
+ }
+ }
+}
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index 18265cc..e83083f 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -1350,6 +1350,8 @@ public class PDFParserTest extends TikaTest {
List<Metadata> metadata = getRecursiveMetadata("testFlashInPDF.pdf");
assertEquals(2, metadata.size());
assertEquals("application/x-shockwave-flash", metadata.get(1).get(Metadata.CONTENT_TYPE));
+ assertEquals("TestMovie02.swf", metadata.get(1).get(TikaCoreProperties.RESOURCE_NAME_KEY));
+ assertEquals("15036", metadata.get(1).get(Metadata.CONTENT_LENGTH));
}
}