You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2021/04/16 13:58:15 UTC
[tika] branch main updated: TIKA-3359 -- extract rich media from PDFs
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 601cfff TIKA-3359 -- extract rich media from PDFs
601cfff is described below
commit 601cfff8762e0bf69a6e08f2cdf09590a6dc311b
Author: tallison <ta...@apache.org>
AuthorDate: Fri Apr 16 09:57:45 2021 -0400
TIKA-3359 -- extract rich media from PDFs
---
.../apache/tika/parser/pdf/AbstractPDF2XHTML.java | 53 +++++++++++++++------
.../org/apache/tika/parser/pdf/PDFParserTest.java | 11 ++++-
.../resources/test-documents/testFlashInPDF.pdf | Bin 0 -> 161445 bytes
3 files changed, 48 insertions(+), 16 deletions(-)
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
index ed3b7c0..65ce5e7 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
@@ -47,6 +47,8 @@ import java.util.TreeMap;
import javax.xml.stream.XMLStreamException;
import org.apache.commons.io.IOUtils;
+import org.apache.pdfbox.cos.COSArray;
+import org.apache.pdfbox.cos.COSDictionary;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
@@ -514,23 +516,26 @@ class AbstractPDF2XHTML extends PDFTextStripper {
if (annotation instanceof PDAnnotationFileAttachment) {
PDAnnotationFileAttachment fann = (PDAnnotationFileAttachment) annotation;
if (fann.getFile() instanceof PDComplexFileSpecification) {
- PDComplexFileSpecification fileSpec =
- (PDComplexFileSpecification) fann.getFile();
- try {
- AttributesImpl attributes = new AttributesImpl();
- attributes.addAttribute("", "source", "source", "CDATA", "annotation");
- extractMultiOSPDEmbeddedFiles(fann.getAttachmentName(), fileSpec,
- attributes);
- } catch (SAXException e) {
- throw new IOException("file embedded in annotation sax exception", e);
- } catch (TikaException e) {
- throw new IOException("file embedded in annotation tika exception", e);
- } catch (IOException e) {
- handleCatchableIOE(e);
- }
+ handlePDComplexFileSpec(fann.getAttachmentName(),
+ "annotationFileAttachment",
+ (PDComplexFileSpecification)fann.getFile());
}
} else if (annotation instanceof PDAnnotationWidget) {
handleWidget((PDAnnotationWidget) annotation);
+ } else if ("RichMedia".equals(annotation.getSubtype())) {
+ COSArray array = (COSArray) annotation.getCOSObject().getObjectFromPath(
+ "RichMediaContent/Assets/Names/");
+ if (array == null || array.size() < 2) {
+ //should log
+ continue;
+ }
+ String name = array.getString(0);
+ COSDictionary filespec = (COSDictionary) array.getObject(1);
+ PDComplexFileSpecification cfs = new PDComplexFileSpecification(filespec);
+ //TODO: do we want to tag this as a rich media type attachment
+ //in the embedded file's metadata at some point?
+ handlePDComplexFileSpec(name,
+ "annotationRichMedia", cfs);
}
// TODO: remove once PDFBOX-1143 is fixed:
if (config.isExtractAnnotationText()) {
@@ -604,7 +609,6 @@ class AbstractPDF2XHTML extends PDFTextStripper {
}
if (config.isExtractFontNames()) {
-
for (COSName n : page.getResources().getFontNames()) {
PDFont font = page.getResources().getFont(n);
if (font != null && font.getFontDescriptor() != null) {
@@ -617,6 +621,25 @@ class AbstractPDF2XHTML extends PDFTextStripper {
}
}
+ private void handlePDComplexFileSpec(String attachmentName,
+ String annotationType,
+ PDComplexFileSpecification fileSpec) throws IOException {
+ try {
+ AttributesImpl attributes = new AttributesImpl();
+ attributes.addAttribute("", "source", "source", "CDATA", annotationType);
+ extractMultiOSPDEmbeddedFiles(attachmentName, fileSpec,
+ attributes);
+ } catch (SAXException e) {
+ throw new IOException("file embedded in annotation sax exception", e);
+ } catch (TikaException e) {
+ throw new IOException("file embedded in annotation tika exception", e);
+ } catch (IOException e) {
+ handleCatchableIOE(e);
+ }
+
+ }
+
+
private void handleWidget(PDAnnotationWidget widget)
throws TikaException, SAXException, IOException {
if (widget == null) {
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index b0990a3..18265cc 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -808,7 +808,8 @@ public class PDFParserTest extends TikaTest {
//doc embedded inside an annotation
r = getXML("testPDFFileEmbInAnnotation.pdf");
- assertContains("<div source=\"annotation\" class=\"embedded\" id=\"Excel.xlsx\" />", r.xml);
+ assertContains("<div source=\"annotationFileAttachment\" class=\"embedded\" id=\"Excel" +
+ ".xlsx\" />", r.xml);
}
//Access checker tests
@@ -1343,4 +1344,12 @@ public class PDFParserTest extends TikaTest {
"testPDF_deeplyEmbeddedAttachments.pdf");
assertEquals(21, metadataList.size());
}
+
+ @Test
+ public void testEmbeddedRichMedia() throws Exception {
+ List<Metadata> metadata = getRecursiveMetadata("testFlashInPDF.pdf");
+ assertEquals(2, metadata.size());
+ assertEquals("application/x-shockwave-flash", metadata.get(1).get(Metadata.CONTENT_TYPE));
+ }
+
}
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/test/resources/test-documents/testFlashInPDF.pdf b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/test/resources/test-documents/testFlashInPDF.pdf
new file mode 100644
index 0000000..07f59bd
Binary files /dev/null and b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/test/resources/test-documents/testFlashInPDF.pdf differ