You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2014/09/30 03:41:21 UTC
svn commit: r1628350 - in /tika/trunk/tika-parsers/src:
main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
test/java/org/apache/tika/parser/pdf/PDFParserTest.java
test/resources/test-documents/testPDFFileEmbInAnnotation.pdf
Author: tallison
Date: Tue Sep 30 01:41:20 2014
New Revision: 1628350
URL: http://svn.apache.org/r1628350
Log:
TIKA-1433 : extract documents embedded within annotations in PDFs
Added:
tika/trunk/tika-parsers/src/test/resources/test-documents/testPDFFileEmbInAnnotation.pdf (with props)
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java?rev=1628350&r1=1628349&r2=1628350&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java Tue Sep 30 01:41:20 2014
@@ -48,6 +48,8 @@ import org.apache.pdfbox.pdmodel.graphic
import org.apache.pdfbox.pdmodel.graphics.xobject.PDXObjectImage;
import org.apache.pdfbox.pdmodel.interactive.action.type.PDAction;
import org.apache.pdfbox.pdmodel.interactive.action.type.PDActionURI;
+import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
+import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationFileAttachment;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationLink;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationMarkup;
import org.apache.pdfbox.pdmodel.interactive.digitalsignature.PDSignature;
@@ -232,14 +234,27 @@ class PDF2XHTML extends PDFTextStripper
extractImages(page.getResources());
- // TODO: remove once PDFBOX-1143 is fixed:
- if (config.getExtractAnnotationText()) {
- for(Object o : page.getAnnotations()) {
- if( o instanceof PDAnnotationLink ) {
- PDAnnotationLink annotationlink = (PDAnnotationLink) o;
- if (annotationlink.getAction() != null) {
+ EmbeddedDocumentExtractor extractor = getEmbeddedDocumentExtractor();
+ for (PDAnnotation annotation : page.getAnnotations()) {
+
+ if (annotation instanceof PDAnnotationFileAttachment){
+ PDAnnotationFileAttachment fann = (PDAnnotationFileAttachment) annotation;
+ PDComplexFileSpecification fileSpec = (PDComplexFileSpecification) fann.getFile();
+ try {
+ extractMultiOSPDEmbeddedFiles("", fileSpec, extractor);
+ } catch (SAXException e) {
+ throw new IOExceptionWithCause("file embedded in annotation sax exception", e);
+ } catch (TikaException e) {
+ throw new IOExceptionWithCause("file embedded in annotation tika exception", e);
+ }
+ }
+ // TODO: remove once PDFBOX-1143 is fixed:
+ if (config.getExtractAnnotationText()) {
+ if (annotation instanceof PDAnnotationLink) {
+ PDAnnotationLink annotationlink = (PDAnnotationLink) annotation;
+ if (annotationlink.getAction() != null) {
PDAction action = annotationlink.getAction();
- if( action instanceof PDActionURI ) {
+ if (action instanceof PDActionURI) {
PDActionURI uri = (PDActionURI) action;
String link = uri.getURI();
if (link != null) {
@@ -248,16 +263,16 @@ class PDF2XHTML extends PDFTextStripper
handler.endElement("a");
handler.endElement("div");
}
- }
+ }
}
}
- if (o instanceof PDAnnotationMarkup) {
- PDAnnotationMarkup annot = (PDAnnotationMarkup) o;
- String title = annot.getTitlePopup();
- String subject = annot.getSubject();
- String contents = annot.getContents();
- // TODO: maybe also annot.getRichContents()?
+ if (annotation instanceof PDAnnotationMarkup) {
+ PDAnnotationMarkup annotationMarkup = (PDAnnotationMarkup) annotation;
+ String title = annotationMarkup.getTitlePopup();
+ String subject = annotationMarkup.getSubject();
+ String contents = annotationMarkup.getContents();
+ // TODO: maybe also annotationMarkup.getRichContents()?
if (title != null || subject != null || contents != null) {
handler.startElement("div", "class", "annotation");
@@ -476,23 +491,22 @@ class PDF2XHTML extends PDFTextStripper
EmbeddedDocumentExtractor extractor = getEmbeddedDocumentExtractor();
for (Map.Entry<String,COSObjectable> ent : embeddedFileNames.entrySet()) {
PDComplexFileSpecification spec = (PDComplexFileSpecification) ent.getValue();
- if (spec == null) {
- //skip silently
- continue;
- }
- PDEmbeddedFile file = spec.getEmbeddedFile();
- if (file == null) {
- //skip silently
- continue;
- }
+ extractMultiOSPDEmbeddedFiles(ent.getKey(), spec, extractor);
+ }
+ }
- //current strategy is to pull all, not just first non-null
- extractPDEmbeddedFile(ent.getKey(), spec.getFile(), spec.getEmbeddedFile(), extractor);
- extractPDEmbeddedFile(ent.getKey(), spec.getFileMac(), spec.getEmbeddedFileMac(), extractor);
- extractPDEmbeddedFile(ent.getKey(), spec.getFileDos(), spec.getEmbeddedFileDos(), extractor);
- extractPDEmbeddedFile(ent.getKey(), spec.getFileUnix(), spec.getEmbeddedFileUnix(), extractor);
+ private void extractMultiOSPDEmbeddedFiles(String defaultName,
+ PDComplexFileSpecification spec, EmbeddedDocumentExtractor extractor) throws IOException,
+ SAXException, TikaException {
+ if (spec == null) {
+ return;
}
+ //current strategy is to pull all, not just first non-null
+ extractPDEmbeddedFile(defaultName, spec.getFile(), spec.getEmbeddedFile(), extractor);
+ extractPDEmbeddedFile(defaultName, spec.getFileMac(), spec.getEmbeddedFileMac(), extractor);
+ extractPDEmbeddedFile(defaultName, spec.getFileDos(), spec.getEmbeddedFileDos(), extractor);
+ extractPDEmbeddedFile(defaultName, spec.getFileUnix(), spec.getEmbeddedFileUnix(), extractor);
}
private void extractPDEmbeddedFile(String defaultName, String fileName, PDEmbeddedFile file,
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java?rev=1628350&r1=1628349&r2=1628350&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java Tue Sep 30 01:41:20 2014
@@ -704,6 +704,13 @@ public class PDFParserTest extends TikaT
@Test
+ public void testEmbeddedFilesInAnnotations() throws Exception {
+ String xml = getXML("/testPDFFileEmbInAnnotation.pdf").xml;
+
+ assertTrue(xml.contains("This is a Excel"));
+ }
+
+ @Test
public void testSingleCloseDoc() throws Exception {
//TIKA-1341
InputStream is = PDFParserTest.class.getResourceAsStream(
Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testPDFFileEmbInAnnotation.pdf
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testPDFFileEmbInAnnotation.pdf?rev=1628350&view=auto
==============================================================================
Binary file - no diff available.
Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testPDFFileEmbInAnnotation.pdf
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream