You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2014/09/30 03:41:21 UTC

svn commit: r1628350 - in /tika/trunk/tika-parsers/src: main/java/org/apache/tika/parser/pdf/PDF2XHTML.java test/java/org/apache/tika/parser/pdf/PDFParserTest.java test/resources/test-documents/testPDFFileEmbInAnnotation.pdf

Author: tallison
Date: Tue Sep 30 01:41:20 2014
New Revision: 1628350

URL: http://svn.apache.org/r1628350
Log:
TIKA-1433 : extract documents embedded within annotations in PDFs

Added:
    tika/trunk/tika-parsers/src/test/resources/test-documents/testPDFFileEmbInAnnotation.pdf   (with props)
Modified:
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java?rev=1628350&r1=1628349&r2=1628350&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java Tue Sep 30 01:41:20 2014
@@ -48,6 +48,8 @@ import org.apache.pdfbox.pdmodel.graphic
 import org.apache.pdfbox.pdmodel.graphics.xobject.PDXObjectImage;
 import org.apache.pdfbox.pdmodel.interactive.action.type.PDAction;
 import org.apache.pdfbox.pdmodel.interactive.action.type.PDActionURI;
+import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
+import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationFileAttachment;
 import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationLink;
 import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationMarkup;
 import org.apache.pdfbox.pdmodel.interactive.digitalsignature.PDSignature;
@@ -232,14 +234,27 @@ class PDF2XHTML extends PDFTextStripper 
 
             extractImages(page.getResources());
 
-            // TODO: remove once PDFBOX-1143 is fixed:
-            if (config.getExtractAnnotationText()) {
-                for(Object o : page.getAnnotations()) {
-                    if( o instanceof PDAnnotationLink ) {
-                        PDAnnotationLink annotationlink = (PDAnnotationLink) o;
-                        if (annotationlink.getAction()  != null) {
+            EmbeddedDocumentExtractor extractor = getEmbeddedDocumentExtractor();
+            for (PDAnnotation annotation : page.getAnnotations()) {
+
+                if (annotation instanceof PDAnnotationFileAttachment){
+                    PDAnnotationFileAttachment fann = (PDAnnotationFileAttachment) annotation;
+                    PDComplexFileSpecification fileSpec = (PDComplexFileSpecification) fann.getFile();
+                    try {
+                        extractMultiOSPDEmbeddedFiles("", fileSpec, extractor);
+                    } catch (SAXException e) {
+                        throw new IOExceptionWithCause("file embedded in annotation sax exception", e);
+                    } catch (TikaException e) {
+                        throw new IOExceptionWithCause("file embedded in annotation tika exception", e);
+                    }
+                }
+                // TODO: remove once PDFBOX-1143 is fixed:
+                if (config.getExtractAnnotationText()) {
+                    if (annotation instanceof PDAnnotationLink) {
+                        PDAnnotationLink annotationlink = (PDAnnotationLink) annotation;
+                        if (annotationlink.getAction() != null) {
                             PDAction action = annotationlink.getAction();
-                            if( action instanceof PDActionURI ) {
+                            if (action instanceof PDActionURI) {
                                 PDActionURI uri = (PDActionURI) action;
                                 String link = uri.getURI();
                                 if (link != null) {
@@ -248,16 +263,16 @@ class PDF2XHTML extends PDFTextStripper 
                                     handler.endElement("a");
                                     handler.endElement("div");
                                 }
-                             }
+                            }
                         }
                     }
 
-                    if (o instanceof PDAnnotationMarkup) {
-                        PDAnnotationMarkup annot = (PDAnnotationMarkup) o;
-                        String title = annot.getTitlePopup();
-                        String subject = annot.getSubject();
-                        String contents = annot.getContents();
-                        // TODO: maybe also annot.getRichContents()?
+                    if (annotation instanceof PDAnnotationMarkup) {
+                        PDAnnotationMarkup annotationMarkup = (PDAnnotationMarkup) annotation;
+                        String title = annotationMarkup.getTitlePopup();
+                        String subject = annotationMarkup.getSubject();
+                        String contents = annotationMarkup.getContents();
+                        // TODO: maybe also annotationMarkup.getRichContents()?
                         if (title != null || subject != null || contents != null) {
                             handler.startElement("div", "class", "annotation");
 
@@ -476,23 +491,22 @@ class PDF2XHTML extends PDFTextStripper 
         EmbeddedDocumentExtractor extractor = getEmbeddedDocumentExtractor();
         for (Map.Entry<String,COSObjectable> ent : embeddedFileNames.entrySet()) {
             PDComplexFileSpecification spec = (PDComplexFileSpecification) ent.getValue();
-            if (spec == null) {
-                //skip silently
-                continue;
-            }
-            PDEmbeddedFile file = spec.getEmbeddedFile();
-            if (file == null) {
-                //skip silently
-                continue;
-            }
+            extractMultiOSPDEmbeddedFiles(ent.getKey(), spec, extractor);
+        }
+    }
 
-            //current strategy is to pull all, not just first non-null                
-            extractPDEmbeddedFile(ent.getKey(), spec.getFile(), spec.getEmbeddedFile(), extractor);
-            extractPDEmbeddedFile(ent.getKey(), spec.getFileMac(), spec.getEmbeddedFileMac(), extractor);
-            extractPDEmbeddedFile(ent.getKey(), spec.getFileDos(), spec.getEmbeddedFileDos(), extractor);
-            extractPDEmbeddedFile(ent.getKey(), spec.getFileUnix(), spec.getEmbeddedFileUnix(), extractor);
+    private void extractMultiOSPDEmbeddedFiles(String defaultName,
+        PDComplexFileSpecification spec, EmbeddedDocumentExtractor extractor) throws IOException,
+            SAXException, TikaException {
 
+        if (spec == null) {
+            return;
         }
+        //current strategy is to pull all, not just first non-null
+        extractPDEmbeddedFile(defaultName, spec.getFile(), spec.getEmbeddedFile(), extractor);
+        extractPDEmbeddedFile(defaultName, spec.getFileMac(), spec.getEmbeddedFileMac(), extractor);
+        extractPDEmbeddedFile(defaultName, spec.getFileDos(), spec.getEmbeddedFileDos(), extractor);
+        extractPDEmbeddedFile(defaultName, spec.getFileUnix(), spec.getEmbeddedFileUnix(), extractor);
     }
 
     private void extractPDEmbeddedFile(String defaultName, String fileName, PDEmbeddedFile file,

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java?rev=1628350&r1=1628349&r2=1628350&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java Tue Sep 30 01:41:20 2014
@@ -704,6 +704,13 @@ public class PDFParserTest extends TikaT
 
 
     @Test
+    public void testEmbeddedFilesInAnnotations() throws Exception {
+        String xml = getXML("/testPDFFileEmbInAnnotation.pdf").xml;
+
+        assertTrue(xml.contains("This is a Excel"));
+    }
+
+    @Test
     public void testSingleCloseDoc() throws Exception {
         //TIKA-1341
         InputStream is = PDFParserTest.class.getResourceAsStream(

Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testPDFFileEmbInAnnotation.pdf
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testPDFFileEmbInAnnotation.pdf?rev=1628350&view=auto
==============================================================================
Binary file - no diff available.

Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testPDFFileEmbInAnnotation.pdf
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream