You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/12/01 00:47:35 UTC
tika git commit: TIKA-2090: Allow extraction of PDActions (including Javascript) from PDFs (TIKA-2090).

Repository: tika
Updated Branches:
  refs/heads/2.x 3d08da79f -> 300100fcb


 TIKA-2090: Allow extraction of PDActions (including Javascript) from PDFs (TIKA-2090).


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/300100fc
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/300100fc
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/300100fc

Branch: refs/heads/2.x
Commit: 300100fcb9a39e8997764e0d3b3ecd0d213c7824
Parents: 3d08da7
Author: tballison <ta...@mitre.org>
Authored: Wed Nov 30 19:47:26 2016 -0500
Committer: tballison <ta...@mitre.org>
Committed: Wed Nov 30 19:47:26 2016 -0500

----------------------------------------------------------------------
 CHANGES.txt                                     |   3 +
 .../main/java/org/apache/tika/metadata/PDF.java |   7 +
 .../tika/parser/pdf/AbstractPDF2XHTML.java      | 216 ++++++++++++++++---
 .../apache/tika/parser/pdf/PDFParserConfig.java |  32 ++-
 .../apache/tika/parser/pdf/PDFParserTest.java   |   8 +-
 5 files changed, 235 insertions(+), 31 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/300100fc/CHANGES.txt
----------------------------------------------------------------------
diff --git a/CHANGES.txt b/CHANGES.txt
index d948af6..daba7df 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -17,6 +17,9 @@ Release 2.0 - ???
 
 Release 1.15 -???
 
+  * Allow extraction of PDActions (including Javascript) from
+    PDFs (TIKA-2090).
+
   * Change default behavior in experimental .docx parser to ignore
     deleted text to align with .doc (TIKA-2187).
 

http://git-wip-us.apache.org/repos/asf/tika/blob/300100fc/tika-core/src/main/java/org/apache/tika/metadata/PDF.java
----------------------------------------------------------------------
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/PDF.java b/tika-core/src/main/java/org/apache/tika/metadata/PDF.java
index 4123e73..c7ea7fc 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/PDF.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/PDF.java
@@ -64,4 +64,11 @@ public interface PDF {
     Property PDFAID_PART = Property.internalText(PDFAID_PREFIX+"part");
 
     Property IS_ENCRYPTED = Property.internalBoolean(PDF_PREFIX+"encrypted");
+
+    /**
+     * Identifies where in the document an action or destination is found
+     * or triggered: on document open, before close, etc.
+     */
+    Property ACTION_TRIGGER = Property.internalText(PDF_PREFIX+"actionTrigger");
+
 }

http://git-wip-us.apache.org/repos/asf/tika/blob/300100fc/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
index c175138..8f4374e 100644
--- a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
+++ b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
@@ -25,6 +25,7 @@ import java.io.ByteArrayInputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.OutputStream;
+import java.nio.charset.StandardCharsets;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.text.SimpleDateFormat;
@@ -43,15 +44,27 @@ import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
 import org.apache.pdfbox.pdmodel.PDDocumentNameDictionary;
 import org.apache.pdfbox.pdmodel.PDEmbeddedFilesNameTreeNode;
 import org.apache.pdfbox.pdmodel.PDPage;
+import org.apache.pdfbox.pdmodel.common.PDDestinationOrAction;
 import org.apache.pdfbox.pdmodel.common.PDNameTreeNode;
 import org.apache.pdfbox.pdmodel.common.filespecification.PDComplexFileSpecification;
 import org.apache.pdfbox.pdmodel.common.filespecification.PDEmbeddedFile;
+import org.apache.pdfbox.pdmodel.common.filespecification.PDFileSpecification;
+import org.apache.pdfbox.pdmodel.common.filespecification.PDSimpleFileSpecification;
 import org.apache.pdfbox.pdmodel.interactive.action.PDAction;
+import org.apache.pdfbox.pdmodel.interactive.action.PDActionImportData;
+import org.apache.pdfbox.pdmodel.interactive.action.PDActionJavaScript;
+import org.apache.pdfbox.pdmodel.interactive.action.PDActionLaunch;
+import org.apache.pdfbox.pdmodel.interactive.action.PDActionRemoteGoTo;
 import org.apache.pdfbox.pdmodel.interactive.action.PDActionURI;
+import org.apache.pdfbox.pdmodel.interactive.action.PDAnnotationAdditionalActions;
+import org.apache.pdfbox.pdmodel.interactive.action.PDDocumentCatalogAdditionalActions;
+import org.apache.pdfbox.pdmodel.interactive.action.PDFormFieldAdditionalActions;
+import org.apache.pdfbox.pdmodel.interactive.action.PDPageAdditionalActions;
 import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
 import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationFileAttachment;
 import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationLink;
 import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationMarkup;
+import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationWidget;
 import org.apache.pdfbox.pdmodel.interactive.digitalsignature.PDSignature;
 import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocumentOutline;
 import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem;
@@ -70,6 +83,7 @@ import org.apache.tika.extractor.EmbeddedDocumentUtil;
 import org.apache.tika.io.TemporaryResources;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.PDF;
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.ocr.TesseractOCRConfig;
@@ -82,6 +96,33 @@ import org.xml.sax.helpers.AttributesImpl;
 
 class AbstractPDF2XHTML extends PDFTextStripper {
 
+    enum ActionTrigger {
+        AFTER_DOCUMENT_PRINT,
+        AFTER_DOCUMENT_SAVE,
+        ANNOTATION_CURSOR_ENTERS,
+        ANNOTATION_CURSOR_EXIT,
+        ANNOTATION_LOSE_INPUT_FOCUS,
+        ANNOTATION_MOUSE_CLICK,
+        ANNOTATION_MOUSE_RELEASED,
+        ANNOTATION_PAGE_CLOSED,
+        ANNOTATION_PAGE_NO_LONGER_VISIBLE,
+        ANNOTATION_PAGE_OPENED,
+        ANNOTATION_PAGE_VISIBLE,
+        ANNOTATION_RECEIVES_FOCUS,
+        ANNOTATION_WIDGET,
+        BEFORE_DOCUMENT_CLOSE,
+        BEFORE_DOCUMENT_PRINT,
+        BEFORE_DOCUMENT_SAVE,
+        DOCUMENT_OPEN,
+        FORM_FIELD,
+        FORM_FIELD_FORMATTED,
+        FORM_FIELD_KEYSTROKE,
+        FORM_FIELD_RECALCULATE,
+        FORM_FIELD_VALUE_CHANGE,
+        PAGE_CLOSE,
+        PAGE_OPEN, BOOKMARK,
+    };
+
     /**
      * Maximum recursive depth during AcroForm processing.
      * Prevents theoretical AcroForm recursion bomb.
@@ -157,6 +198,20 @@ class AbstractPDF2XHTML extends PDFTextStripper {
         }
     }
 
+    private void processDoc(String name, PDFileSpecification spec, AttributesImpl attributes) throws TikaException, SAXException, IOException {
+        if (spec instanceof PDSimpleFileSpecification) {
+            attributes.addAttribute("", "class", "class", "CDATA", "linked");
+            attributes.addAttribute("", "id", "id", "CDATA", spec.getFile());
+            xhtml.startElement("div", attributes);
+            xhtml.endElement("div");
+        } else if (spec instanceof  PDComplexFileSpecification){
+            if (attributes.getIndex("source") < 0) {
+                attributes.addAttribute("", "source", "source", "CDATA", "attachment");
+            }
+            extractMultiOSPDEmbeddedFiles(name, (PDComplexFileSpecification)spec, attributes);
+        }
+    }
+
     private void processEmbeddedDocNames(Map<String, PDComplexFileSpecification> embeddedFileNames)
             throws IOException, SAXException, TikaException {
         if (embeddedFileNames == null || embeddedFileNames.isEmpty()) {
@@ -164,27 +219,30 @@ class AbstractPDF2XHTML extends PDFTextStripper {
         }
 
         for (Map.Entry<String, PDComplexFileSpecification> ent : embeddedFileNames.entrySet()) {
-            PDComplexFileSpecification spec = ent.getValue();
-            extractMultiOSPDEmbeddedFiles(ent.getKey(), spec);
+            processDoc(ent.getKey(), ent.getValue(), new AttributesImpl());
         }
     }
 
     private void extractMultiOSPDEmbeddedFiles(String displayName,
-                                       PDComplexFileSpecification spec) throws IOException,
+                                               PDComplexFileSpecification spec, AttributesImpl attributes) throws IOException,
             SAXException, TikaException {
 
         if (spec == null) {
             return;
         }
         //current strategy is to pull all, not just first non-null
-        extractPDEmbeddedFile(displayName, spec.getFileUnicode(), spec.getFile(), spec.getEmbeddedFile());
-        extractPDEmbeddedFile(displayName, spec.getFileUnicode(), spec.getFileMac(), spec.getEmbeddedFileMac());
-        extractPDEmbeddedFile(displayName, spec.getFileUnicode(), spec.getFileDos(), spec.getEmbeddedFileDos());
-        extractPDEmbeddedFile(displayName, spec.getFileUnicode(), spec.getFileUnix(), spec.getEmbeddedFileUnix());
+        extractPDEmbeddedFile(displayName, spec.getFileUnicode(),
+                spec.getFile(), spec.getEmbeddedFile(), attributes);
+        extractPDEmbeddedFile(displayName, spec.getFileUnicode(),
+                spec.getFileMac(), spec.getEmbeddedFileMac(), attributes);
+        extractPDEmbeddedFile(displayName, spec.getFileUnicode(),
+                spec.getFileDos(), spec.getEmbeddedFileDos(), attributes);
+        extractPDEmbeddedFile(displayName, spec.getFileUnicode(),
+                spec.getFileUnix(), spec.getEmbeddedFileUnix(), attributes);
     }
 
     private void extractPDEmbeddedFile(String displayName, String unicodeFileName,
-                                       String fileName, PDEmbeddedFile file)
+                                       String fileName, PDEmbeddedFile file, AttributesImpl attributes)
             throws SAXException, IOException, TikaException {
 
         if (file == null) {
@@ -202,25 +260,15 @@ class AbstractPDF2XHTML extends PDFTextStripper {
         embeddedMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
                 TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.toString());
         embeddedMetadata.set(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, fileName);
-
         if (embeddedDocumentExtractor.shouldParseEmbedded(embeddedMetadata)) {
             TikaInputStream stream = null;
             try {
-
-                InputStream rawStream = null;
-                try {
-                    rawStream = file.createInputStream();
-                } catch (IOException e) {
-                    EmbeddedDocumentUtil.recordException(e, metadata);
-                    return;
-                }
-                stream = TikaInputStream.get(rawStream);
+                stream = TikaInputStream.get(file.createInputStream());
                 embeddedDocumentExtractor.parseEmbedded(
                         stream,
                         new EmbeddedContentHandler(xhtml),
                         embeddedMetadata, false);
 
-                AttributesImpl attributes = new AttributesImpl();
                 attributes.addAttribute("", "class", "class", "CDATA", "embedded");
                 attributes.addAttribute("", "id", "id", "CDATA", fileName);
                 xhtml.startElement("div", attributes);
@@ -239,6 +287,7 @@ class AbstractPDF2XHTML extends PDFTextStripper {
                 // WriteOutContentHandler.WriteLimitReachedException?
                 throw e;
             }
+
             String msg = e.getMessage();
             if (msg == null) {
                 msg = "IOException, no message";
@@ -295,7 +344,9 @@ class AbstractPDF2XHTML extends PDFTextStripper {
                     PDAnnotationFileAttachment fann = (PDAnnotationFileAttachment) annotation;
                     PDComplexFileSpecification fileSpec = (PDComplexFileSpecification) fann.getFile();
                     try {
-                        extractMultiOSPDEmbeddedFiles(fann.getAttachmentName(), fileSpec);
+                        AttributesImpl attributes = new AttributesImpl();
+                        attributes.addAttribute("", "source", "source", "CDATA", "annotation");
+                        extractMultiOSPDEmbeddedFiles(fann.getAttachmentName(), fileSpec, attributes);
                     } catch (SAXException e) {
                         throw new IOExceptionWithCause("file embedded in annotation sax exception", e);
                     } catch (TikaException e) {
@@ -303,6 +354,8 @@ class AbstractPDF2XHTML extends PDFTextStripper {
                     } catch (IOException e) {
                         handleCatchableIOE(e);
                     }
+                } else if (annotation instanceof PDAnnotationWidget) {
+                    handleWidget((PDAnnotationWidget)annotation);
                 }
                 // TODO: remove once PDFBOX-1143 is fixed:
                 if (config.getExtractAnnotationText()) {
@@ -311,11 +364,15 @@ class AbstractPDF2XHTML extends PDFTextStripper {
                         if (annotationlink.getAction() != null) {
                             PDAction action = annotationlink.getAction();
                             if (action instanceof PDActionURI) {
+                                //The link can't currently be associated with its visible text.
+                                //For now, extract the link and repeat it as if it
+                                //were the visible text.
                                 PDActionURI uri = (PDActionURI) action;
                                 String link = uri.getURI();
-                                if (link != null) {
+                                if (link != null && link.trim().length() > 0) {
                                     xhtml.startElement("div", "class", "annotation");
                                     xhtml.startElement("a", "href", link);
+                                    xhtml.characters(link);
                                     xhtml.endElement("a");
                                     xhtml.endElement("div");
                                 }
@@ -358,6 +415,12 @@ class AbstractPDF2XHTML extends PDFTextStripper {
             if (config.getOCRStrategy().equals(PDFParserConfig.OCR_STRATEGY.OCR_AND_TEXT_EXTRACTION)) {
                 doOCROnCurrentPage();
             }
+
+            PDPageAdditionalActions pageActions = page.getActions();
+            if (pageActions != null) {
+                handleDestinationOrAction(pageActions.getC(), ActionTrigger.PAGE_CLOSE);
+                handleDestinationOrAction(pageActions.getO(), ActionTrigger.PAGE_OPEN);
+            }
             xhtml.endElement("div");
         } catch (SAXException|TikaException e) {
             throw new IOExceptionWithCause("Unable to end a page", e);
@@ -368,15 +431,93 @@ class AbstractPDF2XHTML extends PDFTextStripper {
         }
     }
 
+    private void handleWidget(PDAnnotationWidget widget) throws TikaException, SAXException, IOException {
+        if (widget == null) {
+            return;
+        }
+        handleDestinationOrAction(widget.getAction(), ActionTrigger.ANNOTATION_WIDGET);
+        PDAnnotationAdditionalActions annotationActions = widget.getActions();
+        if (annotationActions != null) {
+            handleDestinationOrAction(annotationActions.getBl(), ActionTrigger.ANNOTATION_LOSE_INPUT_FOCUS);
+            handleDestinationOrAction(annotationActions.getD(), ActionTrigger.ANNOTATION_MOUSE_CLICK);
+            handleDestinationOrAction(annotationActions.getE(), ActionTrigger.ANNOTATION_CURSOR_ENTERS);
+            handleDestinationOrAction(annotationActions.getFo(), ActionTrigger.ANNOTATION_RECEIVES_FOCUS);
+            handleDestinationOrAction(annotationActions.getPC(), ActionTrigger.ANNOTATION_PAGE_CLOSED);
+            handleDestinationOrAction(annotationActions.getPI(), ActionTrigger.ANNOTATION_PAGE_NO_LONGER_VISIBLE);
+            handleDestinationOrAction(annotationActions.getPO(), ActionTrigger.ANNOTATION_PAGE_OPENED);
+            handleDestinationOrAction(annotationActions.getPV(), ActionTrigger.ANNOTATION_PAGE_VISIBLE);
+            handleDestinationOrAction(annotationActions.getU(), ActionTrigger.ANNOTATION_MOUSE_RELEASED);
+            handleDestinationOrAction(annotationActions.getX(), ActionTrigger.ANNOTATION_CURSOR_EXIT);
+        }
+
+    }
+
     @Override
     protected void startDocument(PDDocument pdf) throws IOException {
         try {
             xhtml.startDocument();
-        } catch (SAXException e) {
+            handleDestinationOrAction(pdf.getDocumentCatalog().getOpenAction(), ActionTrigger.DOCUMENT_OPEN);
+        } catch (TikaException|SAXException e) {
             throw new IOExceptionWithCause("Unable to start a document", e);
         }
     }
 
+    private void handleDestinationOrAction(PDDestinationOrAction action,
+                                           ActionTrigger actionTrigger) throws IOException, SAXException, TikaException {
+        if (action == null || ! config.getExtractActions()) {
+            return;
+        }
+        AttributesImpl attributes = new AttributesImpl();
+        String actionOrDestString = (action instanceof PDAction) ? "action" : "destination";
+
+        addNonNullAttribute("class",  actionOrDestString, attributes);
+        addNonNullAttribute("type", action.getClass().getSimpleName(), attributes);
+        addNonNullAttribute("trigger", actionTrigger.name(), attributes);
+
+        if (action instanceof PDActionImportData) {
+            processDoc("", ((PDActionImportData)action).getFile(), attributes);
+        } else if (action instanceof PDActionLaunch) {
+            PDActionLaunch pdActionLaunch = (PDActionLaunch)action;
+            addNonNullAttribute("id", pdActionLaunch.getF(), attributes);
+            addNonNullAttribute("defaultDirectory", pdActionLaunch.getD(), attributes);
+            addNonNullAttribute("operation", pdActionLaunch.getO(), attributes);
+            addNonNullAttribute("parameters", pdActionLaunch.getP(), attributes);
+            processDoc(pdActionLaunch.getF(), pdActionLaunch.getFile(), attributes);
+        } else if (action instanceof PDActionRemoteGoTo) {
+            PDActionRemoteGoTo remoteGoTo = (PDActionRemoteGoTo)action;
+            processDoc("", remoteGoTo.getFile(), attributes);
+        } else if (action instanceof PDActionJavaScript) {
+            PDActionJavaScript jsAction = (PDActionJavaScript)action;
+            Metadata m = new Metadata();
+            m.set(Metadata.CONTENT_TYPE, "application/javascript");
+            m.set(Metadata.CONTENT_ENCODING, StandardCharsets.UTF_8.toString());
+            m.set(PDF.ACTION_TRIGGER, actionTrigger.toString());
+            m.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, TikaCoreProperties.EmbeddedResourceType.MACRO.name());
+            String js = jsAction.getAction();
+            js = (js == null) ? "" : js;
+            if (embeddedDocumentExtractor.shouldParseEmbedded(m)) {
+                try (InputStream is = TikaInputStream.get(js.getBytes(StandardCharsets.UTF_8))) {
+                    embeddedDocumentExtractor.parseEmbedded(is, xhtml, m, false);
+                }
+            }
+            addNonNullAttribute("class", "javascript", attributes);
+            addNonNullAttribute("type", jsAction.getType(), attributes);
+            addNonNullAttribute("subtype", jsAction.getSubType(), attributes);
+            xhtml.startElement("div", attributes);
+            xhtml.endElement("div");
+        } else {
+            xhtml.startElement("div", attributes);
+            xhtml.endElement("div");
+        }
+    }
+
+    private static void addNonNullAttribute(String name, String value, AttributesImpl attributes) {
+        if (name == null || value == null) {
+            return;
+        }
+        attributes.addAttribute("", name, name, "CDATA", value);
+    }
+
     @Override
     protected void endDocument(PDDocument pdf) throws IOException {
         try {
@@ -396,6 +537,12 @@ class AbstractPDF2XHTML extends PDFTextStripper {
                     handleCatchableIOE(e);
                 }
             }
+            PDDocumentCatalogAdditionalActions additionalActions = pdf.getDocumentCatalog().getActions();
+            handleDestinationOrAction(additionalActions.getDP(), ActionTrigger.AFTER_DOCUMENT_PRINT);
+            handleDestinationOrAction(additionalActions.getDS(), ActionTrigger.AFTER_DOCUMENT_SAVE);
+            handleDestinationOrAction(additionalActions.getWC(), ActionTrigger.BEFORE_DOCUMENT_CLOSE);
+            handleDestinationOrAction(additionalActions.getWP(), ActionTrigger.BEFORE_DOCUMENT_PRINT);
+            handleDestinationOrAction(additionalActions.getWS(), ActionTrigger.BEFORE_DOCUMENT_SAVE);
             xhtml.endDocument();
         } catch (TikaException e) {
             throw new IOExceptionWithCause("Unable to end a document", e);
@@ -404,21 +551,23 @@ class AbstractPDF2XHTML extends PDFTextStripper {
         }
     }
 
-    void extractBookmarkText() throws SAXException {
+    void extractBookmarkText() throws SAXException, IOException, TikaException {
         PDDocumentOutline outline = document.getDocumentCatalog().getDocumentOutline();
         if (outline != null) {
             extractBookmarkText(outline);
         }
     }
 
-    void extractBookmarkText(PDOutlineNode bookmark) throws SAXException {
+    void extractBookmarkText(PDOutlineNode bookmark) throws SAXException, IOException, TikaException {
         PDOutlineItem current = bookmark.getFirstChild();
+
         if (current != null) {
             xhtml.startElement("ul");
             while (current != null) {
                 xhtml.startElement("li");
                 xhtml.characters(current.getTitle());
                 xhtml.endElement("li");
+                handleDestinationOrAction(current.getAction(), ActionTrigger.BOOKMARK);
                 // Recurse:
                 extractBookmarkText(current);
                 current = current.getNextSibling();
@@ -428,7 +577,7 @@ class AbstractPDF2XHTML extends PDFTextStripper {
     }
 
     void extractAcroForm(PDDocument pdf) throws IOException,
-            SAXException {
+            SAXException, TikaException {
         //Thank you, Ben Litchfield, for org.apache.pdfbox.examples.fdf.PrintFields
         //this code derives from Ben's code
         PDDocumentCatalog catalog = pdf.getDocumentCatalog();
@@ -484,11 +633,26 @@ class AbstractPDF2XHTML extends PDFTextStripper {
     }
 
     private void processAcroField(PDField field, final int currentRecursiveDepth)
-            throws SAXException, IOException {
+            throws SAXException, IOException, TikaException {
 
         if (currentRecursiveDepth >= MAX_ACROFORM_RECURSIONS) {
             return;
         }
+
+        PDFormFieldAdditionalActions pdFormFieldAdditionalActions = field.getActions();
+        if (pdFormFieldAdditionalActions != null) {
+            handleDestinationOrAction(pdFormFieldAdditionalActions.getC(), ActionTrigger.FORM_FIELD_RECALCULATE);
+            handleDestinationOrAction(pdFormFieldAdditionalActions.getF(), ActionTrigger.FORM_FIELD_FORMATTED);
+            handleDestinationOrAction(pdFormFieldAdditionalActions.getK(), ActionTrigger.FORM_FIELD_KEYSTROKE);
+            handleDestinationOrAction(pdFormFieldAdditionalActions.getV(), ActionTrigger.FORM_FIELD_VALUE_CHANGE);
+        }
+        if (field.getWidgets() != null) {
+            for (PDAnnotationWidget widget : field.getWidgets()) {
+                handleWidget(widget);
+            }
+        }
+
+
         addFieldString(field);
         if (field instanceof PDNonTerminalField) {
             int r = currentRecursiveDepth + 1;

http://git-wip-us.apache.org/repos/asf/tika/blob/300100fc/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
index cf43864..cf6a373 100644
--- a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
+++ b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
@@ -115,6 +115,9 @@ public class PDFParserConfig implements Serializable {
     //and then throws the first stored exception after the parse has completed.
     private boolean catchIntermediateIOExceptions = true;
 
+    private boolean extractActions = false;
+
+
     public PDFParserConfig() {
         init(this.getClass().getResourceAsStream("PDFParser.properties"));
     }
@@ -178,6 +181,8 @@ public class PDFParserConfig implements Serializable {
                 getBooleanProp(props.getProperty("catchIntermediateIOExceptions"),
                 isCatchIntermediateIOExceptions()));
 
+        setExtractActions(getBooleanProp(props.getProperty("extractActions"), false));
+
         setOCRStrategy(OCR_STRATEGY.parse(props.getProperty("ocrStrategy")));
 
         setOCRDPI(getIntProp(props.getProperty("ocrDPI"), getOCRDPI()));
@@ -551,6 +556,25 @@ public class PDFParserConfig implements Serializable {
         return null;
     }
 
+    /**
+     * Sets whether or not to extract PDActions from the file.
+     * Most Action types are handled inline; JavaScript macros
+     * are processed as embedded documents.
+     *
+     * @param v {@code true} to extract PDActions; the default is {@code false}
+     */
+    public void setExtractActions(boolean v) {
+        extractActions = v;
+    }
+
+    /**
+     * @see #setExtractActions(boolean)
+     * @return whether or not to extract PDActions
+     */
+    public boolean getExtractActions() {
+        return extractActions;
+    }
+
     @Override
     public boolean equals(Object o) {
         if (this == o) return true;
@@ -573,6 +597,8 @@ public class PDFParserConfig implements Serializable {
         if (!getOCRStrategy().equals(config.getOCRStrategy())) return false;
         if (getOCRImageType() != config.getOCRImageType()) return false;
         if (!getOCRImageFormatName().equals(config.getOCRImageFormatName())) return false;
+        if (getExtractActions() != config.getExtractActions()) return false;
+
         return getAccessChecker().equals(config.getAccessChecker());
 
     }
@@ -594,7 +620,8 @@ public class PDFParserConfig implements Serializable {
         result = 31 * result + getOCRImageType().hashCode();
         result = 31 * result + getOCRImageFormatName().hashCode();
         result = 31 * result + getAccessChecker().hashCode();
-        result = 31 * result + (isCatchIntermediateIOExceptions() ? 1 : 0);
+        result = 31 * result + (getCatchIntermediateIOExceptions() ? 1 : 0);
+        result = 31 * result + (getExtractActions() ? 1 : 0);
         return result;
     }
 
@@ -616,7 +643,8 @@ public class PDFParserConfig implements Serializable {
                 ", ocrImageType=" + ocrImageType +
                 ", ocrImageFormatName='" + ocrImageFormatName + '\'' +
                 ", accessChecker=" + accessChecker +
-                ", isCatchIntermediateIOExceptions=" + catchIntermediateIOExceptions +
+                ", extractActions=" + extractActions +
+                ", catchIntermediateIOExceptions=" + catchIntermediateIOExceptions +
                 '}';
     }
 }

http://git-wip-us.apache.org/repos/asf/tika/blob/300100fc/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index 2292157..f76aea7 100644
--- a/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++ b/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -394,7 +394,8 @@ public class PDFParserTest extends TikaTest {
     @Test
     public void testLinks() throws Exception {
         final XMLResult result = getXML("testPDFVarious.pdf");
-        assertContains("<div class=\"annotation\"><a href=\"http://tika.apache.org/\" /></div>", result.xml);
+        assertContains("<div class=\"annotation\"><a href=\"http://tika.apache.org/\">"+
+                "http://tika.apache.org/</a></div>", result.xml);
     }
 
     @Test
@@ -893,13 +894,14 @@ public class PDFParserTest extends TikaTest {
 
         XMLResult r = getXML("testPDF_childAttachments.pdf", context);
         //regular attachment
-        assertContains("<div class=\"embedded\" id=\"Unit10.doc\" />", r.xml);
+        assertContains("<div source=\"attachment\" class=\"embedded\" id=\"Unit10.doc\" />", r.xml);
         //inline image
         assertContains("<img src=\"embedded:image1.tif\" alt=\"image1.tif\" />", r.xml);
 
         //doc embedded inside an annotation
         r = getXML("testPDFFileEmbInAnnotation.pdf");
-        assertContains("<div class=\"embedded\" id=\"Excel.xlsx\" />", r.xml);
+        assertContains("<div source=\"annotation\" class=\"embedded\" id=\"Excel.xlsx\" />", r.xml);
+
     }
 
     //Access checker tests