You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/12/01 00:47:35 UTC
tika git commit: TIKA-2090: Allow extraction of PDActions (including
Javascript) from PDFs (TIKA-2090).
Repository: tika
Updated Branches:
refs/heads/2.x 3d08da79f -> 300100fcb
TIKA-2090: Allow extraction of PDActions (including Javascript) from PDFs (TIKA-2090).
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/300100fc
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/300100fc
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/300100fc
Branch: refs/heads/2.x
Commit: 300100fcb9a39e8997764e0d3b3ecd0d213c7824
Parents: 3d08da7
Author: tballison <ta...@mitre.org>
Authored: Wed Nov 30 19:47:26 2016 -0500
Committer: tballison <ta...@mitre.org>
Committed: Wed Nov 30 19:47:26 2016 -0500
----------------------------------------------------------------------
CHANGES.txt | 3 +
.../main/java/org/apache/tika/metadata/PDF.java | 7 +
.../tika/parser/pdf/AbstractPDF2XHTML.java | 216 ++++++++++++++++---
.../apache/tika/parser/pdf/PDFParserConfig.java | 32 ++-
.../apache/tika/parser/pdf/PDFParserTest.java | 8 +-
5 files changed, 235 insertions(+), 31 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/300100fc/CHANGES.txt
----------------------------------------------------------------------
diff --git a/CHANGES.txt b/CHANGES.txt
index d948af6..daba7df 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -17,6 +17,9 @@ Release 2.0 - ???
Release 1.15 -???
+ * Allow extraction of PDActions (including Javascript) from
+ PDFs (TIKA-2090).
+
* Change default behavior in experimental .docx parser to ignore
deleted text to align with .doc (TIKA-2187).
http://git-wip-us.apache.org/repos/asf/tika/blob/300100fc/tika-core/src/main/java/org/apache/tika/metadata/PDF.java
----------------------------------------------------------------------
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/PDF.java b/tika-core/src/main/java/org/apache/tika/metadata/PDF.java
index 4123e73..c7ea7fc 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/PDF.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/PDF.java
@@ -64,4 +64,11 @@ public interface PDF {
Property PDFAID_PART = Property.internalText(PDFAID_PREFIX+"part");
Property IS_ENCRYPTED = Property.internalBoolean(PDF_PREFIX+"encrypted");
+
+ /**
+ * This specifies where an action or destination would be found/triggered
+ * in the document: on document open, before close, etc.
+ */
+ Property ACTION_TRIGGER = Property.internalText(PDF_PREFIX+"actionTrigger");
+
}
http://git-wip-us.apache.org/repos/asf/tika/blob/300100fc/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
index c175138..8f4374e 100644
--- a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
+++ b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
@@ -25,6 +25,7 @@ import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
+import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.text.SimpleDateFormat;
@@ -43,15 +44,27 @@ import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
import org.apache.pdfbox.pdmodel.PDDocumentNameDictionary;
import org.apache.pdfbox.pdmodel.PDEmbeddedFilesNameTreeNode;
import org.apache.pdfbox.pdmodel.PDPage;
+import org.apache.pdfbox.pdmodel.common.PDDestinationOrAction;
import org.apache.pdfbox.pdmodel.common.PDNameTreeNode;
import org.apache.pdfbox.pdmodel.common.filespecification.PDComplexFileSpecification;
import org.apache.pdfbox.pdmodel.common.filespecification.PDEmbeddedFile;
+import org.apache.pdfbox.pdmodel.common.filespecification.PDFileSpecification;
+import org.apache.pdfbox.pdmodel.common.filespecification.PDSimpleFileSpecification;
import org.apache.pdfbox.pdmodel.interactive.action.PDAction;
+import org.apache.pdfbox.pdmodel.interactive.action.PDActionImportData;
+import org.apache.pdfbox.pdmodel.interactive.action.PDActionJavaScript;
+import org.apache.pdfbox.pdmodel.interactive.action.PDActionLaunch;
+import org.apache.pdfbox.pdmodel.interactive.action.PDActionRemoteGoTo;
import org.apache.pdfbox.pdmodel.interactive.action.PDActionURI;
+import org.apache.pdfbox.pdmodel.interactive.action.PDAnnotationAdditionalActions;
+import org.apache.pdfbox.pdmodel.interactive.action.PDDocumentCatalogAdditionalActions;
+import org.apache.pdfbox.pdmodel.interactive.action.PDFormFieldAdditionalActions;
+import org.apache.pdfbox.pdmodel.interactive.action.PDPageAdditionalActions;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationFileAttachment;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationLink;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationMarkup;
+import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationWidget;
import org.apache.pdfbox.pdmodel.interactive.digitalsignature.PDSignature;
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocumentOutline;
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem;
@@ -70,6 +83,7 @@ import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.io.TemporaryResources;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.PDF;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.ocr.TesseractOCRConfig;
@@ -82,6 +96,33 @@ import org.xml.sax.helpers.AttributesImpl;
class AbstractPDF2XHTML extends PDFTextStripper {
+ enum ActionTrigger {
+ AFTER_DOCUMENT_PRINT,
+ AFTER_DOCUMENT_SAVE,
+ ANNOTATION_CURSOR_ENTERS,
+ ANNOTATION_CURSOR_EXIT,
+ ANNOTATION_LOSE_INPUT_FOCUS,
+ ANNOTATION_MOUSE_CLICK,
+ ANNOTATION_MOUSE_RELEASED,
+ ANNOTATION_PAGE_CLOSED,
+ ANNOTATION_PAGE_NO_LONGER_VISIBLE,
+ ANNOTATION_PAGE_OPENED,
+ ANNOTATION_PAGE_VISIBLE,
+ ANNOTATION_RECEIVES_FOCUS,
+ ANNOTATION_WIDGET,
+ BEFORE_DOCUMENT_CLOSE,
+ BEFORE_DOCUMENT_PRINT,
+ BEFORE_DOCUMENT_SAVE,
+ DOCUMENT_OPEN,
+ FORM_FIELD,
+ FORM_FIELD_FORMATTED,
+ FORM_FIELD_KEYSTROKE,
+ FORM_FIELD_RECALCULATE,
+ FORM_FIELD_VALUE_CHANGE,
+ PAGE_CLOSE,
+ PAGE_OPEN, BOOKMARK,
+ };
+
/**
* Maximum recursive depth during AcroForm processing.
* Prevents theoretical AcroForm recursion bomb.
@@ -157,6 +198,20 @@ class AbstractPDF2XHTML extends PDFTextStripper {
}
}
+    /**
+     * Reports a file specification referenced from the PDF.
+     * <ul>
+     * <li>A {@link PDSimpleFileSpecification} only names a file, so an empty
+     * {@code div} with {@code class="linked"} and the file name as {@code id}
+     * is written.</li>
+     * <li>A {@link PDComplexFileSpecification} is handed to
+     * {@code extractMultiOSPDEmbeddedFiles}; if the caller did not already
+     * supply a {@code source} attribute it defaults to {@code "attachment"}.</li>
+     * <li>Any other value (including {@code null}) is ignored.</li>
+     * </ul>
+     * NOTE(review): {@code class} and {@code id} are appended unconditionally in
+     * the simple case, so a caller that already put those names into
+     * {@code attributes} ends up with two attributes of the same name.
+     *
+     * @param name       display name passed on for complex specifications
+     * @param spec       the file specification to report; may be {@code null}
+     * @param attributes attributes to extend and emit; modified by this call
+     */
+    private void processDoc(String name, PDFileSpecification spec, AttributesImpl attributes) throws TikaException, SAXException, IOException {
+        if (spec instanceof PDSimpleFileSpecification) {
+            // nothing is embedded: just record the link
+            attributes.addAttribute("", "class", "class", "CDATA", "linked");
+            attributes.addAttribute("", "id", "id", "CDATA", spec.getFile());
+            xhtml.startElement("div", attributes);
+            xhtml.endElement("div");
+            return;
+        }
+        if (!(spec instanceof PDComplexFileSpecification)) {
+            return;
+        }
+        boolean sourceAlreadySet = attributes.getIndex("source") >= 0;
+        if (!sourceAlreadySet) {
+            attributes.addAttribute("", "source", "source", "CDATA", "attachment");
+        }
+        PDComplexFileSpecification complexSpec = (PDComplexFileSpecification) spec;
+        extractMultiOSPDEmbeddedFiles(name, complexSpec, attributes);
+    }
+
private void processEmbeddedDocNames(Map<String, PDComplexFileSpecification> embeddedFileNames)
throws IOException, SAXException, TikaException {
if (embeddedFileNames == null || embeddedFileNames.isEmpty()) {
@@ -164,27 +219,30 @@ class AbstractPDF2XHTML extends PDFTextStripper {
}
for (Map.Entry<String, PDComplexFileSpecification> ent : embeddedFileNames.entrySet()) {
- PDComplexFileSpecification spec = ent.getValue();
- extractMultiOSPDEmbeddedFiles(ent.getKey(), spec);
+ processDoc(ent.getKey(), ent.getValue(), new AttributesImpl());
}
}
private void extractMultiOSPDEmbeddedFiles(String displayName,
- PDComplexFileSpecification spec) throws IOException,
+ PDComplexFileSpecification spec, AttributesImpl attributes) throws IOException,
SAXException, TikaException {
if (spec == null) {
return;
}
//current strategy is to pull all, not just first non-null
- extractPDEmbeddedFile(displayName, spec.getFileUnicode(), spec.getFile(), spec.getEmbeddedFile());
- extractPDEmbeddedFile(displayName, spec.getFileUnicode(), spec.getFileMac(), spec.getEmbeddedFileMac());
- extractPDEmbeddedFile(displayName, spec.getFileUnicode(), spec.getFileDos(), spec.getEmbeddedFileDos());
- extractPDEmbeddedFile(displayName, spec.getFileUnicode(), spec.getFileUnix(), spec.getEmbeddedFileUnix());
+ extractPDEmbeddedFile(displayName, spec.getFileUnicode(),
+ spec.getFile(), spec.getEmbeddedFile(), attributes);
+ extractPDEmbeddedFile(displayName, spec.getFileUnicode(),
+ spec.getFileMac(), spec.getEmbeddedFileMac(), attributes);
+ extractPDEmbeddedFile(displayName, spec.getFileUnicode(),
+ spec.getFileDos(), spec.getEmbeddedFileDos(), attributes);
+ extractPDEmbeddedFile(displayName, spec.getFileUnicode(),
+ spec.getFileUnix(), spec.getEmbeddedFileUnix(), attributes);
}
private void extractPDEmbeddedFile(String displayName, String unicodeFileName,
- String fileName, PDEmbeddedFile file)
+ String fileName, PDEmbeddedFile file, AttributesImpl attributes)
throws SAXException, IOException, TikaException {
if (file == null) {
@@ -202,25 +260,15 @@ class AbstractPDF2XHTML extends PDFTextStripper {
embeddedMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.toString());
embeddedMetadata.set(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, fileName);
-
if (embeddedDocumentExtractor.shouldParseEmbedded(embeddedMetadata)) {
TikaInputStream stream = null;
try {
-
- InputStream rawStream = null;
- try {
- rawStream = file.createInputStream();
- } catch (IOException e) {
- EmbeddedDocumentUtil.recordException(e, metadata);
- return;
- }
- stream = TikaInputStream.get(rawStream);
+ stream = TikaInputStream.get(file.createInputStream());
embeddedDocumentExtractor.parseEmbedded(
stream,
new EmbeddedContentHandler(xhtml),
embeddedMetadata, false);
- AttributesImpl attributes = new AttributesImpl();
attributes.addAttribute("", "class", "class", "CDATA", "embedded");
attributes.addAttribute("", "id", "id", "CDATA", fileName);
xhtml.startElement("div", attributes);
@@ -239,6 +287,7 @@ class AbstractPDF2XHTML extends PDFTextStripper {
// WriteOutContentHandler.WriteLimitReachedException?
throw e;
}
+
String msg = e.getMessage();
if (msg == null) {
msg = "IOException, no message";
@@ -295,7 +344,9 @@ class AbstractPDF2XHTML extends PDFTextStripper {
PDAnnotationFileAttachment fann = (PDAnnotationFileAttachment) annotation;
PDComplexFileSpecification fileSpec = (PDComplexFileSpecification) fann.getFile();
try {
- extractMultiOSPDEmbeddedFiles(fann.getAttachmentName(), fileSpec);
+ AttributesImpl attributes = new AttributesImpl();
+ attributes.addAttribute("", "source", "source", "CDATA", "annotation");
+ extractMultiOSPDEmbeddedFiles(fann.getAttachmentName(), fileSpec, attributes);
} catch (SAXException e) {
throw new IOExceptionWithCause("file embedded in annotation sax exception", e);
} catch (TikaException e) {
@@ -303,6 +354,8 @@ class AbstractPDF2XHTML extends PDFTextStripper {
} catch (IOException e) {
handleCatchableIOE(e);
}
+ } else if (annotation instanceof PDAnnotationWidget) {
+ handleWidget((PDAnnotationWidget)annotation);
}
// TODO: remove once PDFBOX-1143 is fixed:
if (config.getExtractAnnotationText()) {
@@ -311,11 +364,15 @@ class AbstractPDF2XHTML extends PDFTextStripper {
if (annotationlink.getAction() != null) {
PDAction action = annotationlink.getAction();
if (action instanceof PDActionURI) {
+ //can't currently associate link to text.
+ //for now, extract link and repeat the link as if it
+ //were the visible text
PDActionURI uri = (PDActionURI) action;
String link = uri.getURI();
- if (link != null) {
+ if (link != null && link.trim().length() > 0) {
xhtml.startElement("div", "class", "annotation");
xhtml.startElement("a", "href", link);
+ xhtml.characters(link);
xhtml.endElement("a");
xhtml.endElement("div");
}
@@ -358,6 +415,12 @@ class AbstractPDF2XHTML extends PDFTextStripper {
if (config.getOCRStrategy().equals(PDFParserConfig.OCR_STRATEGY.OCR_AND_TEXT_EXTRACTION)) {
doOCROnCurrentPage();
}
+
+ PDPageAdditionalActions pageActions = page.getActions();
+ if (pageActions != null) {
+ handleDestinationOrAction(pageActions.getC(), ActionTrigger.PAGE_CLOSE);
+ handleDestinationOrAction(pageActions.getO(), ActionTrigger.PAGE_OPEN);
+ }
xhtml.endElement("div");
} catch (SAXException|TikaException e) {
throw new IOExceptionWithCause("Unable to end a page", e);
@@ -368,15 +431,93 @@ class AbstractPDF2XHTML extends PDFTextStripper {
}
}
+ private void handleWidget(PDAnnotationWidget widget) throws TikaException, SAXException, IOException {
+ if (widget == null) {
+ return;
+ }
+ handleDestinationOrAction(widget.getAction(), ActionTrigger.ANNOTATION_WIDGET);
+ PDAnnotationAdditionalActions annotationActions = widget.getActions();
+ if (annotationActions != null) {
+ handleDestinationOrAction(annotationActions.getBl(), ActionTrigger.ANNOTATION_LOSE_INPUT_FOCUS);
+ handleDestinationOrAction(annotationActions.getD(), ActionTrigger.ANNOTATION_MOUSE_CLICK);
+ handleDestinationOrAction(annotationActions.getE(), ActionTrigger.ANNOTATION_CURSOR_ENTERS);
+ handleDestinationOrAction(annotationActions.getFo(), ActionTrigger.ANNOTATION_RECEIVES_FOCUS);
+ handleDestinationOrAction(annotationActions.getPC(), ActionTrigger.ANNOTATION_PAGE_CLOSED);
+ handleDestinationOrAction(annotationActions.getPI(), ActionTrigger.ANNOTATION_PAGE_NO_LONGER_VISIBLE);
+ handleDestinationOrAction(annotationActions.getPO(), ActionTrigger.ANNOTATION_PAGE_OPENED);
+ handleDestinationOrAction(annotationActions.getPV(), ActionTrigger.ANNOTATION_PAGE_VISIBLE);
+ handleDestinationOrAction(annotationActions.getU(), ActionTrigger.ANNOTATION_MOUSE_RELEASED);
+ handleDestinationOrAction(annotationActions.getX(), ActionTrigger.ANNOTATION_CURSOR_EXIT);
+ }
+
+ }
+
@Override
protected void startDocument(PDDocument pdf) throws IOException {
try {
xhtml.startDocument();
- } catch (SAXException e) {
+ handleDestinationOrAction(pdf.getDocumentCatalog().getOpenAction(), ActionTrigger.DOCUMENT_OPEN);
+ } catch (TikaException|SAXException e) {
throw new IOExceptionWithCause("Unable to start a document", e);
}
}
+    /**
+     * Reports a single action or destination found in the PDF.
+     * <p>
+     * Nothing happens when {@code action} is {@code null} or when
+     * {@code config.getExtractActions()} is {@code false}. Otherwise a set of
+     * attributes is built ({@code class} = "action" or "destination",
+     * {@code type} = simple class name of the action, {@code trigger} = name of
+     * {@code actionTrigger}) and then:
+     * <ul>
+     * <li>import-data and remote-go-to actions pass their file specification
+     * to {@code processDoc};</li>
+     * <li>launch actions additionally record {@code id},
+     * {@code defaultDirectory}, {@code operation} and {@code parameters}
+     * (each only when non-null) before calling {@code processDoc};</li>
+     * <li>JavaScript actions offer the script (UTF-8 bytes, empty if absent)
+     * to the embedded document extractor and then write an empty
+     * {@code div} with {@code class="javascript"};</li>
+     * <li>everything else is written as an empty {@code div}.</li>
+     * </ul>
+     *
+     * @param action        the action or destination; may be {@code null}
+     * @param actionTrigger where in the document the action was found
+     */
+    private void handleDestinationOrAction(PDDestinationOrAction action,
+            ActionTrigger actionTrigger) throws IOException, SAXException, TikaException {
+        if (action == null || ! config.getExtractActions()) {
+            return;
+        }
+        AttributesImpl attributes = new AttributesImpl();
+        String actionOrDestString = (action instanceof PDAction) ? "action" : "destination";
+
+        addNonNullAttribute("class", actionOrDestString, attributes);
+        addNonNullAttribute("type", action.getClass().getSimpleName(), attributes);
+        addNonNullAttribute("trigger", actionTrigger.name(), attributes);
+
+        if (action instanceof PDActionImportData) {
+            processDoc("", ((PDActionImportData)action).getFile(), attributes);
+        } else if (action instanceof PDActionLaunch) {
+            PDActionLaunch pdActionLaunch = (PDActionLaunch)action;
+            addNonNullAttribute("id", pdActionLaunch.getF(), attributes);
+            addNonNullAttribute("defaultDirectory", pdActionLaunch.getD(), attributes);
+            addNonNullAttribute("operation", pdActionLaunch.getO(), attributes);
+            addNonNullAttribute("parameters", pdActionLaunch.getP(), attributes);
+            processDoc(pdActionLaunch.getF(), pdActionLaunch.getFile(), attributes);
+        } else if (action instanceof PDActionRemoteGoTo) {
+            PDActionRemoteGoTo remoteGoTo = (PDActionRemoteGoTo)action;
+            processDoc("", remoteGoTo.getFile(), attributes);
+        } else if (action instanceof PDActionJavaScript) {
+            PDActionJavaScript jsAction = (PDActionJavaScript)action;
+            Metadata m = new Metadata();
+            m.set(Metadata.CONTENT_TYPE, "application/javascript");
+            m.set(Metadata.CONTENT_ENCODING, StandardCharsets.UTF_8.toString());
+            m.set(PDF.ACTION_TRIGGER, actionTrigger.toString());
+            m.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, TikaCoreProperties.EmbeddedResourceType.MACRO.name());
+            String js = jsAction.getAction();
+            js = (js == null) ? "" : js;
+            if (embeddedDocumentExtractor.shouldParseEmbedded(m)) {
+                try (InputStream is = TikaInputStream.get(js.getBytes(StandardCharsets.UTF_8))) {
+                    embeddedDocumentExtractor.parseEmbedded(is, xhtml, m, false);
+                }
+            }
+            // "class" and "type" were already added above. Appending them a
+            // second time would give the div two attributes with the same
+            // name, which is not well-formed XML, so the JavaScript-specific
+            // values replace the generic ones instead.
+            setOrAddAttribute("class", "javascript", attributes);
+            setOrAddAttribute("type", jsAction.getType(), attributes);
+            addNonNullAttribute("subtype", jsAction.getSubType(), attributes);
+            xhtml.startElement("div", attributes);
+            xhtml.endElement("div");
+        } else {
+            xhtml.startElement("div", attributes);
+            xhtml.endElement("div");
+        }
+    }
+
+    /**
+     * Sets {@code name} to {@code value}: overwrites the value if an attribute
+     * with that name is already present, appends a new CDATA attribute
+     * otherwise. Does nothing when {@code name} or {@code value} is {@code null},
+     * so an existing value is kept in that case.
+     */
+    private static void setOrAddAttribute(String name, String value, AttributesImpl attributes) {
+        if (name == null || value == null) {
+            return;
+        }
+        int existing = attributes.getIndex(name);
+        if (existing < 0) {
+            attributes.addAttribute("", name, name, "CDATA", value);
+        } else {
+            attributes.setValue(existing, value);
+        }
+    }
+
+    /**
+     * Appends a CDATA attribute with an empty namespace, using {@code name} as
+     * both local and qualified name. The call is a no-op when either
+     * {@code name} or {@code value} is {@code null}.
+     *
+     * @param name       attribute name
+     * @param value      attribute value
+     * @param attributes the attribute list to append to
+     */
+    private static void addNonNullAttribute(String name, String value, AttributesImpl attributes) {
+        if (name != null && value != null) {
+            attributes.addAttribute("", name, name, "CDATA", value);
+        }
+    }
+
@Override
protected void endDocument(PDDocument pdf) throws IOException {
try {
@@ -396,6 +537,12 @@ class AbstractPDF2XHTML extends PDFTextStripper {
handleCatchableIOE(e);
}
}
+ PDDocumentCatalogAdditionalActions additionalActions = pdf.getDocumentCatalog().getActions();
+ handleDestinationOrAction(additionalActions.getDP(), ActionTrigger.AFTER_DOCUMENT_PRINT);
+ handleDestinationOrAction(additionalActions.getDS(), ActionTrigger.AFTER_DOCUMENT_SAVE);
+ handleDestinationOrAction(additionalActions.getWC(), ActionTrigger.BEFORE_DOCUMENT_CLOSE);
+ handleDestinationOrAction(additionalActions.getWP(), ActionTrigger.BEFORE_DOCUMENT_PRINT);
+ handleDestinationOrAction(additionalActions.getWS(), ActionTrigger.BEFORE_DOCUMENT_SAVE);
xhtml.endDocument();
} catch (TikaException e) {
throw new IOExceptionWithCause("Unable to end a document", e);
@@ -404,21 +551,23 @@ class AbstractPDF2XHTML extends PDFTextStripper {
}
}
- void extractBookmarkText() throws SAXException {
+ void extractBookmarkText() throws SAXException, IOException, TikaException {
PDDocumentOutline outline = document.getDocumentCatalog().getDocumentOutline();
if (outline != null) {
extractBookmarkText(outline);
}
}
- void extractBookmarkText(PDOutlineNode bookmark) throws SAXException {
+ void extractBookmarkText(PDOutlineNode bookmark) throws SAXException, IOException, TikaException {
PDOutlineItem current = bookmark.getFirstChild();
+
if (current != null) {
xhtml.startElement("ul");
while (current != null) {
xhtml.startElement("li");
xhtml.characters(current.getTitle());
xhtml.endElement("li");
+ handleDestinationOrAction(current.getAction(), ActionTrigger.BOOKMARK);
// Recurse:
extractBookmarkText(current);
current = current.getNextSibling();
@@ -428,7 +577,7 @@ class AbstractPDF2XHTML extends PDFTextStripper {
}
void extractAcroForm(PDDocument pdf) throws IOException,
- SAXException {
+ SAXException, TikaException {
//Thank you, Ben Litchfield, for org.apache.pdfbox.examples.fdf.PrintFields
//this code derives from Ben's code
PDDocumentCatalog catalog = pdf.getDocumentCatalog();
@@ -484,11 +633,26 @@ class AbstractPDF2XHTML extends PDFTextStripper {
}
private void processAcroField(PDField field, final int currentRecursiveDepth)
- throws SAXException, IOException {
+ throws SAXException, IOException, TikaException {
if (currentRecursiveDepth >= MAX_ACROFORM_RECURSIONS) {
return;
}
+
+ PDFormFieldAdditionalActions pdFormFieldAdditionalActions = field.getActions();
+ if (pdFormFieldAdditionalActions != null) {
+ handleDestinationOrAction(pdFormFieldAdditionalActions.getC(), ActionTrigger.FORM_FIELD_RECALCULATE);
+ handleDestinationOrAction(pdFormFieldAdditionalActions.getF(), ActionTrigger.FORM_FIELD_FORMATTED);
+ handleDestinationOrAction(pdFormFieldAdditionalActions.getK(), ActionTrigger.FORM_FIELD_KEYSTROKE);
+ handleDestinationOrAction(pdFormFieldAdditionalActions.getV(), ActionTrigger.FORM_FIELD_VALUE_CHANGE);
+ }
+ if (field.getWidgets() != null) {
+ for (PDAnnotationWidget widget : field.getWidgets()) {
+ handleWidget(widget);
+ }
+ }
+
+
addFieldString(field);
if (field instanceof PDNonTerminalField) {
int r = currentRecursiveDepth + 1;
http://git-wip-us.apache.org/repos/asf/tika/blob/300100fc/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
index cf43864..cf6a373 100644
--- a/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
+++ b/tika-parser-modules/tika-parser-multimedia-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
@@ -115,6 +115,9 @@ public class PDFParserConfig implements Serializable {
//and then throws the first stored exception after the parse has completed.
private boolean catchIntermediateIOExceptions = true;
+ private boolean extractActions = false;
+
+
public PDFParserConfig() {
init(this.getClass().getResourceAsStream("PDFParser.properties"));
}
@@ -178,6 +181,8 @@ public class PDFParserConfig implements Serializable {
getBooleanProp(props.getProperty("catchIntermediateIOExceptions"),
isCatchIntermediateIOExceptions()));
+ setExtractActions(getBooleanProp(props.getProperty("extractActions"), false));
+
setOCRStrategy(OCR_STRATEGY.parse(props.getProperty("ocrStrategy")));
setOCRDPI(getIntProp(props.getProperty("ocrDPI"), getOCRDPI()));
@@ -551,6 +556,25 @@ public class PDFParserConfig implements Serializable {
return null;
}
+    /**
+     * Turns extraction of PDActions on or off.
+     * Most action types are reported inline; JavaScript macros are
+     * processed as embedded documents.
+     *
+     * @param v {@code true} to extract PDActions, {@code false} to skip them
+     */
+    public void setExtractActions(boolean v) {
+        this.extractActions = v;
+    }
+
+    /**
+     * Reports the current PDAction extraction setting.
+     *
+     * @see #setExtractActions(boolean)
+     * @return {@code true} if PDActions are extracted, {@code false} otherwise
+     */
+    public boolean getExtractActions() {
+        return this.extractActions;
+    }
+
@Override
public boolean equals(Object o) {
if (this == o) return true;
@@ -573,6 +597,8 @@ public class PDFParserConfig implements Serializable {
if (!getOCRStrategy().equals(config.getOCRStrategy())) return false;
if (getOCRImageType() != config.getOCRImageType()) return false;
if (!getOCRImageFormatName().equals(config.getOCRImageFormatName())) return false;
+ if (getExtractActions() != config.getExtractActions()) return false;
+
return getAccessChecker().equals(config.getAccessChecker());
}
@@ -594,7 +620,8 @@ public class PDFParserConfig implements Serializable {
result = 31 * result + getOCRImageType().hashCode();
result = 31 * result + getOCRImageFormatName().hashCode();
result = 31 * result + getAccessChecker().hashCode();
- result = 31 * result + (isCatchIntermediateIOExceptions() ? 1 : 0);
+ result = 31 * result + (getCatchIntermediateIOExceptions() ? 1 : 0);
+ result = 31 * result + (getExtractActions() ? 1 : 0);
return result;
}
@@ -616,7 +643,8 @@ public class PDFParserConfig implements Serializable {
", ocrImageType=" + ocrImageType +
", ocrImageFormatName='" + ocrImageFormatName + '\'' +
", accessChecker=" + accessChecker +
- ", isCatchIntermediateIOExceptions=" + catchIntermediateIOExceptions +
+ ", extractActions=" + extractActions +
+ ", catchIntermediateIOExceptions=" + catchIntermediateIOExceptions +
'}';
}
}
http://git-wip-us.apache.org/repos/asf/tika/blob/300100fc/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index 2292157..f76aea7 100644
--- a/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++ b/tika-parser-modules/tika-parser-multimedia-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -394,7 +394,8 @@ public class PDFParserTest extends TikaTest {
@Test
public void testLinks() throws Exception {
final XMLResult result = getXML("testPDFVarious.pdf");
- assertContains("<div class=\"annotation\"><a href=\"http://tika.apache.org/\" /></div>", result.xml);
+ assertContains("<div class=\"annotation\"><a href=\"http://tika.apache.org/\">"+
+ "http://tika.apache.org/</a></div>", result.xml);
}
@Test
@@ -893,13 +894,14 @@ public class PDFParserTest extends TikaTest {
XMLResult r = getXML("testPDF_childAttachments.pdf", context);
//regular attachment
- assertContains("<div class=\"embedded\" id=\"Unit10.doc\" />", r.xml);
+ assertContains("<div source=\"attachment\" class=\"embedded\" id=\"Unit10.doc\" />", r.xml);
//inline image
assertContains("<img src=\"embedded:image1.tif\" alt=\"image1.tif\" />", r.xml);
//doc embedded inside an annotation
r = getXML("testPDFFileEmbInAnnotation.pdf");
- assertContains("<div class=\"embedded\" id=\"Excel.xlsx\" />", r.xml);
+ assertContains("<div source=\"annotation\" class=\"embedded\" id=\"Excel.xlsx\" />", r.xml);
+
}
//Access checker tests