You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/12/01 00:38:56 UTC
[2/4] tika git commit: TIKA-2090 -- add more areas where javascript
might live and add ability to turn action extraction on/off
TIKA-2090 -- add more areas where javascript might live and add ability to turn action extraction on/off
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/4dd6fd11
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/4dd6fd11
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/4dd6fd11
Branch: refs/heads/master
Commit: 4dd6fd11035c09070689471975cc661aafa77333
Parents: 7fbf0f3
Author: tballison <ta...@mitre.org>
Authored: Fri Oct 28 09:32:24 2016 -0400
Committer: tballison <ta...@mitre.org>
Committed: Fri Oct 28 09:32:24 2016 -0400
----------------------------------------------------------------------
.../main/java/org/apache/tika/metadata/PDF.java | 4 +-
.../tika/parser/pdf/AbstractPDF2XHTML.java | 192 +++++++++++--------
.../apache/tika/parser/pdf/PDFParserConfig.java | 27 +++
.../apache/tika/parser/pdf/PDFParserTest.java | 4 +-
4 files changed, 148 insertions(+), 79 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/4dd6fd11/tika-core/src/main/java/org/apache/tika/metadata/PDF.java
----------------------------------------------------------------------
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/PDF.java b/tika-core/src/main/java/org/apache/tika/metadata/PDF.java
index 28432c1..90b1fc0 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/PDF.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/PDF.java
@@ -67,7 +67,7 @@ public interface PDF {
/**
* This specifies where an action or destination would be found/triggered
- * in the document: on document open, close, etc.
+ * in the document: on document open, before close, etc.
*/
- Property ACTION_LOCATION = Property.internalText(PDF_PREFIX+"actionLocation");
+ Property ACTION_TRIGGER = Property.internalText(PDF_PREFIX+"actionTrigger");
}
http://git-wip-us.apache.org/repos/asf/tika/blob/4dd6fd11/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
index b6a7c60..cb7c673 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
@@ -28,7 +28,6 @@ import java.io.OutputStream;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
-import java.nio.file.StandardOpenOption;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Calendar;
@@ -40,10 +39,6 @@ import java.util.TreeMap;
import org.apache.commons.io.IOExceptionWithCause;
import org.apache.commons.io.IOUtils;
-import org.apache.commons.io.output.ByteArrayOutputStream;
-import org.apache.pdfbox.cos.COSBase;
-import org.apache.pdfbox.cos.COSInputStream;
-import org.apache.pdfbox.cos.COSStream;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
import org.apache.pdfbox.pdmodel.PDDocumentNameDictionary;
@@ -61,7 +56,6 @@ import org.apache.pdfbox.pdmodel.interactive.action.PDActionJavaScript;
import org.apache.pdfbox.pdmodel.interactive.action.PDActionLaunch;
import org.apache.pdfbox.pdmodel.interactive.action.PDActionRemoteGoTo;
import org.apache.pdfbox.pdmodel.interactive.action.PDActionURI;
-import org.apache.pdfbox.pdmodel.interactive.action.PDAdditionalActions;
import org.apache.pdfbox.pdmodel.interactive.action.PDAnnotationAdditionalActions;
import org.apache.pdfbox.pdmodel.interactive.action.PDDocumentCatalogAdditionalActions;
import org.apache.pdfbox.pdmodel.interactive.action.PDFormFieldAdditionalActions;
@@ -70,11 +64,8 @@ import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationFileAttachment;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationLink;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationMarkup;
-import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationPopup;
-import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationText;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationWidget;
import org.apache.pdfbox.pdmodel.interactive.digitalsignature.PDSignature;
-import org.apache.pdfbox.pdmodel.interactive.documentnavigation.destination.PDDestination;
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocumentOutline;
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem;
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineNode;
@@ -91,7 +82,6 @@ import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
import org.apache.tika.io.TemporaryResources;
import org.apache.tika.io.TikaInputStream;
-import org.apache.tika.metadata.Message;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.PDF;
import org.apache.tika.metadata.TikaCoreProperties;
@@ -106,13 +96,31 @@ import org.xml.sax.helpers.AttributesImpl;
class AbstractPDF2XHTML extends PDFTextStripper {
- enum ActionLocation {
- DOCUMENT_OPEN,
+ enum ActionTrigger {
AFTER_DOCUMENT_PRINT,
AFTER_DOCUMENT_SAVE,
+ ANNOTATION_CURSOR_ENTERS,
+ ANNOTATION_CURSOR_EXIT,
+ ANNOTATION_LOSE_INPUT_FOCUS,
+ ANNOTATION_MOUSE_CLICK,
+ ANNOTATION_MOUSE_RELEASED,
+ ANNOTATION_PAGE_CLOSED,
+ ANNOTATION_PAGE_NO_LONGER_VISIBLE,
+ ANNOTATION_PAGE_OPENED,
+ ANNOTATION_PAGE_VISIBLE,
+ ANNOTATION_RECEIVES_FOCUS,
+ ANNOTATION_WIDGET,
BEFORE_DOCUMENT_CLOSE,
BEFORE_DOCUMENT_PRINT,
- BEFORE_DOCUMENT_SAVE, FORM_FIELD, FORM_FIELD_RECALCULATE, FORM_FIELD_FORMATTED, FORM_FIELD_KEYSTROKE, FORM_FIELD_VALUE_CHANGE, PAGE_CLOSE, PAGE_OPEN, ANNOTATION_WIDGET, ANNOTATION_LOSE_INPUT_FOCUS, ANNOTATION_MOUSE_CLICK, ANNOTATION_CURSOR_ENTERS, ANNOTATION_RECEIVES_FOCUS, ANNOTATION_PAGE_CLOSED, ANNOTATION_PAGE_NO_LONGER_VISIBLE, ANNOTATION_PAGE_OPENED, ANNOTATION_PAGE_VISIBLE, ANNOTATION_MOUSE_RELEASED, ANNOTATION_CURSOR_EXIT,
+ BEFORE_DOCUMENT_SAVE,
+ DOCUMENT_OPEN,
+ FORM_FIELD,
+ FORM_FIELD_FORMATTED,
+ FORM_FIELD_KEYSTROKE,
+ FORM_FIELD_RECALCULATE,
+ FORM_FIELD_VALUE_CHANGE,
+ PAGE_CLOSE,
+ PAGE_OPEN, BOOKMARK,
};
/**
@@ -197,15 +205,17 @@ class AbstractPDF2XHTML extends PDFTextStripper {
}
}
- private void processDoc(String name, PDFileSpecification spec) throws TikaException, SAXException, IOException {
+ private void processDoc(String name, PDFileSpecification spec, AttributesImpl attributes) throws TikaException, SAXException, IOException {
if (spec instanceof PDSimpleFileSpecification) {
- AttributesImpl attributes = new AttributesImpl();
attributes.addAttribute("", "class", "class", "CDATA", "linked");
attributes.addAttribute("", "id", "id", "CDATA", spec.getFile());
xhtml.startElement("div", attributes);
xhtml.endElement("div");
} else if (spec instanceof PDComplexFileSpecification){
- extractMultiOSPDEmbeddedFiles(name, (PDComplexFileSpecification)spec);
+ if (attributes.getIndex("source") < 0) {
+ attributes.addAttribute("", "source", "source", "CDATA", "attachment");
+ }
+ extractMultiOSPDEmbeddedFiles(name, (PDComplexFileSpecification)spec, attributes);
}
}
@@ -216,12 +226,12 @@ class AbstractPDF2XHTML extends PDFTextStripper {
}
for (Map.Entry<String, PDComplexFileSpecification> ent : embeddedFileNames.entrySet()) {
- processDoc(ent.getKey(), ent.getValue());
+ processDoc(ent.getKey(), ent.getValue(), new AttributesImpl());
}
}
private void extractMultiOSPDEmbeddedFiles(String displayName,
- PDComplexFileSpecification spec) throws IOException,
+ PDComplexFileSpecification spec, AttributesImpl attributes) throws IOException,
SAXException, TikaException {
if (spec == null) {
@@ -229,15 +239,19 @@ class AbstractPDF2XHTML extends PDFTextStripper {
}
EmbeddedDocumentExtractor extractor = getEmbeddedDocumentExtractor();
//current strategy is to pull all, not just first non-null
- extractPDEmbeddedFile(displayName, spec.getFileUnicode(), spec.getFile(), spec.getEmbeddedFile(), extractor);
- extractPDEmbeddedFile(displayName, spec.getFileUnicode(), spec.getFileMac(), spec.getEmbeddedFileMac(), extractor);
- extractPDEmbeddedFile(displayName, spec.getFileUnicode(), spec.getFileDos(), spec.getEmbeddedFileDos(), extractor);
- extractPDEmbeddedFile(displayName, spec.getFileUnicode(), spec.getFileUnix(), spec.getEmbeddedFileUnix(), extractor);
+ extractPDEmbeddedFile(displayName, spec.getFileUnicode(),
+ spec.getFile(), spec.getEmbeddedFile(), extractor, attributes);
+ extractPDEmbeddedFile(displayName, spec.getFileUnicode(),
+ spec.getFileMac(), spec.getEmbeddedFileMac(), extractor, attributes);
+ extractPDEmbeddedFile(displayName, spec.getFileUnicode(),
+ spec.getFileDos(), spec.getEmbeddedFileDos(), extractor, attributes);
+ extractPDEmbeddedFile(displayName, spec.getFileUnicode(),
+ spec.getFileUnix(), spec.getEmbeddedFileUnix(), extractor, attributes);
}
private void extractPDEmbeddedFile(String displayName, String unicodeFileName,
String fileName, PDEmbeddedFile file,
- EmbeddedDocumentExtractor extractor)
+ EmbeddedDocumentExtractor extractor, AttributesImpl attributes)
throws SAXException, IOException, TikaException {
if (file == null) {
@@ -245,7 +259,8 @@ class AbstractPDF2XHTML extends PDFTextStripper {
return;
}
- fileName = (fileName == null) ? displayName : fileName;
+ fileName = (fileName == null || "".equals(fileName.trim())) ? unicodeFileName : fileName;
+ fileName = (fileName == null || "".equals(fileName.trim())) ? displayName : fileName;
// TODO: other metadata?
Metadata metadata = new Metadata();
@@ -255,7 +270,6 @@ class AbstractPDF2XHTML extends PDFTextStripper {
metadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.toString());
metadata.set(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, fileName);
-
if (extractor.shouldParseEmbedded(metadata)) {
TikaInputStream stream = null;
try {
@@ -265,7 +279,6 @@ class AbstractPDF2XHTML extends PDFTextStripper {
new EmbeddedContentHandler(xhtml),
metadata, false);
- AttributesImpl attributes = new AttributesImpl();
attributes.addAttribute("", "class", "class", "CDATA", "embedded");
attributes.addAttribute("", "id", "id", "CDATA", fileName);
xhtml.startElement("div", attributes);
@@ -341,7 +354,9 @@ class AbstractPDF2XHTML extends PDFTextStripper {
PDAnnotationFileAttachment fann = (PDAnnotationFileAttachment) annotation;
PDComplexFileSpecification fileSpec = (PDComplexFileSpecification) fann.getFile();
try {
- extractMultiOSPDEmbeddedFiles(fann.getAttachmentName(), fileSpec);
+ AttributesImpl attributes = new AttributesImpl();
+ attributes.addAttribute("", "source", "source", "CDATA", "annotation");
+ extractMultiOSPDEmbeddedFiles(fann.getAttachmentName(), fileSpec, attributes);
} catch (SAXException e) {
throw new IOExceptionWithCause("file embedded in annotation sax exception", e);
} catch (TikaException e) {
@@ -350,21 +365,7 @@ class AbstractPDF2XHTML extends PDFTextStripper {
handleCatchableIOE(e);
}
} else if (annotation instanceof PDAnnotationWidget) {
- PDAnnotationWidget widget = (PDAnnotationWidget)annotation;
- handleDestinationOrAction(widget.getAction(), ActionLocation.ANNOTATION_WIDGET);
- PDAnnotationAdditionalActions annotationActions = widget.getActions();
- if (annotationActions != null) {
- handleDestinationOrAction(annotationActions.getBl(), ActionLocation.ANNOTATION_LOSE_INPUT_FOCUS);
- handleDestinationOrAction(annotationActions.getD(), ActionLocation.ANNOTATION_MOUSE_CLICK);
- handleDestinationOrAction(annotationActions.getE(), ActionLocation.ANNOTATION_CURSOR_ENTERS);
- handleDestinationOrAction(annotationActions.getFo(), ActionLocation.ANNOTATION_RECEIVES_FOCUS);
- handleDestinationOrAction(annotationActions.getPC(), ActionLocation.ANNOTATION_PAGE_CLOSED);
- handleDestinationOrAction(annotationActions.getPI(), ActionLocation.ANNOTATION_PAGE_NO_LONGER_VISIBLE);
- handleDestinationOrAction(annotationActions.getPO(), ActionLocation.ANNOTATION_PAGE_OPENED);
- handleDestinationOrAction(annotationActions.getPV(), ActionLocation.ANNOTATION_PAGE_VISIBLE);
- handleDestinationOrAction(annotationActions.getU(), ActionLocation.ANNOTATION_MOUSE_RELEASED);
- handleDestinationOrAction(annotationActions.getX(), ActionLocation.ANNOTATION_CURSOR_EXIT);
- }
+ handleWidget((PDAnnotationWidget)annotation);
}
// TODO: remove once PDFBOX-1143 is fixed:
if (config.getExtractAnnotationText()) {
@@ -427,8 +428,8 @@ class AbstractPDF2XHTML extends PDFTextStripper {
PDPageAdditionalActions pageActions = page.getActions();
if (pageActions != null) {
- handleDestinationOrAction(pageActions.getC(), ActionLocation.PAGE_CLOSE);
- handleDestinationOrAction(pageActions.getO(), ActionLocation.PAGE_OPEN);
+ handleDestinationOrAction(pageActions.getC(), ActionTrigger.PAGE_CLOSE);
+ handleDestinationOrAction(pageActions.getO(), ActionTrigger.PAGE_OPEN);
}
xhtml.endElement("div");
} catch (SAXException|TikaException e) {
@@ -440,53 +441,84 @@ class AbstractPDF2XHTML extends PDFTextStripper {
}
}
+ private void handleWidget(PDAnnotationWidget widget) throws TikaException, SAXException, IOException {
+ if (widget == null) {
+ return;
+ }
+ handleDestinationOrAction(widget.getAction(), ActionTrigger.ANNOTATION_WIDGET);
+ PDAnnotationAdditionalActions annotationActions = widget.getActions();
+ if (annotationActions != null) {
+ handleDestinationOrAction(annotationActions.getBl(), ActionTrigger.ANNOTATION_LOSE_INPUT_FOCUS);
+ handleDestinationOrAction(annotationActions.getD(), ActionTrigger.ANNOTATION_MOUSE_CLICK);
+ handleDestinationOrAction(annotationActions.getE(), ActionTrigger.ANNOTATION_CURSOR_ENTERS);
+ handleDestinationOrAction(annotationActions.getFo(), ActionTrigger.ANNOTATION_RECEIVES_FOCUS);
+ handleDestinationOrAction(annotationActions.getPC(), ActionTrigger.ANNOTATION_PAGE_CLOSED);
+ handleDestinationOrAction(annotationActions.getPI(), ActionTrigger.ANNOTATION_PAGE_NO_LONGER_VISIBLE);
+ handleDestinationOrAction(annotationActions.getPO(), ActionTrigger.ANNOTATION_PAGE_OPENED);
+ handleDestinationOrAction(annotationActions.getPV(), ActionTrigger.ANNOTATION_PAGE_VISIBLE);
+ handleDestinationOrAction(annotationActions.getU(), ActionTrigger.ANNOTATION_MOUSE_RELEASED);
+ handleDestinationOrAction(annotationActions.getX(), ActionTrigger.ANNOTATION_CURSOR_EXIT);
+ }
+
+ }
+
@Override
protected void startDocument(PDDocument pdf) throws IOException {
try {
xhtml.startDocument();
- handleDestinationOrAction(pdf.getDocumentCatalog().getOpenAction(), ActionLocation.DOCUMENT_OPEN);
+ handleDestinationOrAction(pdf.getDocumentCatalog().getOpenAction(), ActionTrigger.DOCUMENT_OPEN);
} catch (TikaException|SAXException e) {
throw new IOExceptionWithCause("Unable to start a document", e);
}
}
private void handleDestinationOrAction(PDDestinationOrAction action,
- ActionLocation actionLocation) throws IOException, SAXException, TikaException {
- if (action == null) {
+ ActionTrigger actionTrigger) throws IOException, SAXException, TikaException {
+ if (action == null || ! config.getExtractActions()) {
return;
}
+ AttributesImpl attributes = new AttributesImpl();
+ String actionOrDestString = (action instanceof PDAction) ? "action" : "destination";
+
+ addNonNullAttribute("class", actionOrDestString, attributes);
+ addNonNullAttribute("type", action.getClass().getSimpleName(), attributes);
+ addNonNullAttribute("trigger", actionTrigger.name(), attributes);
if (action instanceof PDActionImportData) {
- processDoc("", ((PDActionImportData)action).getFile());
+ processDoc("", ((PDActionImportData)action).getFile(), attributes);
} else if (action instanceof PDActionLaunch) {
PDActionLaunch pdActionLaunch = (PDActionLaunch)action;
- if (pdActionLaunch.getFile() instanceof PDComplexFileSpecification) {
- processDoc(pdActionLaunch.getF(), pdActionLaunch.getFile());
- } else {
- AttributesImpl attributes = new AttributesImpl();
- addNonNullAttribute("class", "launch", attributes);
- addNonNullAttribute("id", pdActionLaunch.getF(), attributes);
- addNonNullAttribute("defaultDirectory", pdActionLaunch.getD(), attributes);
- addNonNullAttribute("operation", pdActionLaunch.getO(), attributes);
- addNonNullAttribute("parameters", pdActionLaunch.getP(), attributes);
- xhtml.startElement("div", attributes);
- xhtml.endElement("div");
- }
+ addNonNullAttribute("id", pdActionLaunch.getF(), attributes);
+ addNonNullAttribute("defaultDirectory", pdActionLaunch.getD(), attributes);
+ addNonNullAttribute("operation", pdActionLaunch.getO(), attributes);
+ addNonNullAttribute("parameters", pdActionLaunch.getP(), attributes);
+ processDoc(pdActionLaunch.getF(), pdActionLaunch.getFile(), attributes);
} else if (action instanceof PDActionRemoteGoTo) {
PDActionRemoteGoTo remoteGoTo = (PDActionRemoteGoTo)action;
- processDoc("", remoteGoTo.getFile());
+ processDoc("", remoteGoTo.getFile(), attributes);
} else if (action instanceof PDActionJavaScript) {
PDActionJavaScript jsAction = (PDActionJavaScript)action;
EmbeddedDocumentExtractor ex = getEmbeddedDocumentExtractor();
Metadata m = new Metadata();
m.set(Metadata.CONTENT_TYPE, "application/javascript");
m.set(Metadata.CONTENT_ENCODING, StandardCharsets.UTF_8.toString());
- m.set(PDF.ACTION_LOCATION, actionLocation.toString());
+ m.set(PDF.ACTION_TRIGGER, actionTrigger.toString());
+ m.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, TikaCoreProperties.EmbeddedResourceType.MACRO.name());
String js = jsAction.getAction();
js = (js == null) ? "" : js;
- try (InputStream is = new ByteArrayInputStream(js.getBytes(StandardCharsets.UTF_8))) {
- ex.parseEmbedded(is, xhtml, metadata, false);
+ if (ex.shouldParseEmbedded(m)) {
+ try (InputStream is = TikaInputStream.get(js.getBytes(StandardCharsets.UTF_8))) {
+ ex.parseEmbedded(is, xhtml, m, false);
+ }
}
+ addNonNullAttribute("class", "javascript", attributes);
+ addNonNullAttribute("type", jsAction.getType(), attributes);
+ addNonNullAttribute("subtype", jsAction.getSubType(), attributes);
+ xhtml.startElement("div", attributes);
+ xhtml.endElement("div");
+ } else {
+ xhtml.startElement("div", attributes);
+ xhtml.endElement("div");
}
}
@@ -517,11 +549,11 @@ class AbstractPDF2XHTML extends PDFTextStripper {
}
}
PDDocumentCatalogAdditionalActions additionalActions = pdf.getDocumentCatalog().getActions();
- handleDestinationOrAction(additionalActions.getDP(), ActionLocation.AFTER_DOCUMENT_PRINT);
- handleDestinationOrAction(additionalActions.getDS(), ActionLocation.AFTER_DOCUMENT_SAVE);
- handleDestinationOrAction(additionalActions.getWC(), ActionLocation.BEFORE_DOCUMENT_CLOSE);
- handleDestinationOrAction(additionalActions.getWP(), ActionLocation.BEFORE_DOCUMENT_PRINT);
- handleDestinationOrAction(additionalActions.getWS(), ActionLocation.BEFORE_DOCUMENT_SAVE);
+ handleDestinationOrAction(additionalActions.getDP(), ActionTrigger.AFTER_DOCUMENT_PRINT);
+ handleDestinationOrAction(additionalActions.getDS(), ActionTrigger.AFTER_DOCUMENT_SAVE);
+ handleDestinationOrAction(additionalActions.getWC(), ActionTrigger.BEFORE_DOCUMENT_CLOSE);
+ handleDestinationOrAction(additionalActions.getWP(), ActionTrigger.BEFORE_DOCUMENT_PRINT);
+ handleDestinationOrAction(additionalActions.getWS(), ActionTrigger.BEFORE_DOCUMENT_SAVE);
xhtml.endDocument();
} catch (TikaException e) {
throw new IOExceptionWithCause("Unable to end a document", e);
@@ -530,21 +562,23 @@ class AbstractPDF2XHTML extends PDFTextStripper {
}
}
- void extractBookmarkText() throws SAXException {
+ void extractBookmarkText() throws SAXException, IOException, TikaException {
PDDocumentOutline outline = document.getDocumentCatalog().getDocumentOutline();
if (outline != null) {
extractBookmarkText(outline);
}
}
- void extractBookmarkText(PDOutlineNode bookmark) throws SAXException {
+ void extractBookmarkText(PDOutlineNode bookmark) throws SAXException, IOException, TikaException {
PDOutlineItem current = bookmark.getFirstChild();
+
if (current != null) {
xhtml.startElement("ul");
while (current != null) {
xhtml.startElement("li");
xhtml.characters(current.getTitle());
xhtml.endElement("li");
+ handleDestinationOrAction(current.getAction(), ActionTrigger.BOOKMARK);
// Recurse:
extractBookmarkText(current);
current = current.getNextSibling();
@@ -614,13 +648,21 @@ class AbstractPDF2XHTML extends PDFTextStripper {
if (currentRecursiveDepth >= MAX_ACROFORM_RECURSIONS) {
return;
}
+
PDFormFieldAdditionalActions pdFormFieldAdditionalActions = field.getActions();
if (pdFormFieldAdditionalActions != null) {
- handleDestinationOrAction(pdFormFieldAdditionalActions.getC(), ActionLocation.FORM_FIELD_RECALCULATE);
- handleDestinationOrAction(pdFormFieldAdditionalActions.getF(), ActionLocation.FORM_FIELD_FORMATTED);
- handleDestinationOrAction(pdFormFieldAdditionalActions.getK(), ActionLocation.FORM_FIELD_KEYSTROKE);
- handleDestinationOrAction(pdFormFieldAdditionalActions.getV(), ActionLocation.FORM_FIELD_VALUE_CHANGE);
+ handleDestinationOrAction(pdFormFieldAdditionalActions.getC(), ActionTrigger.FORM_FIELD_RECALCULATE);
+ handleDestinationOrAction(pdFormFieldAdditionalActions.getF(), ActionTrigger.FORM_FIELD_FORMATTED);
+ handleDestinationOrAction(pdFormFieldAdditionalActions.getK(), ActionTrigger.FORM_FIELD_KEYSTROKE);
+ handleDestinationOrAction(pdFormFieldAdditionalActions.getV(), ActionTrigger.FORM_FIELD_VALUE_CHANGE);
+ }
+ if (field.getWidgets() != null) {
+ for (PDAnnotationWidget widget : field.getWidgets()) {
+ handleWidget(widget);
+ }
}
+
+
addFieldString(field);
if (field instanceof PDNonTerminalField) {
int r = currentRecursiveDepth + 1;
http://git-wip-us.apache.org/repos/asf/tika/blob/4dd6fd11/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
index 014ae7f..567a4fc 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
@@ -46,6 +46,7 @@ import org.apache.tika.config.Field;
*/
public class PDFParserConfig implements Serializable {
+
public enum OCR_STRATEGY {
NO_OCR,
OCR_ONLY,
@@ -127,6 +128,8 @@ public class PDFParserConfig implements Serializable {
//and then throws the first stored exception after the parse has completed.
private boolean isCatchIntermediateIOExceptions = true;
+ private boolean extractActions = false;
+
public PDFParserConfig() {
init(this.getClass().getResourceAsStream("PDFParser.properties"));
}
@@ -198,6 +201,8 @@ public class PDFParserConfig implements Serializable {
setOcrImageType(parseImageType(props.getProperty("ocrImageType")));
+ setExtractActions(getBooleanProp(props.getProperty("extractActions"), false));
+
boolean checkExtractAccessPermission = getBooleanProp(props.getProperty("checkExtractAccessPermission"), false);
boolean allowExtractionForAccessibility = getBooleanProp(props.getProperty("allowExtractionForAccessibility"), true);
@@ -561,6 +566,25 @@ public class PDFParserConfig implements Serializable {
this.ocrDPI = ocrDPI;
}
+ /**
+ * Whether or not to extract PDActions from the file.
+ * Most Action types are handled inline; javascript macros
+ * are processed as embedded documents.
+ *
+ * @param v true to extract actions; false to skip them
+ */
+ public void setExtractActions(boolean v) {
+ extractActions = v;
+ }
+
+ /**
+ * @see #setExtractActions(boolean)
+ * @return whether or not to extract PDActions
+ */
+ public boolean getExtractActions() {
+ return extractActions;
+ }
+
private ImageType parseImageType(String ocrImageType) {
for (ImageType t : ImageType.values()) {
if (ocrImageType.equalsIgnoreCase(t.toString())) {
@@ -603,6 +627,7 @@ public class PDFParserConfig implements Serializable {
if (!getOcrStrategy().equals(config.getOcrStrategy())) return false;
if (getOcrImageType() != config.getOcrImageType()) return false;
if (!getOcrImageFormatName().equals(config.getOcrImageFormatName())) return false;
+ if (getExtractActions() != config.getExtractActions()) return false;
return getAccessChecker().equals(config.getAccessChecker());
}
@@ -625,6 +650,7 @@ public class PDFParserConfig implements Serializable {
result = 31 * result + getOcrImageFormatName().hashCode();
result = 31 * result + getAccessChecker().hashCode();
result = 31 * result + (isCatchIntermediateIOExceptions() ? 1 : 0);
+ result = 31 * result + (getExtractActions() ? 1 : 0);
return result;
}
@@ -647,6 +673,7 @@ public class PDFParserConfig implements Serializable {
", ocrImageFormatName='" + ocrImageFormatName + '\'' +
", accessChecker=" + accessChecker +
", isCatchIntermediateIOExceptions=" + isCatchIntermediateIOExceptions +
+ ", extractActions=" + extractActions +
'}';
}
}
http://git-wip-us.apache.org/repos/asf/tika/blob/4dd6fd11/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index 184ccb2..5fa7e4d 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -895,13 +895,13 @@ public class PDFParserTest extends TikaTest {
XMLResult r = getXML("testPDF_childAttachments.pdf", context);
//regular attachment
- assertContains("<div class=\"embedded\" id=\"Unit10.doc\" />", r.xml);
+ assertContains("<div source=\"attachment\" class=\"embedded\" id=\"Unit10.doc\" />", r.xml);
//inline image
assertContains("<img src=\"embedded:image1.tif\" alt=\"image1.tif\" />", r.xml);
//doc embedded inside an annotation
r = getXML("testPDFFileEmbInAnnotation.pdf");
- assertContains("<div class=\"embedded\" id=\"Excel.xlsx\" />", r.xml);
+ assertContains("<div source=\"annotation\" class=\"embedded\" id=\"Excel.xlsx\" />", r.xml);
}
//Access checker tests