You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/12/01 00:38:58 UTC
[4/4] tika git commit: TIKA-2090 -- add ability to extract PDActions from PDF files
TIKA-2090 -- add ability to extract PDActions from PDF files
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/99b59243
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/99b59243
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/99b59243
Branch: refs/heads/master
Commit: 99b59243756d08124497686642d559f31d549543
Parents: 0e0f30d
Author: tballison <ta...@mitre.org>
Authored: Wed Nov 30 19:38:42 2016 -0500
Committer: tballison <ta...@mitre.org>
Committed: Wed Nov 30 19:38:42 2016 -0500
----------------------------------------------------------------------
CHANGES.txt | 3 ++
.../tika/parser/pdf/AbstractPDF2XHTML.java | 34 +++++++++-----------
2 files changed, 18 insertions(+), 19 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/99b59243/CHANGES.txt
----------------------------------------------------------------------
diff --git a/CHANGES.txt b/CHANGES.txt
index 4d2c573..8a97cd3 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,8 @@
Release 1.15 - ??
+ * Allow extraction of PDActions (including Javascript) from
+ PDFs (TIKA-2090).
+
* Change default behavior in experimental .docx parser to ignore
deleted text to align with .doc (TIKA-2187).
http://git-wip-us.apache.org/repos/asf/tika/blob/99b59243/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
index cd22895..0688e00 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
@@ -230,21 +230,19 @@ class AbstractPDF2XHTML extends PDFTextStripper {
if (spec == null) {
return;
}
- EmbeddedDocumentExtractor extractor = getEmbeddedDocumentExtractor();
//current strategy is to pull all, not just first non-null
extractPDEmbeddedFile(displayName, spec.getFileUnicode(),
- spec.getFile(), spec.getEmbeddedFile(), extractor, attributes);
+ spec.getFile(), spec.getEmbeddedFile(), attributes);
extractPDEmbeddedFile(displayName, spec.getFileUnicode(),
- spec.getFileMac(), spec.getEmbeddedFileMac(), extractor, attributes);
+ spec.getFileMac(), spec.getEmbeddedFileMac(), attributes);
extractPDEmbeddedFile(displayName, spec.getFileUnicode(),
- spec.getFileDos(), spec.getEmbeddedFileDos(), extractor, attributes);
+ spec.getFileDos(), spec.getEmbeddedFileDos(), attributes);
extractPDEmbeddedFile(displayName, spec.getFileUnicode(),
- spec.getFileUnix(), spec.getEmbeddedFileUnix(), extractor, attributes);
+ spec.getFileUnix(), spec.getEmbeddedFileUnix(), attributes);
}
private void extractPDEmbeddedFile(String displayName, String unicodeFileName,
- String fileName, PDEmbeddedFile file,
- EmbeddedDocumentExtractor extractor, AttributesImpl attributes)
+ String fileName, PDEmbeddedFile file, AttributesImpl attributes)
throws SAXException, IOException, TikaException {
if (file == null) {
@@ -256,18 +254,18 @@ class AbstractPDF2XHTML extends PDFTextStripper {
fileName = (fileName == null || "".equals(fileName.trim())) ? displayName : fileName;
// TODO: other metadata?
- Metadata metadata = new Metadata();
- metadata.set(Metadata.RESOURCE_NAME_KEY, fileName);
- metadata.set(Metadata.CONTENT_TYPE, file.getSubtype());
- metadata.set(Metadata.CONTENT_LENGTH, Long.toString(file.getSize()));
- metadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
+ Metadata embeddedMetadata = new Metadata();
+ embeddedMetadata.set(Metadata.RESOURCE_NAME_KEY, fileName);
+ embeddedMetadata.set(Metadata.CONTENT_TYPE, file.getSubtype());
+ embeddedMetadata.set(Metadata.CONTENT_LENGTH, Long.toString(file.getSize()));
+ embeddedMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.toString());
- metadata.set(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, fileName);
- if (extractor.shouldParseEmbedded(metadata)) {
+ embeddedMetadata.set(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, fileName);
+ if (embeddedDocumentExtractor.shouldParseEmbedded(embeddedMetadata)) {
TikaInputStream stream = null;
try {
stream = TikaInputStream.get(file.createInputStream());
- extractor.parseEmbedded(
+ embeddedDocumentExtractor.parseEmbedded(
stream,
new EmbeddedContentHandler(xhtml),
embeddedMetadata, false);
@@ -341,7 +339,6 @@ class AbstractPDF2XHTML extends PDFTextStripper {
protected void endPage(PDPage page) throws IOException {
try {
- EmbeddedDocumentExtractor extractor = getEmbeddedDocumentExtractor();
for (PDAnnotation annotation : page.getAnnotations()) {
if (annotation instanceof PDAnnotationFileAttachment) {
@@ -492,7 +489,6 @@ class AbstractPDF2XHTML extends PDFTextStripper {
processDoc("", remoteGoTo.getFile(), attributes);
} else if (action instanceof PDActionJavaScript) {
PDActionJavaScript jsAction = (PDActionJavaScript)action;
- EmbeddedDocumentExtractor ex = getEmbeddedDocumentExtractor();
Metadata m = new Metadata();
m.set(Metadata.CONTENT_TYPE, "application/javascript");
m.set(Metadata.CONTENT_ENCODING, StandardCharsets.UTF_8.toString());
@@ -500,9 +496,9 @@ class AbstractPDF2XHTML extends PDFTextStripper {
m.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, TikaCoreProperties.EmbeddedResourceType.MACRO.name());
String js = jsAction.getAction();
js = (js == null) ? "" : js;
- if (ex.shouldParseEmbedded(m)) {
+ if (embeddedDocumentExtractor.shouldParseEmbedded(m)) {
try (InputStream is = TikaInputStream.get(js.getBytes(StandardCharsets.UTF_8))) {
- ex.parseEmbedded(is, xhtml, m, false);
+ embeddedDocumentExtractor.parseEmbedded(is, xhtml, m, false);
}
}
addNonNullAttribute("class", "javascript", attributes);