You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/12/01 00:38:58 UTC

[4/4] tika git commit: TIKA-2090 -- add ability to extract PDActions from PDF files

TIKA-2090 -- add ability to extract PDActions from PDF files


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/99b59243
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/99b59243
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/99b59243

Branch: refs/heads/master
Commit: 99b59243756d08124497686642d559f31d549543
Parents: 0e0f30d
Author: tballison <ta...@mitre.org>
Authored: Wed Nov 30 19:38:42 2016 -0500
Committer: tballison <ta...@mitre.org>
Committed: Wed Nov 30 19:38:42 2016 -0500

----------------------------------------------------------------------
 CHANGES.txt                                     |  3 ++
 .../tika/parser/pdf/AbstractPDF2XHTML.java      | 34 +++++++++-----------
 2 files changed, 18 insertions(+), 19 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/99b59243/CHANGES.txt
----------------------------------------------------------------------
diff --git a/CHANGES.txt b/CHANGES.txt
index 4d2c573..8a97cd3 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,8 @@
 Release 1.15 - ??
 
+  * Allow extraction of PDActions (including JavaScript) from
+    PDFs (TIKA-2090).
+
   * Change default behavior in experimental .docx parser to ignore
     deleted text to align with .doc (TIKA-2187).
 

http://git-wip-us.apache.org/repos/asf/tika/blob/99b59243/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
index cd22895..0688e00 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
@@ -230,21 +230,19 @@ class AbstractPDF2XHTML extends PDFTextStripper {
         if (spec == null) {
             return;
         }
-        EmbeddedDocumentExtractor extractor = getEmbeddedDocumentExtractor();
         //current strategy is to pull all, not just first non-null
         extractPDEmbeddedFile(displayName, spec.getFileUnicode(),
-                spec.getFile(), spec.getEmbeddedFile(), extractor, attributes);
+                spec.getFile(), spec.getEmbeddedFile(), attributes);
         extractPDEmbeddedFile(displayName, spec.getFileUnicode(),
-                spec.getFileMac(), spec.getEmbeddedFileMac(), extractor, attributes);
+                spec.getFileMac(), spec.getEmbeddedFileMac(), attributes);
         extractPDEmbeddedFile(displayName, spec.getFileUnicode(),
-                spec.getFileDos(), spec.getEmbeddedFileDos(), extractor, attributes);
+                spec.getFileDos(), spec.getEmbeddedFileDos(), attributes);
         extractPDEmbeddedFile(displayName, spec.getFileUnicode(),
-                spec.getFileUnix(), spec.getEmbeddedFileUnix(), extractor, attributes);
+                spec.getFileUnix(), spec.getEmbeddedFileUnix(), attributes);
     }
 
     private void extractPDEmbeddedFile(String displayName, String unicodeFileName,
-                                       String fileName, PDEmbeddedFile file,
-                                       EmbeddedDocumentExtractor extractor, AttributesImpl attributes)
+                                       String fileName, PDEmbeddedFile file, AttributesImpl attributes)
             throws SAXException, IOException, TikaException {
 
         if (file == null) {
@@ -256,18 +254,18 @@ class AbstractPDF2XHTML extends PDFTextStripper {
         fileName = (fileName == null || "".equals(fileName.trim())) ? displayName : fileName;
 
         // TODO: other metadata?
-        Metadata metadata = new Metadata();
-        metadata.set(Metadata.RESOURCE_NAME_KEY, fileName);
-        metadata.set(Metadata.CONTENT_TYPE, file.getSubtype());
-        metadata.set(Metadata.CONTENT_LENGTH, Long.toString(file.getSize()));
-        metadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
+        Metadata embeddedMetadata = new Metadata();
+        embeddedMetadata.set(Metadata.RESOURCE_NAME_KEY, fileName);
+        embeddedMetadata.set(Metadata.CONTENT_TYPE, file.getSubtype());
+        embeddedMetadata.set(Metadata.CONTENT_LENGTH, Long.toString(file.getSize()));
+        embeddedMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
                 TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.toString());
-        metadata.set(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, fileName);
-        if (extractor.shouldParseEmbedded(metadata)) {
+        embeddedMetadata.set(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, fileName);
+        if (embeddedDocumentExtractor.shouldParseEmbedded(embeddedMetadata)) {
             TikaInputStream stream = null;
             try {
                 stream = TikaInputStream.get(file.createInputStream());
-                extractor.parseEmbedded(
+                embeddedDocumentExtractor.parseEmbedded(
                         stream,
                         new EmbeddedContentHandler(xhtml),
                         embeddedMetadata, false);
@@ -341,7 +339,6 @@ class AbstractPDF2XHTML extends PDFTextStripper {
     protected void endPage(PDPage page) throws IOException {
 
         try {
-            EmbeddedDocumentExtractor extractor = getEmbeddedDocumentExtractor();
             for (PDAnnotation annotation : page.getAnnotations()) {
 
                 if (annotation instanceof PDAnnotationFileAttachment) {
@@ -492,7 +489,6 @@ class AbstractPDF2XHTML extends PDFTextStripper {
             processDoc("", remoteGoTo.getFile(), attributes);
         } else if (action instanceof PDActionJavaScript) {
             PDActionJavaScript jsAction = (PDActionJavaScript)action;
-            EmbeddedDocumentExtractor ex = getEmbeddedDocumentExtractor();
             Metadata m = new Metadata();
             m.set(Metadata.CONTENT_TYPE, "application/javascript");
             m.set(Metadata.CONTENT_ENCODING, StandardCharsets.UTF_8.toString());
@@ -500,9 +496,9 @@ class AbstractPDF2XHTML extends PDFTextStripper {
             m.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, TikaCoreProperties.EmbeddedResourceType.MACRO.name());
             String js = jsAction.getAction();
             js = (js == null) ? "" : js;
-            if (ex.shouldParseEmbedded(m)) {
+            if (embeddedDocumentExtractor.shouldParseEmbedded(m)) {
                 try (InputStream is = TikaInputStream.get(js.getBytes(StandardCharsets.UTF_8))) {
-                    ex.parseEmbedded(is, xhtml, m, false);
+                    embeddedDocumentExtractor.parseEmbedded(is, xhtml, m, false);
                 }
             }
             addNonNullAttribute("class", "javascript", attributes);