You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2012/05/22 15:16:27 UTC

svn commit: r1341463 - in /tika/trunk/tika-parsers/src: main/java/org/apache/tika/parser/pdf/PDFParser.java test/java/org/apache/tika/parser/pdf/PDFParserTest.java test/resources/test-documents/testPDFPackage.pdf

Author: jukka
Date: Tue May 22 13:16:26 2012
New Revision: 1341463

URL: http://svn.apache.org/viewvc?rev=1341463&view=rev
Log:
TIKA-931: Tika's PDFParser fails to parse documents embedded in a PDF Package

Copy changes from PDFBox. Original patch by Michael McCandless.

Added:
    tika/trunk/tika-parsers/src/test/resources/test-documents/testPDFPackage.pdf   (with props)
Modified:
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java?rev=1341463&r1=1341462&r2=1341463&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java Tue May 22 13:16:26 2012
@@ -22,6 +22,7 @@ import java.util.Arrays;
 import java.util.Calendar;
 import java.util.Collections;
 import java.util.List;
+import java.util.Map;
 import java.util.Set;
 
 import org.apache.pdfbox.cos.COSArray;
@@ -31,8 +32,15 @@ import org.apache.pdfbox.cos.COSString;
 import org.apache.pdfbox.io.RandomAccess;
 import org.apache.pdfbox.io.RandomAccessFile;
 import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
 import org.apache.pdfbox.pdmodel.PDDocumentInformation;
+import org.apache.pdfbox.pdmodel.PDDocumentNameDictionary;
+import org.apache.pdfbox.pdmodel.PDEmbeddedFilesNameTreeNode;
+import org.apache.pdfbox.pdmodel.common.filespecification.PDComplexFileSpecification;
+import org.apache.pdfbox.pdmodel.common.filespecification.PDEmbeddedFile;
 import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
 import org.apache.tika.io.CloseShieldInputStream;
 import org.apache.tika.io.TemporaryResources;
 import org.apache.tika.io.TikaInputStream;
@@ -44,6 +52,7 @@ import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.AbstractParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.PasswordProvider;
+import org.apache.tika.sax.EmbeddedContentHandler;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 
@@ -53,7 +62,10 @@ import org.xml.sax.SAXException;
  * This parser can process also encrypted PDF documents if the required
  * password is given as a part of the input metadata associated with a
  * document. If no password is given, then this parser will try decrypting
- * the document using the empty password that's often used with PDFs.
+ * the document using the empty password that's often used with PDFs. If
+ * the PDF contains any embedded documents (for example as part of a PDF
+ * package) then this parser will use the {@link EmbeddedDocumentExtractor}
+ * to handle them.
  */
 public class PDFParser extends AbstractParser {
 
@@ -141,6 +153,8 @@ public class PDFParser extends AbstractP
             PDF2XHTML.process(pdfDocument, handler, metadata,
                               extractAnnotationText, enableAutoSpace,
                               suppressDuplicateOverlappingText, sortByPosition);
+
+            extractEmbeddedDocuments(context, pdfDocument, handler);
         } finally {
             if (pdfDocument != null) {
                pdfDocument.close();
@@ -149,6 +163,46 @@ public class PDFParser extends AbstractP
         }
     }
 
+    private void extractEmbeddedDocuments(ParseContext context, PDDocument document, ContentHandler handler)
+            throws IOException, SAXException, TikaException {
+        PDDocumentCatalog catalog = document.getDocumentCatalog();
+        PDDocumentNameDictionary names = catalog.getNames();
+        if (names != null) {
+
+            PDEmbeddedFilesNameTreeNode embeddedFiles = names.getEmbeddedFiles();
+            if (embeddedFiles != null) {
+
+                EmbeddedDocumentExtractor embeddedExtractor = context.get(EmbeddedDocumentExtractor.class);
+                if (embeddedExtractor == null) {
+                    embeddedExtractor = new ParsingEmbeddedDocumentExtractor(context);
+                }
+
+                for (Map.Entry<String,Object> ent : embeddedFiles.getNames().entrySet()) {
+                    PDComplexFileSpecification spec = (PDComplexFileSpecification) ent.getValue();
+                    PDEmbeddedFile file = spec.getEmbeddedFile();
+
+                    Metadata metadata = new Metadata();
+                    // TODO: other metadata?
+                    metadata.set(Metadata.RESOURCE_NAME_KEY, ent.getKey());
+                    metadata.set(Metadata.CONTENT_TYPE, file.getSubtype());
+                    metadata.set(Metadata.CONTENT_LENGTH, Long.toString(file.getSize()));
+
+                    if (embeddedExtractor.shouldParseEmbedded(metadata)) {
+                        TikaInputStream stream = TikaInputStream.get(file.createInputStream());
+                        try {
+                            embeddedExtractor.parseEmbedded(
+                                    stream,
+                                    new EmbeddedContentHandler(handler),
+                                    metadata, false);
+                        } finally {
+                            stream.close();
+                        }
+                    }
+                }
+            }
+        }
+    }
+
     private void extractMetadata(PDDocument document, Metadata metadata)
             throws TikaException {
         PDDocumentInformation info = document.getDocumentInformation();

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java?rev=1341463&r1=1341462&r2=1341463&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java Tue May 22 13:16:26 2012
@@ -288,6 +288,12 @@ public class PDFParserTest extends TikaT
                      substringCount("</p>", xml));
     }
 
+    public void testEmbeddedPDFs() throws Exception {
+        String xml = getXML("testPDFPackage.pdf").xml;
+        assertContains("PDF1", xml);
+        assertContains("PDF2", xml);
+    }
+
     private static int substringCount(String needle, String haystack) {
         int upto = -1;
         int count = 0;
@@ -441,10 +447,12 @@ public class PDFParserTest extends TikaT
         handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "no");
         handler.setResult(new StreamResult(sw));
 
+        ParseContext context = new ParseContext();
+        context.set(Parser.class, parser);
         // Try with a document containing various tables and formattings
         InputStream input = getResourceAsStream("/test-documents/" + filename);
         try {
-            parser.parse(input, handler, metadata, new ParseContext());
+            parser.parse(input, handler, metadata, context);
             return new XMLResult(sw.toString(), metadata);
         } finally {
             input.close();

Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testPDFPackage.pdf
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testPDFPackage.pdf?rev=1341463&view=auto
==============================================================================
Binary file - no diff available.

Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testPDFPackage.pdf
------------------------------------------------------------------------------
    svn:mime-type = application/pdf