You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2012/05/22 15:16:27 UTC
svn commit: r1341463 - in /tika/trunk/tika-parsers/src:
main/java/org/apache/tika/parser/pdf/PDFParser.java
test/java/org/apache/tika/parser/pdf/PDFParserTest.java
test/resources/test-documents/testPDFPackage.pdf
Author: jukka
Date: Tue May 22 13:16:26 2012
New Revision: 1341463
URL: http://svn.apache.org/viewvc?rev=1341463&view=rev
Log:
TIKA-931: Tika's PDFParser fails to parse documents embedded in a PDF Package
Copy changes from PDFBox. Original patch by Michael McCandless.
Added:
tika/trunk/tika-parsers/src/test/resources/test-documents/testPDFPackage.pdf (with props)
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java?rev=1341463&r1=1341462&r2=1341463&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java Tue May 22 13:16:26 2012
@@ -22,6 +22,7 @@ import java.util.Arrays;
import java.util.Calendar;
import java.util.Collections;
import java.util.List;
+import java.util.Map;
import java.util.Set;
import org.apache.pdfbox.cos.COSArray;
@@ -31,8 +32,15 @@ import org.apache.pdfbox.cos.COSString;
import org.apache.pdfbox.io.RandomAccess;
import org.apache.pdfbox.io.RandomAccessFile;
import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
import org.apache.pdfbox.pdmodel.PDDocumentInformation;
+import org.apache.pdfbox.pdmodel.PDDocumentNameDictionary;
+import org.apache.pdfbox.pdmodel.PDEmbeddedFilesNameTreeNode;
+import org.apache.pdfbox.pdmodel.common.filespecification.PDComplexFileSpecification;
+import org.apache.pdfbox.pdmodel.common.filespecification.PDEmbeddedFile;
import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
import org.apache.tika.io.CloseShieldInputStream;
import org.apache.tika.io.TemporaryResources;
import org.apache.tika.io.TikaInputStream;
@@ -44,6 +52,7 @@ import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.PasswordProvider;
+import org.apache.tika.sax.EmbeddedContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
@@ -53,7 +62,10 @@ import org.xml.sax.SAXException;
* This parser can also process encrypted PDF documents if the required
* password is given as a part of the input metadata associated with a
* document. If no password is given, then this parser will try decrypting
- * the document using the empty password that's often used with PDFs.
+ * the document using the empty password that's often used with PDFs. If
+ * the PDF contains any embedded documents (for example as part of a PDF
+ * package) then this parser will use the {@link EmbeddedDocumentExtractor}
+ * to handle them.
*/
public class PDFParser extends AbstractParser {
@@ -141,6 +153,8 @@ public class PDFParser extends AbstractP
PDF2XHTML.process(pdfDocument, handler, metadata,
extractAnnotationText, enableAutoSpace,
suppressDuplicateOverlappingText, sortByPosition);
+
+ extractEmbeddedDocuments(context, pdfDocument, handler);
} finally {
if (pdfDocument != null) {
pdfDocument.close();
@@ -149,6 +163,46 @@ public class PDFParser extends AbstractP
}
}
+    /**
+     * Passes the files embedded in the given PDF document (for example the
+     * members of a PDF package) to an {@link EmbeddedDocumentExtractor}.
+     * <p>
+     * The extractor is looked up from the parse context; if none is set, a
+     * {@link ParsingEmbeddedDocumentExtractor} is created. Only the names
+     * stored directly on the embedded files name tree node returned by the
+     * document catalog are visited. Entries without an embedded file stream
+     * are skipped.
+     *
+     * @param context parse context used to look up the extractor
+     * @param document PDF document whose embedded files are processed
+     * @param handler content handler that receives the embedded content
+     * @throws IOException propagated from reading the PDF name tree or an
+     *         embedded file stream
+     * @throws SAXException propagated from the embedded document extractor
+     * @throws TikaException declared for failures while handling an
+     *         embedded document
+     */
+    private void extractEmbeddedDocuments(ParseContext context, PDDocument document, ContentHandler handler)
+            throws IOException, SAXException, TikaException {
+        PDDocumentCatalog catalog = document.getDocumentCatalog();
+        PDDocumentNameDictionary names = catalog.getNames();
+        if (names == null) {
+            return;
+        }
+
+        PDEmbeddedFilesNameTreeNode embeddedFiles = names.getEmbeddedFiles();
+        if (embeddedFiles == null) {
+            return;
+        }
+
+        // PDFBox may return null here when the node carries no Names array
+        // (e.g. the entries are only reachable through Kids); without this
+        // guard the whole parse would fail with a NullPointerException.
+        Map<String,Object> embeddedFileNames = embeddedFiles.getNames();
+        if (embeddedFileNames == null) {
+            return;
+        }
+
+        EmbeddedDocumentExtractor embeddedExtractor = context.get(EmbeddedDocumentExtractor.class);
+        if (embeddedExtractor == null) {
+            embeddedExtractor = new ParsingEmbeddedDocumentExtractor(context);
+        }
+
+        for (Map.Entry<String,Object> ent : embeddedFileNames.entrySet()) {
+            PDComplexFileSpecification spec = (PDComplexFileSpecification) ent.getValue();
+            PDEmbeddedFile file = spec.getEmbeddedFile();
+            if (file == null) {
+                // The file specification has no embedded stream, so there
+                // is nothing to hand to the extractor.
+                continue;
+            }
+
+            Metadata metadata = new Metadata();
+            // TODO: other metadata?
+            metadata.set(Metadata.RESOURCE_NAME_KEY, ent.getKey());
+            metadata.set(Metadata.CONTENT_TYPE, file.getSubtype());
+            metadata.set(Metadata.CONTENT_LENGTH, Long.toString(file.getSize()));
+
+            if (embeddedExtractor.shouldParseEmbedded(metadata)) {
+                TikaInputStream stream = TikaInputStream.get(file.createInputStream());
+                try {
+                    embeddedExtractor.parseEmbedded(
+                            stream,
+                            new EmbeddedContentHandler(handler),
+                            metadata, false);
+                } finally {
+                    // Always release the stream, even if the embedded parse fails.
+                    stream.close();
+                }
+            }
+        }
+    }
+
private void extractMetadata(PDDocument document, Metadata metadata)
throws TikaException {
PDDocumentInformation info = document.getDocumentInformation();
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java?rev=1341463&r1=1341462&r2=1341463&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java Tue May 22 13:16:26 2012
@@ -288,6 +288,12 @@ public class PDFParserTest extends TikaT
substringCount("</p>", xml));
}
+    /**
+     * Parses the testPDFPackage.pdf test document and checks that the
+     * resulting XML contains the strings "PDF1" and "PDF2" (presumably the
+     * text of the two PDFs embedded in the package).
+     */
+    public void testEmbeddedPDFs() throws Exception {
+        final String output = getXML("testPDFPackage.pdf").xml;
+        for (String expected : new String[] { "PDF1", "PDF2" }) {
+            assertContains(expected, output);
+        }
+    }
+
private static int substringCount(String needle, String haystack) {
int upto = -1;
int count = 0;
@@ -441,10 +447,12 @@ public class PDFParserTest extends TikaT
handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "no");
handler.setResult(new StreamResult(sw));
+ ParseContext context = new ParseContext();
+ context.set(Parser.class, parser);
// Try with a document containing various tables and formattings
InputStream input = getResourceAsStream("/test-documents/" + filename);
try {
- parser.parse(input, handler, metadata, new ParseContext());
+ parser.parse(input, handler, metadata, context);
return new XMLResult(sw.toString(), metadata);
} finally {
input.close();
Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testPDFPackage.pdf
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testPDFPackage.pdf?rev=1341463&view=auto
==============================================================================
Binary file - no diff available.
Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testPDFPackage.pdf
------------------------------------------------------------------------------
svn:mime-type = application/pdf