You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2013/08/08 19:55:28 UTC
svn commit: r1511901 - in /tika/trunk: ./
tika-parsers/src/main/java/org/apache/tika/parser/pdf/
tika-parsers/src/test/java/org/apache/tika/parser/pdf/
tika-parsers/src/test/resources/test-documents/
Author: tallison
Date: Thu Aug 8 17:55:27 2013
New Revision: 1511901
URL: http://svn.apache.org/r1511901
Log:
TIKA-1124, process attachments within an embedded PDF
Added:
tika/trunk/tika-parsers/src/test/resources/test-documents/TIKA-1142.docx (with props)
Modified:
tika/trunk/CHANGES.txt
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
Modified: tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1511901&r1=1511900&r2=1511901&view=diff
==============================================================================
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Thu Aug 8 17:55:27 2013
@@ -2,6 +2,9 @@ Release 1.5 - Current Development
* Added sanitized test HTML file for local file test (Tika-1139).
+ * Fixed bug that prevented attachments within a PDF from being processed
+ if the PDF itself was an attachment (TIKA-1124).
+
Release 1.4 - 06/15/2013
* Removed a test HTML file with a poorly chosen GPL text in it (TIKA-1129).
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java?rev=1511901&r1=1511900&r2=1511901&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java Thu Aug 8 17:55:27 2013
@@ -18,9 +18,16 @@ package org.apache.tika.parser.pdf;
import java.io.IOException;
import java.io.Writer;
+import java.util.Map;
import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
+import org.apache.pdfbox.pdmodel.PDDocumentNameDictionary;
+import org.apache.pdfbox.pdmodel.PDEmbeddedFilesNameTreeNode;
import org.apache.pdfbox.pdmodel.PDPage;
+import org.apache.pdfbox.pdmodel.common.COSObjectable;
+import org.apache.pdfbox.pdmodel.common.filespecification.PDComplexFileSpecification;
+import org.apache.pdfbox.pdmodel.common.filespecification.PDEmbeddedFile;
import org.apache.pdfbox.pdmodel.interactive.action.type.PDAction;
import org.apache.pdfbox.pdmodel.interactive.action.type.PDActionURI;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationLink;
@@ -31,8 +38,13 @@ import org.apache.pdfbox.pdmodel.interac
import org.apache.pdfbox.util.PDFTextStripper;
import org.apache.pdfbox.util.TextPosition;
import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
import org.apache.tika.io.IOExceptionWithCause;
+import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.EmbeddedContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
@@ -58,7 +70,7 @@ class PDF2XHTML extends PDFTextStripper
* @throws TikaException if the PDF document can not be processed
*/
public static void process(
- PDDocument document, ContentHandler handler, Metadata metadata,
+ PDDocument document, ContentHandler handler, ParseContext context, Metadata metadata,
boolean extractAnnotationText, boolean enableAutoSpace,
boolean suppressDuplicateOverlappingText, boolean sortByPosition)
throws SAXException, TikaException {
@@ -66,7 +78,7 @@ class PDF2XHTML extends PDFTextStripper
// Extract text using a dummy Writer as we override the
// key methods to output to the given content
// handler.
- PDF2XHTML pdf2XHTML = new PDF2XHTML(handler, metadata,
+ PDF2XHTML pdf2XHTML = new PDF2XHTML(handler, context, metadata,
extractAnnotationText, enableAutoSpace,
suppressDuplicateOverlappingText, sortByPosition);
pdf2XHTML.writeText(document, new Writer() {
@@ -89,14 +101,18 @@ class PDF2XHTML extends PDFTextStripper
}
}
}
-
+
+ private final ContentHandler originalHandler;
+ private final ParseContext context;
private final XHTMLContentHandler handler;
private final boolean extractAnnotationText;
- private PDF2XHTML(ContentHandler handler, Metadata metadata,
+ private PDF2XHTML(ContentHandler handler, ParseContext context, Metadata metadata,
boolean extractAnnotationText, boolean enableAutoSpace,
boolean suppressDuplicateOverlappingText, boolean sortByPosition)
throws IOException {
+ this.originalHandler = handler;
+ this.context = context;
this.handler = new XHTMLContentHandler(handler, metadata);
this.extractAnnotationText = extractAnnotationText;
setForceParsing(true);
@@ -149,7 +165,10 @@ class PDF2XHTML extends PDFTextStripper
try {
// Extract text for any bookmarks:
extractBookmarkText();
+ extractEmbeddedDocuments(pdf, originalHandler);
handler.endDocument();
+ } catch (TikaException e){
+ throw new IOExceptionWithCause("Unable to end a document", e);
} catch (SAXException e) {
throw new IOExceptionWithCause("Unable to end a document", e);
}
@@ -298,4 +317,48 @@ class PDF2XHTML extends PDFTextStripper
"Unable to write a newline character", e);
}
}
+
+    /**
+     * Passes each file in the PDF catalog's EmbeddedFiles name tree to the
+     * EmbeddedDocumentExtractor from the ParseContext, or to a new
+     * ParsingEmbeddedDocumentExtractor when none is registered.
+     * @param document PDF whose attachments are processed
+     * @param handler handler the embedded documents are reported to
+     */
+    private void extractEmbeddedDocuments(PDDocument document, ContentHandler handler)
+            throws IOException, SAXException, TikaException {
+        PDDocumentNameDictionary names = document.getDocumentCatalog().getNames();
+        PDEmbeddedFilesNameTreeNode embeddedFiles = (names == null) ? null : names.getEmbeddedFiles();
+        if (embeddedFiles == null) {
+            return;
+        }
+        EmbeddedDocumentExtractor embeddedExtractor = context.get(EmbeddedDocumentExtractor.class);
+        if (embeddedExtractor == null) {
+            embeddedExtractor = new ParsingEmbeddedDocumentExtractor(context);
+        }
+        Map<String, COSObjectable> embeddedFileNames = embeddedFiles.getNames();
+        if (embeddedFileNames == null) {
+            return;
+        }
+        for (Map.Entry<String,COSObjectable> ent : embeddedFileNames.entrySet()) {
+            PDComplexFileSpecification spec = (PDComplexFileSpecification) ent.getValue();
+            PDEmbeddedFile file = spec.getEmbeddedFile();
+            if (file == null) {
+                continue; // file specification without an embedded stream: nothing to parse
+            }
+            Metadata metadata = new Metadata();
+            // TODO: other metadata?
+            metadata.set(Metadata.RESOURCE_NAME_KEY, ent.getKey());
+            metadata.set(Metadata.CONTENT_TYPE, file.getSubtype());
+            metadata.set(Metadata.CONTENT_LENGTH, Long.toString(file.getSize()));
+            if (embeddedExtractor.shouldParseEmbedded(metadata)) {
+                TikaInputStream stream = TikaInputStream.get(file.createInputStream());
+                try {
+                    embeddedExtractor.parseEmbedded(stream, new EmbeddedContentHandler(handler), metadata, false);
+                } finally {
+                    stream.close();
+                }
+            }
+        }
+    }
}
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java?rev=1511901&r1=1511900&r2=1511901&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParser.java Thu Aug 8 17:55:27 2013
@@ -22,7 +22,6 @@ import java.util.Arrays;
import java.util.Calendar;
import java.util.Collections;
import java.util.List;
-import java.util.Map;
import java.util.Set;
import org.apache.pdfbox.cos.COSArray;
@@ -32,16 +31,9 @@ import org.apache.pdfbox.cos.COSString;
import org.apache.pdfbox.io.RandomAccess;
import org.apache.pdfbox.io.RandomAccessFile;
import org.apache.pdfbox.pdmodel.PDDocument;
-import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
import org.apache.pdfbox.pdmodel.PDDocumentInformation;
-import org.apache.pdfbox.pdmodel.PDDocumentNameDictionary;
-import org.apache.pdfbox.pdmodel.PDEmbeddedFilesNameTreeNode;
-import org.apache.pdfbox.pdmodel.common.COSObjectable;
-import org.apache.pdfbox.pdmodel.common.filespecification.PDComplexFileSpecification;
-import org.apache.pdfbox.pdmodel.common.filespecification.PDEmbeddedFile;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
-import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
import org.apache.tika.io.CloseShieldInputStream;
import org.apache.tika.io.TemporaryResources;
import org.apache.tika.io.TikaInputStream;
@@ -53,7 +45,6 @@ import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.PasswordProvider;
-import org.apache.tika.sax.EmbeddedContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
@@ -151,62 +142,20 @@ public class PDFParser extends AbstractP
}
metadata.set(Metadata.CONTENT_TYPE, "application/pdf");
extractMetadata(pdfDocument, metadata);
- PDF2XHTML.process(pdfDocument, handler, metadata,
+ PDF2XHTML.process(pdfDocument, handler, context, metadata,
extractAnnotationText, enableAutoSpace,
suppressDuplicateOverlappingText, sortByPosition);
-
- extractEmbeddedDocuments(context, pdfDocument, handler);
+
} finally {
if (pdfDocument != null) {
pdfDocument.close();
}
tmp.dispose();
}
+ handler.endDocument();
}
- private void extractEmbeddedDocuments(ParseContext context, PDDocument document, ContentHandler handler)
- throws IOException, SAXException, TikaException {
- PDDocumentCatalog catalog = document.getDocumentCatalog();
- PDDocumentNameDictionary names = catalog.getNames();
- if (names != null) {
-
- PDEmbeddedFilesNameTreeNode embeddedFiles = names.getEmbeddedFiles();
- if (embeddedFiles != null) {
-
- EmbeddedDocumentExtractor embeddedExtractor = context.get(EmbeddedDocumentExtractor.class);
- if (embeddedExtractor == null) {
- embeddedExtractor = new ParsingEmbeddedDocumentExtractor(context);
- }
-
- Map<String, COSObjectable> embeddedFileNames = embeddedFiles.getNames();
-
- if (embeddedFileNames != null) {
- for (Map.Entry<String,COSObjectable> ent : embeddedFileNames.entrySet()) {
- PDComplexFileSpecification spec = (PDComplexFileSpecification) ent.getValue();
- PDEmbeddedFile file = spec.getEmbeddedFile();
-
- Metadata metadata = new Metadata();
- // TODO: other metadata?
- metadata.set(Metadata.RESOURCE_NAME_KEY, ent.getKey());
- metadata.set(Metadata.CONTENT_TYPE, file.getSubtype());
- metadata.set(Metadata.CONTENT_LENGTH, Long.toString(file.getSize()));
-
- if (embeddedExtractor.shouldParseEmbedded(metadata)) {
- TikaInputStream stream = TikaInputStream.get(file.createInputStream());
- try {
- embeddedExtractor.parseEmbedded(
- stream,
- new EmbeddedContentHandler(handler),
- metadata, false);
- } finally {
- stream.close();
- }
- }
- }
- }
- }
- }
- }
+
private void extractMetadata(PDDocument document, Metadata metadata)
throws TikaException {
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java?rev=1511901&r1=1511900&r2=1511901&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java Thu Aug 8 17:55:27 2013
@@ -19,13 +19,18 @@ package org.apache.tika.parser.pdf;
import java.io.InputStream;
import org.apache.tika.TikaTest;
+import org.apache.tika.extractor.ContainerExtractor;
+import org.apache.tika.extractor.ParserContainerExtractor;
+import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.OfficeOpenXMLCore;
import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.PasswordProvider;
+import org.apache.tika.parser.microsoft.AbstractPOIContainerExtractionTest.TrackingHandler;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.ContentHandler;
/**
@@ -33,6 +38,11 @@ import org.xml.sax.ContentHandler;
*/
public class PDFParserTest extends TikaTest {
+ public static final MediaType TYPE_EMF = MediaType.application("x-emf");
+ public static final MediaType TYPE_PDF = MediaType.application("pdf");
+ public static final MediaType TYPE_DOCX = MediaType.application("vnd.openxmlformats-officedocument.wordprocessingml.document");
+
+
public void testPdfParsing() throws Exception {
Parser parser = new AutoDetectParser(); // Should auto-detect!
ContentHandler handler = new BodyContentHandler();
@@ -455,4 +465,55 @@ public class PDFParserTest extends TikaT
assertTrue(j != -1);
assertTrue(i < j);
}
+
+    /**
+     * TIKA-1124: attachments inside a PDF must still be extracted when the
+     * PDF is itself embedded in another document. Test document layout:
+     * docx / pdf / docx.
+     */
+    public void testEmbeddedPDFEmbeddingAnotherDocument() throws Exception {
+        String testDoc = "/test-documents/testPDFEmbeddingAndEmbedded.docx";
+        Parser parser = new AutoDetectParser(); // Should auto-detect!
+        ContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+        ParseContext context = new ParseContext();
+        context.set(org.apache.tika.parser.Parser.class, parser);
+        // Opened outside try: a failed lookup must not turn into an NPE in finally.
+        InputStream stream = getResourceAsStream(testDoc);
+        try {
+            parser.parse(stream, handler, metadata, context);
+        } finally {
+            stream.close();
+        }
+        String content = handler.toString();
+        int outerHaystack = content.indexOf("Outer_haystack");
+        int pdfHaystack = content.indexOf("pdf_haystack");
+        int needle = content.indexOf("Needle");
+        assertTrue(outerHaystack > -1);
+        assertTrue(pdfHaystack > -1);
+        assertTrue(needle > -1);
+        assertTrue(needle > pdfHaystack && pdfHaystack > outerHaystack);
+
+        //plagiarized from POIContainerExtractionTest. Thank you!
+        TrackingHandler tracker = new TrackingHandler();
+        ContainerExtractor ex = new ParserContainerExtractor();
+        TikaInputStream tis = TikaInputStream.get(getResourceAsStream(testDoc));
+        try {
+            // Checked here, while tis is open; it used to run after the try/finally.
+            assertEquals(true, ex.isSupported(tis));
+            ex.extract(tis, ex, tracker);
+        } finally {
+            // Close tis itself; 'stream' was already closed above.
+            tis.close();
+        }
+        assertEquals(3, tracker.filenames.size());
+        assertEquals(3, tracker.mediaTypes.size());
+        assertEquals("image1.emf", tracker.filenames.get(0));
+        assertNull(tracker.filenames.get(1));
+        assertEquals("My first attachment", tracker.filenames.get(2));
+        assertEquals(TYPE_EMF, tracker.mediaTypes.get(0));
+        assertEquals(TYPE_PDF, tracker.mediaTypes.get(1));
+        assertEquals(TYPE_DOCX, tracker.mediaTypes.get(2));
+    }
+
}
Added: tika/trunk/tika-parsers/src/test/resources/test-documents/TIKA-1142.docx
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/TIKA-1142.docx?rev=1511901&view=auto
==============================================================================
Binary file - no diff available.
Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/TIKA-1142.docx
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream