You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2014/02/03 21:11:11 UTC
svn commit: r1564042 - in /tika/trunk/tika-parsers/src:
main/java/org/apache/tika/parser/pdf/ test/java/org/apache/tika/
test/java/org/apache/tika/parser/pdf/ test/resources/test-documents/
Author: tallison
Date: Mon Feb 3 20:11:10 2014
New Revision: 1564042
URL: http://svn.apache.org/r1564042
Log:
TIKA-1228: Look for attachments under Kids node if embeddedFiles.getNames() returns null
Added:
tika/trunk/tika-parsers/src/test/resources/test-documents/testPDF_childAttachments.pdf
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/TikaTest.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java?rev=1564042&r1=1564041&r2=1564042&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java Mon Feb 3 20:11:10 2014
@@ -32,6 +32,7 @@ import org.apache.pdfbox.pdmodel.PDDocum
import org.apache.pdfbox.pdmodel.PDEmbeddedFilesNameTreeNode;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.COSObjectable;
+import org.apache.pdfbox.pdmodel.common.PDNameTreeNode;
import org.apache.pdfbox.pdmodel.common.filespecification.PDComplexFileSpecification;
import org.apache.pdfbox.pdmodel.common.filespecification.PDEmbeddedFile;
import org.apache.pdfbox.pdmodel.interactive.action.type.PDAction;
@@ -346,48 +347,73 @@ class PDF2XHTML extends PDFTextStripper
}
private void extractEmbeddedDocuments(PDDocument document, ContentHandler handler)
- throws IOException, SAXException, TikaException {
- PDDocumentCatalog catalog = document.getDocumentCatalog();
- PDDocumentNameDictionary names = catalog.getNames();
- if (names != null) {
-
- PDEmbeddedFilesNameTreeNode embeddedFiles = names.getEmbeddedFiles();
- if (embeddedFiles != null) {
-
- EmbeddedDocumentExtractor embeddedExtractor = context.get(EmbeddedDocumentExtractor.class);
- if (embeddedExtractor == null) {
- embeddedExtractor = new ParsingEmbeddedDocumentExtractor(context);
- }
-
- Map<String, COSObjectable> embeddedFileNames = embeddedFiles.getNames();
-
- if (embeddedFileNames != null) {
- for (Map.Entry<String,COSObjectable> ent : embeddedFileNames.entrySet()) {
- PDComplexFileSpecification spec = (PDComplexFileSpecification) ent.getValue();
- PDEmbeddedFile file = spec.getEmbeddedFile();
-
- Metadata metadata = new Metadata();
- // TODO: other metadata?
- metadata.set(Metadata.RESOURCE_NAME_KEY, ent.getKey());
- metadata.set(Metadata.CONTENT_TYPE, file.getSubtype());
- metadata.set(Metadata.CONTENT_LENGTH, Long.toString(file.getSize()));
-
- if (embeddedExtractor.shouldParseEmbedded(metadata)) {
- TikaInputStream stream = TikaInputStream.get(file.createInputStream());
- try {
- embeddedExtractor.parseEmbedded(
- stream,
- new EmbeddedContentHandler(handler),
- metadata, false);
- } finally {
- stream.close();
- }
- }
- }
- }
- }
- }
- }
+ throws IOException, SAXException, TikaException {
+ PDDocumentCatalog catalog = document.getDocumentCatalog();
+ PDDocumentNameDictionary names = catalog.getNames();
+ if (names == null){
+ return;
+ }
+ PDEmbeddedFilesNameTreeNode embeddedFiles = names.getEmbeddedFiles();
+
+ if (embeddedFiles == null) {
+ return;
+ }
+
+ EmbeddedDocumentExtractor embeddedExtractor = context.get(EmbeddedDocumentExtractor.class);
+ if (embeddedExtractor == null) {
+ embeddedExtractor = new ParsingEmbeddedDocumentExtractor(context);
+ }
+
+ Map<String, COSObjectable> embeddedFileNames = embeddedFiles.getNames();
+ //For now, try to get the embeddedFileNames out of embeddedFiles or its kids.
+ //This code follows: pdfbox/examples/pdmodel/ExtractEmbeddedFiles.java
+ //If there is a need we could add a fully recursive search to find a non-null
+ //Map<String, COSObjectable> that contains the doc info.
+ if (embeddedFileNames != null){
+ processEmbeddedDocNames(embeddedFileNames, embeddedExtractor);
+ } else {
+ List<PDNameTreeNode> kids = embeddedFiles.getKids();
+ if (kids == null){
+ return;
+ }
+ for (PDNameTreeNode n : kids){
+ Map<String, COSObjectable> childNames = n.getNames();
+ if (childNames != null){
+ processEmbeddedDocNames(childNames, embeddedExtractor);
+ }
+ }
+ }
+ }
+
+
+ private void processEmbeddedDocNames(Map<String, COSObjectable> embeddedFileNames,
+ EmbeddedDocumentExtractor embeddedExtractor) throws IOException, SAXException, TikaException {
+ if (embeddedFileNames == null){
+ return;
+ }
+ for (Map.Entry<String,COSObjectable> ent : embeddedFileNames.entrySet()) {
+ PDComplexFileSpecification spec = (PDComplexFileSpecification) ent.getValue();
+ PDEmbeddedFile file = spec.getEmbeddedFile();
+
+ Metadata metadata = new Metadata();
+ // TODO: other metadata?
+ metadata.set(Metadata.RESOURCE_NAME_KEY, ent.getKey());
+ metadata.set(Metadata.CONTENT_TYPE, file.getSubtype());
+ metadata.set(Metadata.CONTENT_LENGTH, Long.toString(file.getSize()));
+
+ if (embeddedExtractor.shouldParseEmbedded(metadata)) {
+ TikaInputStream stream = TikaInputStream.get(file.createInputStream());
+ try {
+ embeddedExtractor.parseEmbedded(
+ stream,
+ new EmbeddedContentHandler(handler),
+ metadata, false);
+ } finally {
+ stream.close();
+ }
+ }
+ }
+ }
private void extractAcroForm(PDDocument pdf, XHTMLContentHandler handler) throws IOException,
SAXException {
//Thank you, Ben Litchfield, for org.apache.pdfbox.examples.fdf.PrintFields
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/TikaTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/TikaTest.java?rev=1564042&r1=1564041&r2=1564042&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/TikaTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/TikaTest.java Mon Feb 3 20:11:10 2014
@@ -107,7 +107,7 @@ public abstract class TikaTest {
* Tries to close input stream after processing.
*/
public String getText(InputStream is, Parser parser, ParseContext context, Metadata metadata) throws Exception{
- ContentHandler handler = new BodyContentHandler();
+ ContentHandler handler = new BodyContentHandler(1000000);
try {
parser.parse(is, handler, metadata, context);
} finally {
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java?rev=1564042&r1=1564041&r2=1564042&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java Mon Feb 3 20:11:10 2014
@@ -48,10 +48,11 @@ import org.xml.sax.ContentHandler;
*/
public class PDFParserTest extends TikaTest {
+ public static final MediaType TYPE_TEXT = MediaType.TEXT_PLAIN;
public static final MediaType TYPE_EMF = MediaType.application("x-emf");
public static final MediaType TYPE_PDF = MediaType.application("pdf");
public static final MediaType TYPE_DOCX = MediaType.application("vnd.openxmlformats-officedocument.wordprocessingml.document");
-
+ public static final MediaType TYPE_DOC = MediaType.application("msword");
@Test
public void testPdfParsing() throws Exception {
@@ -564,7 +565,7 @@ public class PDFParserTest extends TikaT
//make sure nothing went wrong with getting the resource to test-documents
//This will require modification with each new pdf test.
//If this is too annoying, we can turn it off.
- assertEquals("Number of pdf files tested", 15, pdfs);
+ assertEquals("Number of pdf files tested", 16, pdfs);
}
@@ -625,4 +626,30 @@ public class PDFParserTest extends TikaT
String xml = getXML("/testPDF_acroform3.pdf").xml;
assertTrue("found", (xml.indexOf("<li>aTextField: TIKA-1226</li>") > -1));
}
+
+ //TIKA-1228
+ public void testEmbeddedFilesInChildren() throws Exception {
+ String xml = getXML("/testPDF_childAttachments.pdf").xml;
+ //"regressiveness" exists only in Unit10.doc not in the container pdf document
+ assertTrue(xml.contains("regressiveness"));
+
+ TrackingHandler tracker = new TrackingHandler();
+ TikaInputStream tis = null;
+ ContainerExtractor ex = new ParserContainerExtractor();
+ try{
+ tis= TikaInputStream.get(
+ getResourceAsStream("/test-documents/testPDF_childAttachments.pdf"));
+ ex.extract(tis, ex, tracker);
+ } finally {
+ if (tis != null){
+ tis.close();
+ }
+ }
+ assertEquals(2, tracker.filenames.size());
+ assertEquals(2, tracker.mediaTypes.size());
+ assertEquals("Press Quality(1).joboptions", tracker.filenames.get(0));
+ assertEquals("Unit10.doc", tracker.filenames.get(1));
+ assertEquals(TYPE_TEXT, tracker.mediaTypes.get(0));
+ assertEquals(TYPE_DOC, tracker.mediaTypes.get(1));
+ }
}
Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testPDF_childAttachments.pdf
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testPDF_childAttachments.pdf?rev=1564042&view=auto
==============================================================================
Files tika/trunk/tika-parsers/src/test/resources/test-documents/testPDF_childAttachments.pdf (added) and tika/trunk/tika-parsers/src/test/resources/test-documents/testPDF_childAttachments.pdf Mon Feb 3 20:11:10 2014 differ