You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2014/07/25 21:10:40 UTC
svn commit: r1613501 - in /tika/trunk/tika-parsers/src:
main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
test/java/org/apache/tika/TikaTest.java
test/java/org/apache/tika/parser/pdf/PDFParserTest.java
Author: tallison
Date: Fri Jul 25 19:10:40 2014
New Revision: 1613501
URL: http://svn.apache.org/r1613501
Log:
TIKA-1374: Try to extract OS-specific embedded files within PDFs
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/TikaTest.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java?rev=1613501&r1=1613500&r2=1613501&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java Fri Jul 25 19:10:40 2014
@@ -62,6 +62,7 @@ import org.apache.tika.exception.TikaExc
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
import org.apache.tika.io.IOExceptionWithCause;
+import org.apache.tika.io.IOUtils;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
@@ -482,27 +483,45 @@ class PDF2XHTML extends PDFTextStripper
//skip silently
continue;
}
- Metadata metadata = new Metadata();
- String actualFileName = spec.getFile();
- actualFileName = (actualFileName == null) ? ent.getKey() : actualFileName;
-
- // TODO: other metadata?
- metadata.set(Metadata.RESOURCE_NAME_KEY, actualFileName);
- metadata.set(Metadata.CONTENT_TYPE, file.getSubtype());
- metadata.set(Metadata.CONTENT_LENGTH, Long.toString(file.getSize()));
- metadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
- TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.toString());
-
- if (extractor.shouldParseEmbedded(metadata)) {
- TikaInputStream stream = TikaInputStream.get(file.createInputStream());
- try {
- extractor.parseEmbedded(
- stream,
- new EmbeddedContentHandler(handler),
- metadata, false);
- } finally {
- stream.close();
- }
+
+ //current strategy is to pull all, not just first non-null
+ extractPDEmbeddedFile(ent.getKey(), spec.getFile(), spec.getEmbeddedFile(), extractor);
+ extractPDEmbeddedFile(ent.getKey(), spec.getFileMac(), spec.getEmbeddedFileMac(), extractor);
+ extractPDEmbeddedFile(ent.getKey(), spec.getFileDos(), spec.getEmbeddedFileDos(), extractor);
+ extractPDEmbeddedFile(ent.getKey(), spec.getFileUnix(), spec.getEmbeddedFileUnix(), extractor);
+
+ }
+ }
+
+ private void extractPDEmbeddedFile(String defaultName, String fileName, PDEmbeddedFile file,
+ EmbeddedDocumentExtractor extractor)
+ throws SAXException, IOException, TikaException{
+
+ if (file == null) {
+ //skip silently
+ return;
+ }
+
+ fileName = (fileName == null) ? defaultName : fileName;
+
+ // TODO: other metadata?
+ Metadata metadata = new Metadata();
+ metadata.set(Metadata.RESOURCE_NAME_KEY, fileName);
+ metadata.set(Metadata.CONTENT_TYPE, file.getSubtype());
+ metadata.set(Metadata.CONTENT_LENGTH, Long.toString(file.getSize()));
+ metadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
+ TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.toString());
+
+ if (extractor.shouldParseEmbedded(metadata)) {
+ TikaInputStream stream = null;
+ try{
+ stream = TikaInputStream.get(file.createInputStream());
+ extractor.parseEmbedded(
+ stream,
+ new EmbeddedContentHandler(handler),
+ metadata, false);
+ } finally {
+ IOUtils.closeQuietly(stream);
}
}
}
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/TikaTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/TikaTest.java?rev=1613501&r1=1613500&r2=1613501&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/TikaTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/TikaTest.java Fri Jul 25 19:10:40 2014
@@ -44,6 +44,7 @@ import org.apache.tika.sax.BodyContentHa
import org.apache.tika.sax.ToXMLContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
/**
* Parent class of Tika tests
@@ -199,7 +200,8 @@ public abstract class TikaTest {
* Stores metadata and (optionally) content.
* Many thanks to Jukka's example:
* http://wiki.apache.org/tika/RecursiveMetadata
- *
+ * This ignores the incoming handler and applies a new BodyContentHandler(-1)
+ * (or a DefaultHandler when content is not stored) for each file.
*/
public static class RecursiveMetadataParser extends ParserDecorator {
/** Key for content string if stored */
@@ -218,10 +220,16 @@ public abstract class TikaTest {
@Override
public void parse(
- InputStream stream, ContentHandler contentHandler,
+ InputStream stream, ContentHandler ignoredHandler,
Metadata metadata, ParseContext context)
throws IOException, SAXException, TikaException {
+ ContentHandler contentHandler = null;
+ if (storeContent) {
+ contentHandler = new BodyContentHandler(-1);
+ } else {
+ contentHandler = new DefaultHandler();
+ }
super.parse(stream, contentHandler, metadata, context);
if (storeContent) {
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java?rev=1613501&r1=1613500&r2=1613501&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java Fri Jul 25 19:10:40 2014
@@ -964,7 +964,7 @@ public class PDFParserTest extends TikaT
assertEquals(2, attach);
}
- @Test
+ @Test //TIKA-1376
public void testEmbeddedFileNameExtraction() throws Exception {
InputStream is = PDFParserTest.class.getResourceAsStream(
"/test-documents/testPDF_multiFormatEmbFiles.pdf");
@@ -976,11 +976,36 @@ public class PDFParserTest extends TikaT
p.parse(is, h, m, c);
is.close();
List<Metadata> metadatas = p.getAllMetadata();
- assertEquals("metadata size", 2, metadatas.size());
+ assertEquals("metadata size", 5, metadatas.size());
Metadata firstAttachment = metadatas.get(0);
assertEquals("attachment file name", "Test.txt", firstAttachment.get(Metadata.RESOURCE_NAME_KEY));
}
+ @Test //TIKA-1374
+ public void testOSSpecificEmbeddedFileExtraction() throws Exception {
+ InputStream is = PDFParserTest.class.getResourceAsStream(
+ "/test-documents/testPDF_multiFormatEmbFiles.pdf");
+ RecursiveMetadataParser p = new RecursiveMetadataParser(new AutoDetectParser(), true);
+ Metadata m = new Metadata();
+ ParseContext c = new ParseContext();
+ c.set(org.apache.tika.parser.Parser.class, p);
+ ContentHandler h = new BodyContentHandler();
+ p.parse(is, h, m, c);
+ is.close();
+ List<Metadata> metadatas = p.getAllMetadata();
+ assertEquals("metadata size", 5, metadatas.size());
+
+ assertEquals("file name", "Test.txt", metadatas.get(0).get(Metadata.RESOURCE_NAME_KEY));
+ assertContains("os specific", metadatas.get(0).get(RecursiveMetadataParser.TIKA_CONTENT));
+ assertEquals("file name", "TestMac.txt", metadatas.get(1).get(Metadata.RESOURCE_NAME_KEY));
+ assertContains("mac embedded", metadatas.get(1).get(RecursiveMetadataParser.TIKA_CONTENT));
+ assertEquals("file name", "TestDos.txt", metadatas.get(2).get(Metadata.RESOURCE_NAME_KEY));
+ assertContains("dos embedded", metadatas.get(2).get(RecursiveMetadataParser.TIKA_CONTENT));
+ assertEquals("file name", "TestUnix.txt", metadatas.get(3).get(Metadata.RESOURCE_NAME_KEY));
+ assertContains("unix embedded", metadatas.get(3).get(RecursiveMetadataParser.TIKA_CONTENT));
+
+ }
+
/**
*
* Simple class to count end of document events. If functionality is useful,