You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2014/07/25 17:01:59 UTC
svn commit: r1613444 - in /tika/trunk/tika-parsers/src:
main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
test/java/org/apache/tika/parser/pdf/PDFParserTest.java
test/resources/test-documents/testPDF_multiFormatEmbFiles.pdf
Author: tallison
Date: Fri Jul 25 15:01:58 2014
New Revision: 1613444
URL: http://svn.apache.org/r1613444
Log:
TIKA-1376: improve embedded file name extraction in PDFParser
Added:
tika/trunk/tika-parsers/src/test/resources/test-documents/testPDF_multiFormatEmbFiles.pdf (with props)
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java?rev=1613444&r1=1613443&r2=1613444&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java Fri Jul 25 15:01:58 2014
@@ -483,8 +483,11 @@ class PDF2XHTML extends PDFTextStripper
continue;
}
Metadata metadata = new Metadata();
+ String actualFileName = spec.getFile();
+ actualFileName = (actualFileName == null) ? ent.getKey() : actualFileName;
+
// TODO: other metadata?
- metadata.set(Metadata.RESOURCE_NAME_KEY, ent.getKey());
+ metadata.set(Metadata.RESOURCE_NAME_KEY, actualFileName);
metadata.set(Metadata.CONTENT_TYPE, file.getSubtype());
metadata.set(Metadata.CONTENT_LENGTH, Long.toString(file.getSize()));
metadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java?rev=1613444&r1=1613443&r2=1613444&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java Fri Jul 25 15:01:58 2014
@@ -528,7 +528,7 @@ public class PDFParserTest extends TikaT
assertEquals(3, tracker.mediaTypes.size());
assertEquals("image1.emf", tracker.filenames.get(0));
assertNull(tracker.filenames.get(1));
- assertEquals("My first attachment", tracker.filenames.get(2));
+ assertEquals("Test.docx", tracker.filenames.get(2));
assertEquals(TYPE_EMF, tracker.mediaTypes.get(0));
assertEquals(TYPE_PDF, tracker.mediaTypes.get(1));
assertEquals(TYPE_DOCX, tracker.mediaTypes.get(2));
@@ -964,6 +964,22 @@ public class PDFParserTest extends TikaT
assertEquals(2, attach);
}
+ @Test
+ public void testEmbeddedFileNameExtraction() throws Exception {
+ InputStream is = PDFParserTest.class.getResourceAsStream(
+ "/test-documents/testPDF_multiFormatEmbFiles.pdf");
+ RecursiveMetadataParser p = new RecursiveMetadataParser(new AutoDetectParser(), false);
+ Metadata m = new Metadata();
+ ParseContext c = new ParseContext();
+ c.set(org.apache.tika.parser.Parser.class, p);
+ ContentHandler h = new BodyContentHandler();
+ p.parse(is, h, m, c);
+ is.close();
+ List<Metadata> metadatas = p.getAllMetadata();
+ assertEquals("metadata size", 2, metadatas.size());
+ Metadata firstAttachment = metadatas.get(0);
+ assertEquals("attachment file name", "Test.txt", firstAttachment.get(Metadata.RESOURCE_NAME_KEY));
+ }
/**
*
Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testPDF_multiFormatEmbFiles.pdf
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testPDF_multiFormatEmbFiles.pdf?rev=1613444&view=auto
==============================================================================
Binary file - no diff available.
Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testPDF_multiFormatEmbFiles.pdf
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream