You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2014/07/25 17:01:59 UTC

svn commit: r1613444 - in /tika/trunk/tika-parsers/src: main/java/org/apache/tika/parser/pdf/PDF2XHTML.java test/java/org/apache/tika/parser/pdf/PDFParserTest.java test/resources/test-documents/testPDF_multiFormatEmbFiles.pdf

Author: tallison
Date: Fri Jul 25 15:01:58 2014
New Revision: 1613444

URL: http://svn.apache.org/r1613444
Log:
TIKA-1376: improve embedded file name extraction in PDFParser

Added:
    tika/trunk/tika-parsers/src/test/resources/test-documents/testPDF_multiFormatEmbFiles.pdf   (with props)
Modified:
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java?rev=1613444&r1=1613443&r2=1613444&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java Fri Jul 25 15:01:58 2014
@@ -483,8 +483,11 @@ class PDF2XHTML extends PDFTextStripper 
                 continue;
             }
             Metadata metadata = new Metadata();
+            String actualFileName = spec.getFile();
+            actualFileName = (actualFileName == null) ? ent.getKey() : actualFileName;
+
             // TODO: other metadata?
-            metadata.set(Metadata.RESOURCE_NAME_KEY, ent.getKey());
+            metadata.set(Metadata.RESOURCE_NAME_KEY, actualFileName);
             metadata.set(Metadata.CONTENT_TYPE, file.getSubtype());
             metadata.set(Metadata.CONTENT_LENGTH, Long.toString(file.getSize()));
             metadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, 

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java?rev=1613444&r1=1613443&r2=1613444&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java Fri Jul 25 15:01:58 2014
@@ -528,7 +528,7 @@ public class PDFParserTest extends TikaT
        assertEquals(3, tracker.mediaTypes.size());
        assertEquals("image1.emf", tracker.filenames.get(0));
        assertNull(tracker.filenames.get(1));
-       assertEquals("My first attachment", tracker.filenames.get(2));
+       assertEquals("Test.docx", tracker.filenames.get(2));
        assertEquals(TYPE_EMF, tracker.mediaTypes.get(0));
        assertEquals(TYPE_PDF, tracker.mediaTypes.get(1));
        assertEquals(TYPE_DOCX, tracker.mediaTypes.get(2));
@@ -964,6 +964,22 @@ public class PDFParserTest extends TikaT
         assertEquals(2, attach);
     }
 
+    @Test
+    public void testEmbeddedFileNameExtraction() throws Exception {
+        InputStream is = PDFParserTest.class.getResourceAsStream(
+                "/test-documents/testPDF_multiFormatEmbFiles.pdf");
+        RecursiveMetadataParser p = new RecursiveMetadataParser(new AutoDetectParser(), false);
+        Metadata m = new Metadata();
+        ParseContext c = new ParseContext();
+        c.set(org.apache.tika.parser.Parser.class, p);
+        ContentHandler h = new BodyContentHandler();
+        p.parse(is, h, m, c);
+        is.close();
+        List<Metadata> metadatas = p.getAllMetadata();
+        assertEquals("metadata size", 2, metadatas.size());
+        Metadata firstAttachment = metadatas.get(0);
+        assertEquals("attachment file name", "Test.txt", firstAttachment.get(Metadata.RESOURCE_NAME_KEY));
+    }
 
     /**
      * 

Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testPDF_multiFormatEmbFiles.pdf
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testPDF_multiFormatEmbFiles.pdf?rev=1613444&view=auto
==============================================================================
Binary file - no diff available.

Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testPDF_multiFormatEmbFiles.pdf
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream