You are viewing a plain text version of this content; the hyperlink to the canonical version was not preserved in this copy.
Posted to commits@tika.apache.org by ma...@apache.org on 2010/11/12 13:31:11 UTC

svn commit: r1034373 - in /tika/trunk/tika-parsers/src: main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java

Author: maxcom
Date: Fri Nov 12 12:31:11 2010
New Revision: 1034373

URL: http://svn.apache.org/viewvc?rev=1034373&view=rev
Log:
TIKA-550 - Add stable filenames for extracted embedded files from Office binaries

Modified:
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java?rev=1034373&r1=1034372&r2=1034373&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java Fri Nov 12 12:31:11 2010
@@ -124,6 +124,7 @@ abstract class AbstractPOIFSExtractor {
                }
            } else {
                metadata.set(Metadata.CONTENT_TYPE, type.getType().toString());
+               metadata.set(Metadata.RESOURCE_NAME_KEY, dir.getName() + '.' + type.getExtension());
 
                embedded = TikaInputStream.get(tmpFile);
            }

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java?rev=1034373&r1=1034372&r2=1034373&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java Fri Nov 12 12:31:11 2010
@@ -122,8 +122,8 @@ public class POIContainerExtractionTest 
        assertEquals(null, handler.filenames.get(0));
        assertEquals(null, handler.filenames.get(1));
        assertEquals(null, handler.filenames.get(2));
-       assertEquals(null, handler.filenames.get(3));
-       assertEquals(null, handler.filenames.get(4));
+       assertEquals("MBD0003271D.ppt", handler.filenames.get(3));
+       assertEquals("MBD00032A24.doc", handler.filenames.get(4));
        // But we do know their types
        assertEquals(TYPE_EMF, handler.mediaTypes.get(0)); // Icon of embedded office doc
        assertEquals(TYPE_EMF, handler.mediaTypes.get(1)); // Icon of embedded office doc
@@ -140,7 +140,7 @@ public class POIContainerExtractionTest 
        assertEquals(null, handler.filenames.get(0));
        assertEquals(null, handler.filenames.get(1));
        assertEquals(null, handler.filenames.get(2));
-       assertEquals(null, handler.filenames.get(3));
+       assertEquals("MBD0003271D.ppt", handler.filenames.get(3));
        assertEquals("1", handler.filenames.get(4));
        assertEquals(null, handler.filenames.get(5));
        assertEquals("2", handler.filenames.get(6));
@@ -176,8 +176,8 @@ public class POIContainerExtractionTest 
        assertEquals("image2", handler.filenames.get(4));
        assertEquals("image3", handler.filenames.get(5));
        assertEquals(null, handler.filenames.get(6));
-       assertEquals(null, handler.filenames.get(7));
-       assertEquals(null, handler.filenames.get(8));
+       assertEquals("_1345471035.ppt", handler.filenames.get(7));
+       assertEquals("_1345470949.xls", handler.filenames.get(8));
        
        // But we do know their types
        assertEquals(MediaType.parse("image/unknown"), handler.mediaTypes.get(0)); // Icon of embedded office doc?
@@ -207,7 +207,7 @@ public class POIContainerExtractionTest 
        assertEquals("image2.png", handler.filenames.get(7));
        assertEquals("image3.jpeg", handler.filenames.get(8));
        assertEquals("image4.png", handler.filenames.get(9));
-       for(int i=10; i<handler.filenames.size(); i++) {
+       for(int i=12; i<handler.filenames.size(); i++) {
           assertNull(handler.filenames.get(i));
        }
        // But we do know their types
@@ -247,7 +247,7 @@ public class POIContainerExtractionTest 
        assertEquals(2, handler.filenames.size());
        assertEquals(2, handler.mediaTypes.size());
        
-       assertEquals(null, handler.filenames.get(0));
+       assertEquals("__substg1.0_3701000D.msg", handler.filenames.get(0));
        assertEquals(TYPE_MSG, handler.mediaTypes.get(0));
        
        assertEquals("smbprn.00009008.KdcPjl.pdf", handler.filenames.get(1));