You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ma...@apache.org on 2010/11/12 13:31:11 UTC
svn commit: r1034373 - in /tika/trunk/tika-parsers/src:
main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java
Author: maxcom
Date: Fri Nov 12 12:31:11 2010
New Revision: 1034373
URL: http://svn.apache.org/viewvc?rev=1034373&view=rev
Log:
TIKA-550 - Add stable filenames for extracted embedded files from Office binaries
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java?rev=1034373&r1=1034372&r2=1034373&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java Fri Nov 12 12:31:11 2010
@@ -124,6 +124,7 @@ abstract class AbstractPOIFSExtractor {
}
} else {
metadata.set(Metadata.CONTENT_TYPE, type.getType().toString());
+ metadata.set(Metadata.RESOURCE_NAME_KEY, dir.getName() + '.' + type.getExtension());
embedded = TikaInputStream.get(tmpFile);
}
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java?rev=1034373&r1=1034372&r2=1034373&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java Fri Nov 12 12:31:11 2010
@@ -122,8 +122,8 @@ public class POIContainerExtractionTest
assertEquals(null, handler.filenames.get(0));
assertEquals(null, handler.filenames.get(1));
assertEquals(null, handler.filenames.get(2));
- assertEquals(null, handler.filenames.get(3));
- assertEquals(null, handler.filenames.get(4));
+ assertEquals("MBD0003271D.ppt", handler.filenames.get(3));
+ assertEquals("MBD00032A24.doc", handler.filenames.get(4));
// But we do know their types
assertEquals(TYPE_EMF, handler.mediaTypes.get(0)); // Icon of embedded office doc
assertEquals(TYPE_EMF, handler.mediaTypes.get(1)); // Icon of embedded office doc
@@ -140,7 +140,7 @@ public class POIContainerExtractionTest
assertEquals(null, handler.filenames.get(0));
assertEquals(null, handler.filenames.get(1));
assertEquals(null, handler.filenames.get(2));
- assertEquals(null, handler.filenames.get(3));
+ assertEquals("MBD0003271D.ppt", handler.filenames.get(3));
assertEquals("1", handler.filenames.get(4));
assertEquals(null, handler.filenames.get(5));
assertEquals("2", handler.filenames.get(6));
@@ -176,8 +176,8 @@ public class POIContainerExtractionTest
assertEquals("image2", handler.filenames.get(4));
assertEquals("image3", handler.filenames.get(5));
assertEquals(null, handler.filenames.get(6));
- assertEquals(null, handler.filenames.get(7));
- assertEquals(null, handler.filenames.get(8));
+ assertEquals("_1345471035.ppt", handler.filenames.get(7));
+ assertEquals("_1345470949.xls", handler.filenames.get(8));
// But we do know their types
assertEquals(MediaType.parse("image/unknown"), handler.mediaTypes.get(0)); // Icon of embedded office doc?
@@ -207,7 +207,7 @@ public class POIContainerExtractionTest
assertEquals("image2.png", handler.filenames.get(7));
assertEquals("image3.jpeg", handler.filenames.get(8));
assertEquals("image4.png", handler.filenames.get(9));
- for(int i=10; i<handler.filenames.size(); i++) {
+ for(int i=12; i<handler.filenames.size(); i++) {
assertNull(handler.filenames.get(i));
}
// But we do know their types
@@ -247,7 +247,7 @@ public class POIContainerExtractionTest
assertEquals(2, handler.filenames.size());
assertEquals(2, handler.mediaTypes.size());
- assertEquals(null, handler.filenames.get(0));
+ assertEquals("__substg1.0_3701000D.msg", handler.filenames.get(0));
assertEquals(TYPE_MSG, handler.mediaTypes.get(0));
assertEquals("smbprn.00009008.KdcPjl.pdf", handler.filenames.get(1));