You are viewing a plain text version of this content. The canonical link was not preserved in this plain text copy.
Posted to commits@tika.apache.org by ju...@apache.org on 2011/09/09 11:00:44 UTC
svn commit: r1167052 - in /tika/trunk/tika-parsers/src/test:
java/org/apache/tika/parser/microsoft/ooxml/OOXMLContainerExtractionTest.java
resources/test-documents/EmbeddedOutlook.docx
resources/test-documents/EmbeddedPDF.docx
Author: jukka
Date: Fri Sep 9 09:00:43 2011
New Revision: 1167052
URL: http://svn.apache.org/viewvc?rev=1167052&view=rev
Log:
TIKA-704: PDF and Outlook docs embedded in MS Word documents not parsed
Add test cases contributed by Jeremy Andersson
Added:
tika/trunk/tika-parsers/src/test/resources/test-documents/EmbeddedOutlook.docx
tika/trunk/tika-parsers/src/test/resources/test-documents/EmbeddedPDF.docx
Modified:
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLContainerExtractionTest.java
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLContainerExtractionTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLContainerExtractionTest.java?rev=1167052&r1=1167051&r2=1167052&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLContainerExtractionTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLContainerExtractionTest.java Fri Sep 9 09:00:43 2011
@@ -16,11 +16,9 @@
*/
package org.apache.tika.parser.microsoft.ooxml;
-import org.apache.tika.detect.ContainerAwareDetector;
+import org.apache.tika.Tika;
import org.apache.tika.extractor.ContainerExtractor;
import org.apache.tika.extractor.ParserContainerExtractor;
-import org.apache.tika.mime.MimeTypes;
-import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.microsoft.AbstractPOIContainerExtractionTest;
/**
@@ -32,11 +30,9 @@ public class OOXMLContainerExtractionTes
@Override
protected void setUp() throws Exception {
- ContainerAwareDetector detector = new ContainerAwareDetector(
- MimeTypes.getDefaultMimeTypes());
- extractor = new ParserContainerExtractor(
- new AutoDetectParser(detector), detector
- );
+ Tika tika = new Tika();
+ extractor = new ParserContainerExtractor(
+ tika.getParser(), tika.getDetector());
}
/**
@@ -259,4 +255,39 @@ public class OOXMLContainerExtractionTes
assertEquals(TYPE_EMF, handler.mediaTypes.get(7)); // Icon of embedded office doc
assertEquals(TYPE_EMF, handler.mediaTypes.get(8)); // Icon of embedded office doc
}
+
+ public void testEmbeddedOutlook() throws Exception {
+ TrackingHandler handler =
+ process("EmbeddedOutlook.docx", extractor, false);
+
+ assertEquals(2, handler.filenames.size());
+ assertEquals(2, handler.mediaTypes.size());
+
+ assertEquals("image1.emf", handler.filenames.get(0));
+ assertEquals(TYPE_EMF, handler.mediaTypes.get(0));
+
+ assertEquals("licensedTestMsgwAtt.msg", handler.filenames.get(1));
+ assertEquals(TYPE_MSG, handler.mediaTypes.get(1));
+ }
+
+ public void testEmbeddedPDF() throws Exception {
+ TrackingHandler handler =
+ process("EmbeddedPDF.docx", extractor, false);
+
+ assertEquals(4, handler.filenames.size());
+ assertEquals(4, handler.mediaTypes.size());
+
+ assertEquals("image1.emf", handler.filenames.get(0));
+ assertEquals(TYPE_EMF, handler.mediaTypes.get(0));
+
+ assertNull(handler.filenames.get(1));
+ assertEquals(TYPE_PDF, handler.mediaTypes.get(1));
+
+ assertEquals("image2.emf", handler.filenames.get(2));
+ assertEquals(TYPE_EMF, handler.mediaTypes.get(2));
+
+ assertNull(handler.filenames.get(3));
+ assertEquals(TYPE_PDF, handler.mediaTypes.get(3));
+ }
+
}
Added: tika/trunk/tika-parsers/src/test/resources/test-documents/EmbeddedOutlook.docx
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/EmbeddedOutlook.docx?rev=1167052&view=auto
==============================================================================
Files tika/trunk/tika-parsers/src/test/resources/test-documents/EmbeddedOutlook.docx (added) and tika/trunk/tika-parsers/src/test/resources/test-documents/EmbeddedOutlook.docx Fri Sep 9 09:00:43 2011 differ
Added: tika/trunk/tika-parsers/src/test/resources/test-documents/EmbeddedPDF.docx
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/EmbeddedPDF.docx?rev=1167052&view=auto
==============================================================================
Files tika/trunk/tika-parsers/src/test/resources/test-documents/EmbeddedPDF.docx (added) and tika/trunk/tika-parsers/src/test/resources/test-documents/EmbeddedPDF.docx Fri Sep 9 09:00:43 2011 differ