You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2011/09/09 11:00:44 UTC

svn commit: r1167052 - in /tika/trunk/tika-parsers/src/test: java/org/apache/tika/parser/microsoft/ooxml/OOXMLContainerExtractionTest.java resources/test-documents/EmbeddedOutlook.docx resources/test-documents/EmbeddedPDF.docx

Author: jukka
Date: Fri Sep  9 09:00:43 2011
New Revision: 1167052

URL: http://svn.apache.org/viewvc?rev=1167052&view=rev
Log:
TIKA-704: PDF and Outlook docs embedded in MS Word documents not parsed

Add test cases contributed by Jeremy Andersson

Added:
    tika/trunk/tika-parsers/src/test/resources/test-documents/EmbeddedOutlook.docx
    tika/trunk/tika-parsers/src/test/resources/test-documents/EmbeddedPDF.docx
Modified:
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLContainerExtractionTest.java

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLContainerExtractionTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLContainerExtractionTest.java?rev=1167052&r1=1167051&r2=1167052&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLContainerExtractionTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLContainerExtractionTest.java Fri Sep  9 09:00:43 2011
@@ -16,11 +16,9 @@
  */
 package org.apache.tika.parser.microsoft.ooxml;
 
-import org.apache.tika.detect.ContainerAwareDetector;
+import org.apache.tika.Tika;
 import org.apache.tika.extractor.ContainerExtractor;
 import org.apache.tika.extractor.ParserContainerExtractor;
-import org.apache.tika.mime.MimeTypes;
-import org.apache.tika.parser.AutoDetectParser;
 import org.apache.tika.parser.microsoft.AbstractPOIContainerExtractionTest;
 
 /**
@@ -32,11 +30,9 @@ public class OOXMLContainerExtractionTes
     
     @Override
     protected void setUp() throws Exception {
-       ContainerAwareDetector detector = new ContainerAwareDetector(
-               MimeTypes.getDefaultMimeTypes());
-       extractor = new ParserContainerExtractor(
-             new AutoDetectParser(detector), detector
-       );
+        Tika tika = new Tika();
+        extractor = new ParserContainerExtractor(
+                tika.getParser(), tika.getDetector());
     }
 
    /**
@@ -259,4 +255,39 @@ public class OOXMLContainerExtractionTes
        assertEquals(TYPE_EMF, handler.mediaTypes.get(7));  // Icon of embedded office doc
        assertEquals(TYPE_EMF, handler.mediaTypes.get(8));  // Icon of embedded office doc
     }
+
+    public void testEmbeddedOutlook() throws Exception {
+        TrackingHandler handler =
+                process("EmbeddedOutlook.docx", extractor, false);
+
+        assertEquals(2, handler.filenames.size());
+        assertEquals(2, handler.mediaTypes.size());
+
+        assertEquals("image1.emf", handler.filenames.get(0));
+        assertEquals(TYPE_EMF, handler.mediaTypes.get(0));
+
+        assertEquals("licensedTestMsgwAtt.msg", handler.filenames.get(1));
+        assertEquals(TYPE_MSG, handler.mediaTypes.get(1));
+    }
+
+    public void testEmbeddedPDF() throws Exception {
+        TrackingHandler handler =
+                process("EmbeddedPDF.docx", extractor, false);
+
+        assertEquals(4, handler.filenames.size());
+        assertEquals(4, handler.mediaTypes.size());
+
+        assertEquals("image1.emf", handler.filenames.get(0));
+        assertEquals(TYPE_EMF, handler.mediaTypes.get(0));
+
+        assertNull(handler.filenames.get(1));
+        assertEquals(TYPE_PDF, handler.mediaTypes.get(1));
+
+        assertEquals("image2.emf", handler.filenames.get(2));
+        assertEquals(TYPE_EMF, handler.mediaTypes.get(2));
+
+        assertNull(handler.filenames.get(3));
+        assertEquals(TYPE_PDF, handler.mediaTypes.get(3));
+    }
+
 }

Added: tika/trunk/tika-parsers/src/test/resources/test-documents/EmbeddedOutlook.docx
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/EmbeddedOutlook.docx?rev=1167052&view=auto
==============================================================================
Files tika/trunk/tika-parsers/src/test/resources/test-documents/EmbeddedOutlook.docx (added) and tika/trunk/tika-parsers/src/test/resources/test-documents/EmbeddedOutlook.docx Fri Sep  9 09:00:43 2011 differ

Added: tika/trunk/tika-parsers/src/test/resources/test-documents/EmbeddedPDF.docx
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/EmbeddedPDF.docx?rev=1167052&view=auto
==============================================================================
Files tika/trunk/tika-parsers/src/test/resources/test-documents/EmbeddedPDF.docx (added) and tika/trunk/tika-parsers/src/test/resources/test-documents/EmbeddedPDF.docx Fri Sep  9 09:00:43 2011 differ