You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/06/29 11:11:22 UTC

[16/39] tika git commit: Convert new lines from windows to unix

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java
index 3d28b35..25e567f 100644
--- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java
+++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java
@@ -1,382 +1,382 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.microsoft;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertNull;
-import static org.junit.Assert.assertTrue;
-
-import java.util.List;
-
-import org.apache.tika.extractor.ContainerExtractor;
-import org.apache.tika.extractor.ParserContainerExtractor;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaMetadataKeys;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.RecursiveParserWrapper;
-import org.junit.Test;
-
-/**
- * Tests that the various POI powered parsers are
- * able to extract their embedded contents.
- */
-public class POIContainerExtractionTest extends AbstractPOIContainerExtractionTest {
-
-    /**
-     * For office files which don't have anything embedded in them
-     */
-    @Test
-    public void testWithoutEmbedded() throws Exception {
-        ContainerExtractor extractor = new ParserContainerExtractor();
-
-        String[] files = new String[]{
-                "testEXCEL.xls", "testWORD.doc", "testPPT.ppt",
-                "testVISIO.vsd", "test-outlook.msg"
-        };
-        for (String file : files) {
-            // Process it without recursing
-            TrackingHandler handler = process(file, extractor, false);
-
-            // Won't have fired
-            assertEquals(0, handler.filenames.size());
-            assertEquals(0, handler.mediaTypes.size());
-
-            // Ditto with recursing
-            handler = process(file, extractor, true);
-            assertEquals(0, handler.filenames.size());
-            assertEquals(0, handler.mediaTypes.size());
-        }
-    }
-
-    /**
-     * Office files with embedded images, but no other
-     * office files in them
-     */
-    @Test
-    public void testEmbeddedImages() throws Exception {
-        ContainerExtractor extractor = new ParserContainerExtractor();
-        TrackingHandler handler;
-
-        // Excel with 1 image
-        handler = process("testEXCEL_1img.xls", extractor, false);
-        assertEquals(1, handler.filenames.size());
-        assertEquals(1, handler.mediaTypes.size());
-
-        assertEquals(null, handler.filenames.get(0));
-        assertEquals(TYPE_PNG, handler.mediaTypes.get(0));
-
-
-        // PowerPoint with 2 images + sound
-        // TODO
-
-
-        // Word with 1 image
-        handler = process("testWORD_1img.doc", extractor, false);
-        assertEquals(1, handler.filenames.size());
-        assertEquals(1, handler.mediaTypes.size());
-
-        assertEquals("image1.png", handler.filenames.get(0));
-        assertEquals(TYPE_PNG, handler.mediaTypes.get(0));
-
-
-        // Word with 3 images
-        handler = process("testWORD_3imgs.doc", extractor, false);
-        assertEquals(3, handler.filenames.size());
-        assertEquals(3, handler.mediaTypes.size());
-
-        assertEquals("image1.png", handler.filenames.get(0));
-        assertEquals("image2.jpg", handler.filenames.get(1));
-        assertEquals("image3.png", handler.filenames.get(2));
-        assertEquals(TYPE_PNG, handler.mediaTypes.get(0));
-        assertEquals(TYPE_JPG, handler.mediaTypes.get(1));
-        assertEquals(TYPE_PNG, handler.mediaTypes.get(2));
-    }
-
-    /**
-     * Office files which have other office files
-     * embedded into them. The embedded office files
-     * will sometimes have images in them.
-     * <p/>
-     * eg xls
-     * -> word
-     * -> image
-     * -> image
-     * -> powerpoint
-     * -> excel
-     * -> image
-     */
-    @Test
-    public void testEmbeddedOfficeFiles() throws Exception {
-        ContainerExtractor extractor = new ParserContainerExtractor();
-        TrackingHandler handler;
-
-
-        // Excel with a word doc and a powerpoint doc, both of which have images in them
-        // Without recursion, should see both documents + the images
-        handler = process("testEXCEL_embeded.xls", extractor, false);
-        assertEquals(5, handler.filenames.size());
-        assertEquals(5, handler.mediaTypes.size());
-
-        // We don't know their filenames
-        assertEquals(null, handler.filenames.get(0));
-        assertEquals(null, handler.filenames.get(1));
-        assertEquals(null, handler.filenames.get(2));
-        assertEquals("MBD0003271D.ppt", handler.filenames.get(3));
-        assertEquals("MBD00032A24.doc", handler.filenames.get(4));
-        // But we do know their types
-        assertEquals(TYPE_EMF, handler.mediaTypes.get(0)); // Icon of embedded office doc
-        assertEquals(TYPE_EMF, handler.mediaTypes.get(1)); // Icon of embedded office doc
-        assertEquals(TYPE_PNG, handler.mediaTypes.get(2)); // Embedded image
-        assertEquals(TYPE_PPT, handler.mediaTypes.get(3)); // Embedded office doc
-        assertEquals(TYPE_DOC, handler.mediaTypes.get(4)); // Embedded office doc
-
-
-        // With recursion, should get the images embedded in the office files too
-        handler = process("testEXCEL_embeded.xls", extractor, true);
-        assertEquals(17, handler.filenames.size());
-        assertEquals(17, handler.mediaTypes.size());
-
-        assertEquals(null, handler.filenames.get(0));
-        assertEquals(null, handler.filenames.get(1));
-        assertEquals(null, handler.filenames.get(2));
-        assertEquals("MBD0003271D.ppt", handler.filenames.get(3));
-        assertEquals("1", handler.filenames.get(4));
-        assertEquals(null, handler.filenames.get(5));
-        assertEquals("2", handler.filenames.get(6));
-        assertEquals("image1.png", handler.filenames.get(7));
-        assertEquals("image2.jpg", handler.filenames.get(8));
-        assertEquals("image3.png", handler.filenames.get(9));
-        assertEquals("image1.png", handler.filenames.get(16));
-
-        assertEquals(TYPE_EMF, handler.mediaTypes.get(0)); // Icon of embedded office doc
-        assertEquals(TYPE_EMF, handler.mediaTypes.get(1)); // Icon of embedded office doc
-        assertEquals(TYPE_PNG, handler.mediaTypes.get(2)); // Embedded image
-        assertEquals(TYPE_PPT, handler.mediaTypes.get(3)); // Embedded presentation
-        assertEquals(TYPE_XLS, handler.mediaTypes.get(4)); // Embedded XLS
-        assertEquals(TYPE_PNG, handler.mediaTypes.get(5)); // Embedded image
-        assertEquals(TYPE_DOC, handler.mediaTypes.get(6)); // Embedded office doc
-        assertEquals(TYPE_PNG, handler.mediaTypes.get(7)); // Embedded image
-        assertEquals(TYPE_JPG, handler.mediaTypes.get(8)); // Embedded image
-        assertEquals(TYPE_PNG, handler.mediaTypes.get(9)); // Embedded image
-        assertEquals(TYPE_DOC, handler.mediaTypes.get(15)); // Embedded office doc
-        assertEquals(TYPE_PNG, handler.mediaTypes.get(16)); // Embedded image
-
-        // Word with .docx, powerpoint and excel
-        handler = process("testWORD_embeded.doc", extractor, false);
-        assertEquals(9, handler.filenames.size());
-        assertEquals(9, handler.mediaTypes.size());
-
-        // Filenames are a bit iffy...
-        // Should really be 3*embedded pictures then 3*icons then embedded docs
-        assertEquals("image1.emf", handler.filenames.get(0));
-        assertEquals("image4.png", handler.filenames.get(1));
-        assertEquals("image5.jpg", handler.filenames.get(2));
-        assertEquals("image6.png", handler.filenames.get(3));
-        assertEquals("image2.emf", handler.filenames.get(4));
-        assertEquals("image3.emf", handler.filenames.get(5));
-        assertEquals(null, handler.filenames.get(6));
-        assertEquals("_1345471035.ppt", handler.filenames.get(7));
-        assertEquals("_1345470949.xls", handler.filenames.get(8));
-
-        // But we do know their types
-        assertEquals(TYPE_EMF, handler.mediaTypes.get(0)); // Icon of embedded office doc?
-        assertEquals(TYPE_PNG, handler.mediaTypes.get(1)); // Embedded image - logo
-        assertEquals(TYPE_JPG, handler.mediaTypes.get(2)); // Embedded image - safe
-        assertEquals(TYPE_PNG, handler.mediaTypes.get(3)); // Embedded image - try
-        assertEquals(TYPE_EMF, handler.mediaTypes.get(4)); // Icon of embedded office doc?
-        assertEquals(TYPE_EMF, handler.mediaTypes.get(5)); // Icon of embedded office doc?
-        assertEquals(TYPE_DOCX, handler.mediaTypes.get(6)); // Embedded office doc
-        assertEquals(TYPE_PPT, handler.mediaTypes.get(7)); // Embedded office doc
-        assertEquals(TYPE_XLS, handler.mediaTypes.get(8)); // Embedded office doc
-
-
-        // With recursion, should get their images too
-        handler = process("testWORD_embeded.doc", extractor, true);
-        assertEquals(16, handler.filenames.size());
-        assertEquals(16, handler.mediaTypes.size());
-
-        // We don't know their filenames, except for doc images + docx
-        assertEquals("image1.emf", handler.filenames.get(0));
-        assertEquals("image4.png", handler.filenames.get(1));
-        assertEquals("image5.jpg", handler.filenames.get(2));
-        assertEquals("image6.png", handler.filenames.get(3));
-        assertEquals("image2.emf", handler.filenames.get(4));
-        assertEquals("image3.emf", handler.filenames.get(5));
-        assertEquals(null, handler.filenames.get(6));
-        assertEquals("image2.png", handler.filenames.get(7));
-        assertEquals("image3.jpeg", handler.filenames.get(8));
-        assertEquals("image4.png", handler.filenames.get(9));
-        for (int i = 11; i < 14; i++) {
-            assertNull(handler.filenames.get(i));
-        }
-        // But we do know their types
-        assertEquals(TYPE_EMF, handler.mediaTypes.get(0)); // Icon of embedded office doc
-        assertEquals(TYPE_PNG, handler.mediaTypes.get(1)); // Embedded image - logo
-        assertEquals(TYPE_JPG, handler.mediaTypes.get(2)); // Embedded image - safe
-        assertEquals(TYPE_PNG, handler.mediaTypes.get(3)); // Embedded image - try
-        assertEquals(TYPE_EMF, handler.mediaTypes.get(4)); // Icon of embedded office doc
-        assertEquals(TYPE_EMF, handler.mediaTypes.get(5)); // Icon of embedded office doc
-        assertEquals(TYPE_DOCX, handler.mediaTypes.get(6)); // Embedded office doc
-        assertEquals(TYPE_PNG, handler.mediaTypes.get(7));  //    PNG inside .docx
-        assertEquals(TYPE_JPG, handler.mediaTypes.get(8));  //    JPG inside .docx
-        assertEquals(TYPE_PNG, handler.mediaTypes.get(9));  //    PNG inside .docx
-        assertEquals(TYPE_PPT, handler.mediaTypes.get(10)); // Embedded office doc
-        assertEquals(TYPE_XLS, handler.mediaTypes.get(14)); // Embedded office doc
-        assertEquals(TYPE_PNG, handler.mediaTypes.get(15)); //    PNG inside .xls
-
-
-        // PowerPoint with excel and word
-        handler = process("testPPT_embeded.ppt", extractor, false);
-        assertEquals(7, handler.filenames.size());
-        assertEquals(7, handler.mediaTypes.size());
-
-        // We don't get all that helpful filenames
-        assertEquals("1", handler.filenames.get(0));
-        assertEquals("2", handler.filenames.get(1));
-        assertEquals(null, handler.filenames.get(2));
-        assertEquals(null, handler.filenames.get(3));
-        assertEquals(null, handler.filenames.get(4));
-        assertEquals(null, handler.filenames.get(5));
-        assertEquals(null, handler.filenames.get(6));
-        // But we do know their types
-        assertEquals(TYPE_XLS, handler.mediaTypes.get(0)); // Embedded office doc
-        assertEquals(TYPE_DOC, handler.mediaTypes.get(1)); // Embedded office doc
-        assertEquals(TYPE_EMF, handler.mediaTypes.get(2)); // Icon of embedded office doc
-        assertEquals(TYPE_EMF, handler.mediaTypes.get(3)); // Icon of embedded office doc
-        assertEquals(TYPE_PNG, handler.mediaTypes.get(4)); // Embedded image
-        assertEquals(TYPE_PNG, handler.mediaTypes.get(5)); // Embedded image
-        assertEquals(TYPE_PNG, handler.mediaTypes.get(6)); // Embedded image
-
-        // Run again on PowerPoint but with recursion
-        handler = process("testPPT_embeded.ppt", extractor, true);
-        assertEquals(11, handler.filenames.size());
-        assertEquals(11, handler.mediaTypes.size());
-
-        assertEquals("1", handler.filenames.get(0));
-        assertEquals(null, handler.filenames.get(1));
-        assertEquals("2", handler.filenames.get(2));
-        assertEquals("image1.png", handler.filenames.get(3));
-        assertEquals("image2.jpg", handler.filenames.get(4));
-        assertEquals("image3.png", handler.filenames.get(5));
-        assertEquals(null, handler.filenames.get(6));
-        assertEquals(null, handler.filenames.get(7));
-        assertEquals(null, handler.filenames.get(8));
-        assertEquals(null, handler.filenames.get(9));
-        assertEquals(null, handler.filenames.get(10));
-
-        assertEquals(TYPE_XLS, handler.mediaTypes.get(0)); // Embedded office doc
-        assertEquals(TYPE_PNG, handler.mediaTypes.get(1)); //    PNG inside .xls
-        assertEquals(TYPE_DOC, handler.mediaTypes.get(2)); // Embedded office doc
-        assertEquals(TYPE_PNG, handler.mediaTypes.get(3));  //    PNG inside .docx
-        assertEquals(TYPE_JPG, handler.mediaTypes.get(4));  //    JPG inside .docx
-        assertEquals(TYPE_PNG, handler.mediaTypes.get(5));  //    PNG inside .docx
-        assertEquals(TYPE_EMF, handler.mediaTypes.get(6)); // Icon of embedded office doc
-        assertEquals(TYPE_EMF, handler.mediaTypes.get(7)); // Icon of embedded office doc
-        assertEquals(TYPE_PNG, handler.mediaTypes.get(8)); // Embedded image
-        assertEquals(TYPE_PNG, handler.mediaTypes.get(9)); // Embedded image
-        assertEquals(TYPE_PNG, handler.mediaTypes.get(10)); // Embedded image
-
-
-        // Word, with a non-office file (PDF)
-        handler = process("testWORD_embedded_pdf.doc", extractor, true);
-        assertEquals(2, handler.filenames.size());
-        assertEquals(2, handler.mediaTypes.size());
-
-        assertEquals("image1.emf", handler.filenames.get(0));
-        assertEquals("_1402837031.pdf", handler.filenames.get(1));
-
-        assertEquals(TYPE_EMF, handler.mediaTypes.get(0)); // Icon of embedded pdf
-        assertEquals(TYPE_PDF, handler.mediaTypes.get(1)); // The embedded PDF itself
-
-
-        // Outlook with a text file and a word document
-        handler = process("testMSG_att_doc.msg", extractor, true);
-        assertEquals(2, handler.filenames.size());
-        assertEquals(2, handler.mediaTypes.size());
-
-        assertEquals("test-unicode.doc", handler.filenames.get(0));
-        assertEquals(TYPE_DOC, handler.mediaTypes.get(0));
-
-        assertEquals("pj1.txt", handler.filenames.get(1));
-        assertEquals(TYPE_TXT, handler.mediaTypes.get(1));
-
-
-        // Outlook with a pdf and another outlook message
-        handler = process("testMSG_att_msg.msg", extractor, true);
-        assertEquals(2, handler.filenames.size());
-        assertEquals(2, handler.mediaTypes.size());
-
-        assertEquals("__substg1.0_3701000D.msg", handler.filenames.get(0));
-        assertEquals(TYPE_MSG, handler.mediaTypes.get(0));
-
-        assertEquals("smbprn.00009008.KdcPjl.pdf", handler.filenames.get(1));
-        assertEquals(TYPE_PDF, handler.mediaTypes.get(1));
-    }
-
-    @Test
-    public void testEmbeddedOfficeFilesXML() throws Exception {
-        ContainerExtractor extractor = new ParserContainerExtractor();
-        TrackingHandler handler;
-
-        handler = process("EmbeddedDocument.docx", extractor, false);
-        assertTrue(handler.filenames.contains("Microsoft_Office_Excel_97-2003_Worksheet1.bin"));
-        assertEquals(2, handler.filenames.size());
-    }
-
-    @Test
-    public void testPowerpointImages() throws Exception {
-        ContainerExtractor extractor = new ParserContainerExtractor();
-        TrackingHandler handler;
-
-        handler = process("pictures.ppt", extractor, false);
-        assertTrue(handler.mediaTypes.contains(new MediaType("image", "jpeg")));
-        assertTrue(handler.mediaTypes.contains(new MediaType("image", "png")));
-    }
-
-    @Test
-    public void testEmbeddedStorageId() throws Exception {
-
-        List<Metadata> list = getRecursiveJson("testWORD_embeded.doc");
-        //.docx
-        assertEquals("{F4754C9B-64F5-4B40-8AF4-679732AC0607}",
-                list.get(10).get(TikaMetadataKeys.EMBEDDED_STORAGE_CLASS_ID));
-        //_1345471035.ppt
-        assertEquals("{64818D10-4F9B-11CF-86EA-00AA00B929E8}",
-                list.get(14).get(TikaMetadataKeys.EMBEDDED_STORAGE_CLASS_ID));
-        //_1345470949.xls
-        assertEquals("{00020820-0000-0000-C000-000000000046}",
-                list.get(16).get(TikaMetadataKeys.EMBEDDED_STORAGE_CLASS_ID));
-
-    }
-
-    @Test
-    public void testEmbeddedGraphChart() throws Exception {
-        //doc converts a chart to a actual xls file
-        //so we only need to look in ppt and xls
-        for (String suffix : new String[]{"ppt", "xls"}) {
-            List<Metadata> list = getRecursiveJson("testMSChart-govdocs-428996."+suffix);
-            boolean found = false;
-            for (Metadata m : list) {
-                if (m.get(Metadata.CONTENT_TYPE).equals(POIFSContainerDetector.MS_GRAPH_CHART.toString())) {
-                    found = true;
-                }
-                assertNull(m.get(RecursiveParserWrapper.EMBEDDED_EXCEPTION));
-            }
-            assertTrue("didn't find chart in "+suffix, found);
-        }
-    }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNull;
+import static org.junit.Assert.assertTrue;
+
+import java.util.List;
+
+import org.apache.tika.extractor.ContainerExtractor;
+import org.apache.tika.extractor.ParserContainerExtractor;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaMetadataKeys;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.RecursiveParserWrapper;
+import org.junit.Test;
+
+/**
+ * Tests that the various POI powered parsers are
+ * able to extract their embedded contents.
+ */
+public class POIContainerExtractionTest extends AbstractPOIContainerExtractionTest {
+
+    /**
+     * For office files which don't have anything embedded in them
+     */
+    @Test
+    public void testWithoutEmbedded() throws Exception {
+        ContainerExtractor extractor = new ParserContainerExtractor();
+
+        String[] files = new String[]{
+                "testEXCEL.xls", "testWORD.doc", "testPPT.ppt",
+                "testVISIO.vsd", "test-outlook.msg"
+        };
+        for (String file : files) {
+            // Process it without recursing
+            TrackingHandler handler = process(file, extractor, false);
+
+            // Won't have fired
+            assertEquals(0, handler.filenames.size());
+            assertEquals(0, handler.mediaTypes.size());
+
+            // Ditto with recursing
+            handler = process(file, extractor, true);
+            assertEquals(0, handler.filenames.size());
+            assertEquals(0, handler.mediaTypes.size());
+        }
+    }
+
+    /**
+     * Office files with embedded images, but no other
+     * office files in them
+     */
+    @Test
+    public void testEmbeddedImages() throws Exception {
+        ContainerExtractor extractor = new ParserContainerExtractor();
+        TrackingHandler handler;
+
+        // Excel with 1 image
+        handler = process("testEXCEL_1img.xls", extractor, false);
+        assertEquals(1, handler.filenames.size());
+        assertEquals(1, handler.mediaTypes.size());
+
+        assertEquals(null, handler.filenames.get(0));
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(0));
+
+
+        // PowerPoint with 2 images + sound
+        // TODO
+
+
+        // Word with 1 image
+        handler = process("testWORD_1img.doc", extractor, false);
+        assertEquals(1, handler.filenames.size());
+        assertEquals(1, handler.mediaTypes.size());
+
+        assertEquals("image1.png", handler.filenames.get(0));
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(0));
+
+
+        // Word with 3 images
+        handler = process("testWORD_3imgs.doc", extractor, false);
+        assertEquals(3, handler.filenames.size());
+        assertEquals(3, handler.mediaTypes.size());
+
+        assertEquals("image1.png", handler.filenames.get(0));
+        assertEquals("image2.jpg", handler.filenames.get(1));
+        assertEquals("image3.png", handler.filenames.get(2));
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(0));
+        assertEquals(TYPE_JPG, handler.mediaTypes.get(1));
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(2));
+    }
+
+    /**
+     * Office files which have other office files
+     * embedded into them. The embedded office files
+     * will sometimes have images in them.
+     * <p/>
+     * eg xls
+     * -> word
+     * -> image
+     * -> image
+     * -> powerpoint
+     * -> excel
+     * -> image
+     */
+    @Test
+    public void testEmbeddedOfficeFiles() throws Exception {
+        ContainerExtractor extractor = new ParserContainerExtractor();
+        TrackingHandler handler;
+
+
+        // Excel with a word doc and a powerpoint doc, both of which have images in them
+        // Without recursion, should see both documents + the images
+        handler = process("testEXCEL_embeded.xls", extractor, false);
+        assertEquals(5, handler.filenames.size());
+        assertEquals(5, handler.mediaTypes.size());
+
+        // We don't know their filenames
+        assertEquals(null, handler.filenames.get(0));
+        assertEquals(null, handler.filenames.get(1));
+        assertEquals(null, handler.filenames.get(2));
+        assertEquals("MBD0003271D.ppt", handler.filenames.get(3));
+        assertEquals("MBD00032A24.doc", handler.filenames.get(4));
+        // But we do know their types
+        assertEquals(TYPE_EMF, handler.mediaTypes.get(0)); // Icon of embedded office doc
+        assertEquals(TYPE_EMF, handler.mediaTypes.get(1)); // Icon of embedded office doc
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(2)); // Embedded image
+        assertEquals(TYPE_PPT, handler.mediaTypes.get(3)); // Embedded office doc
+        assertEquals(TYPE_DOC, handler.mediaTypes.get(4)); // Embedded office doc
+
+
+        // With recursion, should get the images embedded in the office files too
+        handler = process("testEXCEL_embeded.xls", extractor, true);
+        assertEquals(17, handler.filenames.size());
+        assertEquals(17, handler.mediaTypes.size());
+
+        assertEquals(null, handler.filenames.get(0));
+        assertEquals(null, handler.filenames.get(1));
+        assertEquals(null, handler.filenames.get(2));
+        assertEquals("MBD0003271D.ppt", handler.filenames.get(3));
+        assertEquals("1", handler.filenames.get(4));
+        assertEquals(null, handler.filenames.get(5));
+        assertEquals("2", handler.filenames.get(6));
+        assertEquals("image1.png", handler.filenames.get(7));
+        assertEquals("image2.jpg", handler.filenames.get(8));
+        assertEquals("image3.png", handler.filenames.get(9));
+        assertEquals("image1.png", handler.filenames.get(16));
+
+        assertEquals(TYPE_EMF, handler.mediaTypes.get(0)); // Icon of embedded office doc
+        assertEquals(TYPE_EMF, handler.mediaTypes.get(1)); // Icon of embedded office doc
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(2)); // Embedded image
+        assertEquals(TYPE_PPT, handler.mediaTypes.get(3)); // Embedded presentation
+        assertEquals(TYPE_XLS, handler.mediaTypes.get(4)); // Embedded XLS
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(5)); // Embedded image
+        assertEquals(TYPE_DOC, handler.mediaTypes.get(6)); // Embedded office doc
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(7)); // Embedded image
+        assertEquals(TYPE_JPG, handler.mediaTypes.get(8)); // Embedded image
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(9)); // Embedded image
+        assertEquals(TYPE_DOC, handler.mediaTypes.get(15)); // Embedded office doc
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(16)); // Embedded image
+
+        // Word with .docx, powerpoint and excel
+        handler = process("testWORD_embeded.doc", extractor, false);
+        assertEquals(9, handler.filenames.size());
+        assertEquals(9, handler.mediaTypes.size());
+
+        // Filenames are a bit iffy...
+        // Should really be 3*embedded pictures then 3*icons then embedded docs
+        assertEquals("image1.emf", handler.filenames.get(0));
+        assertEquals("image4.png", handler.filenames.get(1));
+        assertEquals("image5.jpg", handler.filenames.get(2));
+        assertEquals("image6.png", handler.filenames.get(3));
+        assertEquals("image2.emf", handler.filenames.get(4));
+        assertEquals("image3.emf", handler.filenames.get(5));
+        assertEquals(null, handler.filenames.get(6));
+        assertEquals("_1345471035.ppt", handler.filenames.get(7));
+        assertEquals("_1345470949.xls", handler.filenames.get(8));
+
+        // But we do know their types
+        assertEquals(TYPE_EMF, handler.mediaTypes.get(0)); // Icon of embedded office doc?
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(1)); // Embedded image - logo
+        assertEquals(TYPE_JPG, handler.mediaTypes.get(2)); // Embedded image - safe
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(3)); // Embedded image - try
+        assertEquals(TYPE_EMF, handler.mediaTypes.get(4)); // Icon of embedded office doc?
+        assertEquals(TYPE_EMF, handler.mediaTypes.get(5)); // Icon of embedded office doc?
+        assertEquals(TYPE_DOCX, handler.mediaTypes.get(6)); // Embedded office doc
+        assertEquals(TYPE_PPT, handler.mediaTypes.get(7)); // Embedded office doc
+        assertEquals(TYPE_XLS, handler.mediaTypes.get(8)); // Embedded office doc
+
+
+        // With recursion, should get their images too
+        handler = process("testWORD_embeded.doc", extractor, true);
+        assertEquals(16, handler.filenames.size());
+        assertEquals(16, handler.mediaTypes.size());
+
+        // We don't know their filenames, except for doc images + docx
+        assertEquals("image1.emf", handler.filenames.get(0));
+        assertEquals("image4.png", handler.filenames.get(1));
+        assertEquals("image5.jpg", handler.filenames.get(2));
+        assertEquals("image6.png", handler.filenames.get(3));
+        assertEquals("image2.emf", handler.filenames.get(4));
+        assertEquals("image3.emf", handler.filenames.get(5));
+        assertEquals(null, handler.filenames.get(6));
+        assertEquals("image2.png", handler.filenames.get(7));
+        assertEquals("image3.jpeg", handler.filenames.get(8));
+        assertEquals("image4.png", handler.filenames.get(9));
+        for (int i = 11; i < 14; i++) {
+            assertNull(handler.filenames.get(i));
+        }
+        // But we do know their types
+        assertEquals(TYPE_EMF, handler.mediaTypes.get(0)); // Icon of embedded office doc
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(1)); // Embedded image - logo
+        assertEquals(TYPE_JPG, handler.mediaTypes.get(2)); // Embedded image - safe
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(3)); // Embedded image - try
+        assertEquals(TYPE_EMF, handler.mediaTypes.get(4)); // Icon of embedded office doc
+        assertEquals(TYPE_EMF, handler.mediaTypes.get(5)); // Icon of embedded office doc
+        assertEquals(TYPE_DOCX, handler.mediaTypes.get(6)); // Embedded office doc
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(7));  //    PNG inside .docx
+        assertEquals(TYPE_JPG, handler.mediaTypes.get(8));  //    JPG inside .docx
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(9));  //    PNG inside .docx
+        assertEquals(TYPE_PPT, handler.mediaTypes.get(10)); // Embedded office doc
+        assertEquals(TYPE_XLS, handler.mediaTypes.get(14)); // Embedded office doc
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(15)); //    PNG inside .xls
+
+
+        // PowerPoint with excel and word
+        handler = process("testPPT_embeded.ppt", extractor, false);
+        assertEquals(7, handler.filenames.size());
+        assertEquals(7, handler.mediaTypes.size());
+
+        // We don't get all that helpful filenames
+        assertEquals("1", handler.filenames.get(0));
+        assertEquals("2", handler.filenames.get(1));
+        assertEquals(null, handler.filenames.get(2));
+        assertEquals(null, handler.filenames.get(3));
+        assertEquals(null, handler.filenames.get(4));
+        assertEquals(null, handler.filenames.get(5));
+        assertEquals(null, handler.filenames.get(6));
+        // But we do know their types
+        assertEquals(TYPE_XLS, handler.mediaTypes.get(0)); // Embedded office doc
+        assertEquals(TYPE_DOC, handler.mediaTypes.get(1)); // Embedded office doc
+        assertEquals(TYPE_EMF, handler.mediaTypes.get(2)); // Icon of embedded office doc
+        assertEquals(TYPE_EMF, handler.mediaTypes.get(3)); // Icon of embedded office doc
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(4)); // Embedded image
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(5)); // Embedded image
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(6)); // Embedded image
+
+        // Run again on PowerPoint but with recursion
+        handler = process("testPPT_embeded.ppt", extractor, true);
+        assertEquals(11, handler.filenames.size());
+        assertEquals(11, handler.mediaTypes.size());
+
+        assertEquals("1", handler.filenames.get(0));
+        assertEquals(null, handler.filenames.get(1));
+        assertEquals("2", handler.filenames.get(2));
+        assertEquals("image1.png", handler.filenames.get(3));
+        assertEquals("image2.jpg", handler.filenames.get(4));
+        assertEquals("image3.png", handler.filenames.get(5));
+        assertEquals(null, handler.filenames.get(6));
+        assertEquals(null, handler.filenames.get(7));
+        assertEquals(null, handler.filenames.get(8));
+        assertEquals(null, handler.filenames.get(9));
+        assertEquals(null, handler.filenames.get(10));
+
+        assertEquals(TYPE_XLS, handler.mediaTypes.get(0)); // Embedded office doc
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(1)); //    PNG inside .xls
+        assertEquals(TYPE_DOC, handler.mediaTypes.get(2)); // Embedded office doc
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(3));  //    PNG inside .docx
+        assertEquals(TYPE_JPG, handler.mediaTypes.get(4));  //    JPG inside .docx
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(5));  //    PNG inside .docx
+        assertEquals(TYPE_EMF, handler.mediaTypes.get(6)); // Icon of embedded office doc
+        assertEquals(TYPE_EMF, handler.mediaTypes.get(7)); // Icon of embedded office doc
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(8)); // Embedded image
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(9)); // Embedded image
+        assertEquals(TYPE_PNG, handler.mediaTypes.get(10)); // Embedded image
+
+
+        // Word, with a non-office file (PDF)
+        handler = process("testWORD_embedded_pdf.doc", extractor, true);
+        assertEquals(2, handler.filenames.size());
+        assertEquals(2, handler.mediaTypes.size());
+
+        assertEquals("image1.emf", handler.filenames.get(0));
+        assertEquals("_1402837031.pdf", handler.filenames.get(1));
+
+        assertEquals(TYPE_EMF, handler.mediaTypes.get(0)); // Icon of embedded pdf
+        assertEquals(TYPE_PDF, handler.mediaTypes.get(1)); // The embedded PDF itself
+
+
+        // Outlook with a text file and a word document
+        handler = process("testMSG_att_doc.msg", extractor, true);
+        assertEquals(2, handler.filenames.size());
+        assertEquals(2, handler.mediaTypes.size());
+
+        assertEquals("test-unicode.doc", handler.filenames.get(0));
+        assertEquals(TYPE_DOC, handler.mediaTypes.get(0));
+
+        assertEquals("pj1.txt", handler.filenames.get(1));
+        assertEquals(TYPE_TXT, handler.mediaTypes.get(1));
+
+
+        // Outlook with a pdf and another outlook message
+        handler = process("testMSG_att_msg.msg", extractor, true);
+        assertEquals(2, handler.filenames.size());
+        assertEquals(2, handler.mediaTypes.size());
+
+        assertEquals("__substg1.0_3701000D.msg", handler.filenames.get(0));
+        assertEquals(TYPE_MSG, handler.mediaTypes.get(0));
+
+        assertEquals("smbprn.00009008.KdcPjl.pdf", handler.filenames.get(1));
+        assertEquals(TYPE_PDF, handler.mediaTypes.get(1));
+    }
+
+    @Test
+    public void testEmbeddedOfficeFilesXML() throws Exception {
+        ContainerExtractor extractor = new ParserContainerExtractor();
+        TrackingHandler handler;
+
+        handler = process("EmbeddedDocument.docx", extractor, false);
+        assertTrue(handler.filenames.contains("Microsoft_Office_Excel_97-2003_Worksheet1.bin"));
+        assertEquals(2, handler.filenames.size());
+    }
+
+    @Test
+    public void testPowerpointImages() throws Exception {
+        ContainerExtractor extractor = new ParserContainerExtractor();
+        TrackingHandler handler;
+
+        handler = process("pictures.ppt", extractor, false);
+        assertTrue(handler.mediaTypes.contains(new MediaType("image", "jpeg")));
+        assertTrue(handler.mediaTypes.contains(new MediaType("image", "png")));
+    }
+
+    @Test
+    public void testEmbeddedStorageId() throws Exception {
+
+        List<Metadata> list = getRecursiveJson("testWORD_embeded.doc");
+        //.docx
+        assertEquals("{F4754C9B-64F5-4B40-8AF4-679732AC0607}",
+                list.get(10).get(TikaMetadataKeys.EMBEDDED_STORAGE_CLASS_ID));
+        //_1345471035.ppt
+        assertEquals("{64818D10-4F9B-11CF-86EA-00AA00B929E8}",
+                list.get(14).get(TikaMetadataKeys.EMBEDDED_STORAGE_CLASS_ID));
+        //_1345470949.xls
+        assertEquals("{00020820-0000-0000-C000-000000000046}",
+                list.get(16).get(TikaMetadataKeys.EMBEDDED_STORAGE_CLASS_ID));
+
+    }
+
+    @Test
+    public void testEmbeddedGraphChart() throws Exception {
+        //doc converts a chart to a actual xls file
+        //so we only need to look in ppt and xls
+        for (String suffix : new String[]{"ppt", "xls"}) {
+            List<Metadata> list = getRecursiveJson("testMSChart-govdocs-428996."+suffix);
+            boolean found = false;
+            for (Metadata m : list) {
+                if (m.get(Metadata.CONTENT_TYPE).equals(POIFSContainerDetector.MS_GRAPH_CHART.toString())) {
+                    found = true;
+                }
+                assertNull(m.get(RecursiveParserWrapper.EMBEDDED_EXCEPTION));
+            }
+            assertTrue("didn't find chart in "+suffix, found);
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
index 32d462e..79d53d2 100644
--- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
+++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
@@ -1,251 +1,251 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- * <p/>
- * http://www.apache.org/licenses/LICENSE-2.0
- * <p/>
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.microsoft;
-
-import java.io.InputStream;
-import java.util.List;
-import java.util.Locale;
-
-import static org.junit.Assert.assertEquals;
-
-import org.apache.tika.TikaTest;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.Office;
-import org.apache.tika.metadata.OfficeOpenXMLCore;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.sax.BodyContentHandler;
-import org.junit.Test;
-import org.xml.sax.ContentHandler;
-
-public class PowerPointParserTest extends TikaTest {
-
-    @Test
-    public void testPowerPointParser() throws Exception {
-        try (InputStream input = PowerPointParserTest.class.getResourceAsStream(
-                "/test-documents/testPPT.ppt")) {
-            Metadata metadata = new Metadata();
-            ContentHandler handler = new BodyContentHandler();
-            new OfficeParser().parse(input, handler, metadata, new ParseContext());
-
-            assertEquals(
-                    "application/vnd.ms-powerpoint",
-                    metadata.get(Metadata.CONTENT_TYPE));
-            assertEquals("Sample Powerpoint Slide", metadata.get(TikaCoreProperties.TITLE));
-            assertEquals("Keith Bennett", metadata.get(TikaCoreProperties.CREATOR));
-            assertEquals("Keith Bennett", metadata.get(Metadata.AUTHOR));
-            String content = handler.toString();
-            assertContains("Sample Powerpoint Slide", content);
-            assertContains("Powerpoint X for Mac", content);
-        }
-    }
-
-    @Test
-    public void testVarious() throws Exception {
-        Metadata metadata = new Metadata();
-        String xml = getXML("testPPT_various.ppt", metadata).xml;
-        assertContains("<p>Footnote appears here", xml);
-        assertContains("<p>[1] This is a footnote.", xml);
-        assertContains("<p>This is the header text.</p>", xml);
-        assertContains("<p>This is the footer text.</p>", xml);
-        assertContains("<p>Here is a text box</p>", xml);
-        assertContains("<p>Bold ", xml);
-        assertContains("italic underline superscript subscript", xml);
-        assertContains("underline", xml);
-        assertContains("superscript", xml);
-        assertContains("subscript", xml);
-        assertContains("<p>Here is a citation:", xml);
-        assertContains("Figure 1 This is a caption for Figure 1", xml);
-        assertContains("(Kramer)", xml);
-        assertContains("<table><tr>\t<td>Row 1 Col 1</td>", xml);
-        assertContains("<td>Row 2 Col 2</td>\t<td>Row 2 Col 3</td></tr>", xml);
-        assertContains("<p>Row 1 column 1</p>", xml);
-        assertContains("<p>Row 2 column 2</p>", xml);
-        assertContains("<p><a href=\"http://tika.apache.org/\">This is a hyperlink</a>", xml);
-        assertContains("<p>Here is a list:", xml);
-        for(int row=1;row<=3;row++) {
-            //assertContains("�\tBullet " + row, content);
-            //assertContains("\u00b7\tBullet " + row, content);
-            assertContains("<li>Bullet " + row, xml);
-        }
-        assertContains("Here is a numbered list:", xml);
-        for(int row=1;row<=3;row++) {
-            //assertContains(row + ")\tNumber bullet " + row, content);
-            //assertContains(row + ") Number bullet " + row, content);
-            // TODO: OOXMLExtractor fails to number the bullets:
-            assertContains("<li>Number bullet " + row, xml);
-        }
-
-        for(int row=1;row<=2;row++) {
-            for(int col=1;col<=3;col++) {
-                assertContains("Row " + row + " Col " + col, xml);
-            }
-        }
-        assertContains("Keyword1 Keyword2", xml);
-        assertEquals("Keyword1 Keyword2",
-                     metadata.get(TikaCoreProperties.KEYWORDS));
-
-        assertContains("Subject is here", xml);
-        assertEquals("Subject is here",
-                     metadata.get(OfficeOpenXMLCore.SUBJECT));
-        // TODO: Remove subject in Tika 2.0
-        assertEquals("Subject is here",
-                     metadata.get(Metadata.SUBJECT));
-
-        assertContains("Suddenly some Japanese text:", xml);
-        // Special version of (GHQ)
-        assertContains("\uff08\uff27\uff28\uff31\uff09", xml);
-        // 6 other characters
-        assertContains("\u30be\u30eb\u30b2\u3068\u5c3e\u5d0e\u3001\u6de1\u3005\u3068\u6700\u671f",
-                xml);
-
-        assertContains("And then some Gothic text:", xml);
-        assertContains("\uD800\uDF32\uD800\uDF3f\uD800\uDF44\uD800\uDF39\uD800\uDF43\uD800\uDF3A",
-                xml);
-    }
-
-    @Test
-    public void testMasterFooter() throws Exception {
-        ContentHandler handler = new BodyContentHandler();
-        Metadata metadata = new Metadata();
-
-        try (InputStream stream = PowerPointParserTest.class.getResourceAsStream(
-                "/test-documents/testPPT_masterFooter.ppt")) {
-            new OfficeParser().parse(stream, handler, metadata, new ParseContext());
-        }
-
-        String content = handler.toString();
-        assertContains("Master footer is here", content);
-
-        // Make sure boilerplate text didn't come through:
-        assertEquals(-1, content.indexOf("Click to edit Master"));
-
-        //TIKA-1171
-        assertEquals(-1, content.indexOf("*"));
-    }
-
-    /**
-     * TIKA-712 Master Slide Text from PPT and PPTX files
-     *  should be extracted too
-     */
-    @Test
-    public void testMasterText() throws Exception {
-        ContentHandler handler = new BodyContentHandler();
-        Metadata metadata = new Metadata();
-
-        try (InputStream stream = PowerPointParserTest.class.getResourceAsStream(
-                "/test-documents/testPPT_masterText.ppt")) {
-            new OfficeParser().parse(stream, handler, metadata, new ParseContext());
-        }
-
-        String content = handler.toString();
-        assertContains("Text that I added to the master slide", content);
-
-        // Make sure boilerplate text didn't come through:
-        assertEquals(-1, content.indexOf("Click to edit Master"));
-
-        //TIKA-1171
-        assertEquals(-1, content.indexOf("*"));
-    }
-
-    @Test
-    public void testMasterText2() throws Exception {
-        ContentHandler handler = new BodyContentHandler();
-        Metadata metadata = new Metadata();
-
-        try (InputStream stream = PowerPointParserTest.class.getResourceAsStream(
-                "/test-documents/testPPT_masterText2.ppt")) {
-            new OfficeParser().parse(stream, handler, metadata, new ParseContext());
-        }
-
-        String content = handler.toString();
-        assertContains("Text that I added to the master slide", content);
-
-        // Make sure boilerplate text didn't come through:
-        assertEquals(-1, content.indexOf("Click to edit Master"));
-        //TIKA-1171
-        assertEquals(-1, content.indexOf("*"));
-    }
-
-    /**
-     * Ensures that custom OLE2 (HPSF) properties are extracted
-     */
-    @Test
-    public void testCustomProperties() throws Exception {
-        Metadata metadata = new Metadata();
-
-        try (InputStream input = PowerPointParserTest.class.getResourceAsStream(
-                "/test-documents/testPPT_custom_props.ppt")) {
-            ContentHandler handler = new BodyContentHandler(-1);
-            ParseContext context = new ParseContext();
-            context.set(Locale.class, Locale.US);
-            new OfficeParser().parse(input, handler, metadata, context);
-        }
-
-        assertEquals("application/vnd.ms-powerpoint", metadata.get(Metadata.CONTENT_TYPE));
-        assertEquals("JOUVIN ETIENNE", metadata.get(TikaCoreProperties.CREATOR));
-        assertEquals("EJ04325S", metadata.get(TikaCoreProperties.MODIFIER));
-        assertEquals("EJ04325S", metadata.get(Metadata.LAST_AUTHOR));
-        assertEquals("2011-08-22T13:32:58Z", metadata.get(TikaCoreProperties.MODIFIED));
-        assertEquals("2011-08-22T13:32:58Z", metadata.get(Metadata.DATE));
-        assertEquals("2011-08-22T13:30:53Z", metadata.get(TikaCoreProperties.CREATED));
-        assertEquals("2011-08-22T13:30:53Z", metadata.get(Metadata.CREATION_DATE));
-        assertEquals("1", metadata.get(Office.SLIDE_COUNT));
-        assertEquals("3", metadata.get(Office.WORD_COUNT));
-        assertEquals("Test extraction properties pptx", metadata.get(TikaCoreProperties.TITLE));
-        assertEquals("true", metadata.get("custom:myCustomBoolean"));
-        assertEquals("3", metadata.get("custom:myCustomNumber"));
-        assertEquals("MyStringValue", metadata.get("custom:MyCustomString"));
-        assertEquals("2010-12-30T22:00:00Z", metadata.get("custom:MyCustomDate"));
-        assertEquals("2010-12-29T22:00:00Z", metadata.get("custom:myCustomSecondDate"));
-    }
-
-    // TIKA-1025
-    @Test
-    public void testEmbeddedPlacedholder() throws Exception {
-        XMLResult result = getXML("testPPT_embedded2.ppt");
-        assertContains("<div class=\"embedded\" id=\"1\" />", result.xml);
-        assertContains("<div class=\"embedded\" id=\"14\" />", result.xml);
-    }
-
-    // TIKA-817
-    @Test
-    public void testAutoDatePPT() throws Exception {
-        //decision was made in POI-52367 not to generate
-        //autodate automatically.  For pptx, where value is stored,
-        //value is extracted.  For ppt, however, no date is extracted.
-        XMLResult result = getXML("testPPT_autodate.ppt");
-        assertContains(
-                "<div class=\"slide-content\"><p>Now</p>",
-                result.xml);
-    }
-
-    @Test
-    public void testCommentAuthorship() throws Exception {
-        XMLResult r = getXML("testPPT_comment.ppt");
-        assertContains("<p class=\"slide-comment\"><b>Allison, Timothy B. (ATB)", r.xml);
-    }
-
-    @Test
-    public void testEmbeddedPDF() throws Exception {
-        List<Metadata> metadataList = getRecursiveJson("testPPT_embeddedPDF.ppt");
-        assertEquals("application/pdf", metadataList.get(1).get(Metadata.CONTENT_TYPE));
-        assertEquals("3.pdf", metadataList.get(1).get(Metadata.RESOURCE_NAME_KEY));
-        assertEquals("application/pdf", metadataList.get(2).get(Metadata.CONTENT_TYPE));
-        assertEquals("4.pdf", metadataList.get(2).get(Metadata.RESOURCE_NAME_KEY));
-    }
-}
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ * <p/>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p/>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import java.io.InputStream;
+import java.util.List;
+import java.util.Locale;
+
+import static org.junit.Assert.assertEquals;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Office;
+import org.apache.tika.metadata.OfficeOpenXMLCore;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+public class PowerPointParserTest extends TikaTest {
+
+    @Test
+    public void testPowerPointParser() throws Exception {
+        try (InputStream input = PowerPointParserTest.class.getResourceAsStream(
+                "/test-documents/testPPT.ppt")) {
+            Metadata metadata = new Metadata();
+            ContentHandler handler = new BodyContentHandler();
+            new OfficeParser().parse(input, handler, metadata, new ParseContext());
+
+            assertEquals(
+                    "application/vnd.ms-powerpoint",
+                    metadata.get(Metadata.CONTENT_TYPE));
+            assertEquals("Sample Powerpoint Slide", metadata.get(TikaCoreProperties.TITLE));
+            assertEquals("Keith Bennett", metadata.get(TikaCoreProperties.CREATOR));
+            assertEquals("Keith Bennett", metadata.get(Metadata.AUTHOR));
+            String content = handler.toString();
+            assertContains("Sample Powerpoint Slide", content);
+            assertContains("Powerpoint X for Mac", content);
+        }
+    }
+
+    @Test
+    public void testVarious() throws Exception {
+        Metadata metadata = new Metadata();
+        String xml = getXML("testPPT_various.ppt", metadata).xml;
+        assertContains("<p>Footnote appears here", xml);
+        assertContains("<p>[1] This is a footnote.", xml);
+        assertContains("<p>This is the header text.</p>", xml);
+        assertContains("<p>This is the footer text.</p>", xml);
+        assertContains("<p>Here is a text box</p>", xml);
+        assertContains("<p>Bold ", xml);
+        assertContains("italic underline superscript subscript", xml);
+        assertContains("underline", xml);
+        assertContains("superscript", xml);
+        assertContains("subscript", xml);
+        assertContains("<p>Here is a citation:", xml);
+        assertContains("Figure 1 This is a caption for Figure 1", xml);
+        assertContains("(Kramer)", xml);
+        assertContains("<table><tr>\t<td>Row 1 Col 1</td>", xml);
+        assertContains("<td>Row 2 Col 2</td>\t<td>Row 2 Col 3</td></tr>", xml);
+        assertContains("<p>Row 1 column 1</p>", xml);
+        assertContains("<p>Row 2 column 2</p>", xml);
+        assertContains("<p><a href=\"http://tika.apache.org/\">This is a hyperlink</a>", xml);
+        assertContains("<p>Here is a list:", xml);
+        for(int row=1;row<=3;row++) {
+            //assertContains("�\tBullet " + row, content);
+            //assertContains("\u00b7\tBullet " + row, content);
+            assertContains("<li>Bullet " + row, xml);
+        }
+        assertContains("Here is a numbered list:", xml);
+        for(int row=1;row<=3;row++) {
+            //assertContains(row + ")\tNumber bullet " + row, content);
+            //assertContains(row + ") Number bullet " + row, content);
+            // TODO: OOXMLExtractor fails to number the bullets:
+            assertContains("<li>Number bullet " + row, xml);
+        }
+
+        for(int row=1;row<=2;row++) {
+            for(int col=1;col<=3;col++) {
+                assertContains("Row " + row + " Col " + col, xml);
+            }
+        }
+        assertContains("Keyword1 Keyword2", xml);
+        assertEquals("Keyword1 Keyword2",
+                     metadata.get(TikaCoreProperties.KEYWORDS));
+
+        assertContains("Subject is here", xml);
+        assertEquals("Subject is here",
+                     metadata.get(OfficeOpenXMLCore.SUBJECT));
+        // TODO: Remove subject in Tika 2.0
+        assertEquals("Subject is here",
+                     metadata.get(Metadata.SUBJECT));
+
+        assertContains("Suddenly some Japanese text:", xml);
+        // Special version of (GHQ)
+        assertContains("\uff08\uff27\uff28\uff31\uff09", xml);
+        // 6 other characters
+        assertContains("\u30be\u30eb\u30b2\u3068\u5c3e\u5d0e\u3001\u6de1\u3005\u3068\u6700\u671f",
+                xml);
+
+        assertContains("And then some Gothic text:", xml);
+        assertContains("\uD800\uDF32\uD800\uDF3f\uD800\uDF44\uD800\uDF39\uD800\uDF43\uD800\uDF3A",
+                xml);
+    }
+
+    @Test
+    public void testMasterFooter() throws Exception {
+        ContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+
+        try (InputStream stream = PowerPointParserTest.class.getResourceAsStream(
+                "/test-documents/testPPT_masterFooter.ppt")) {
+            new OfficeParser().parse(stream, handler, metadata, new ParseContext());
+        }
+
+        String content = handler.toString();
+        assertContains("Master footer is here", content);
+
+        // Make sure boilerplate text didn't come through:
+        assertEquals(-1, content.indexOf("Click to edit Master"));
+
+        //TIKA-1171
+        assertEquals(-1, content.indexOf("*"));
+    }
+
+    /**
+     * TIKA-712 Master Slide Text from PPT and PPTX files
+     *  should be extracted too
+     */
+    @Test
+    public void testMasterText() throws Exception {
+        ContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+
+        try (InputStream stream = PowerPointParserTest.class.getResourceAsStream(
+                "/test-documents/testPPT_masterText.ppt")) {
+            new OfficeParser().parse(stream, handler, metadata, new ParseContext());
+        }
+
+        String content = handler.toString();
+        assertContains("Text that I added to the master slide", content);
+
+        // Make sure boilerplate text didn't come through:
+        assertEquals(-1, content.indexOf("Click to edit Master"));
+
+        //TIKA-1171
+        assertEquals(-1, content.indexOf("*"));
+    }
+
+    @Test
+    public void testMasterText2() throws Exception {
+        ContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+
+        try (InputStream stream = PowerPointParserTest.class.getResourceAsStream(
+                "/test-documents/testPPT_masterText2.ppt")) {
+            new OfficeParser().parse(stream, handler, metadata, new ParseContext());
+        }
+
+        String content = handler.toString();
+        assertContains("Text that I added to the master slide", content);
+
+        // Make sure boilerplate text didn't come through:
+        assertEquals(-1, content.indexOf("Click to edit Master"));
+        //TIKA-1171
+        assertEquals(-1, content.indexOf("*"));
+    }
+
+    /**
+     * Ensures that custom OLE2 (HPSF) properties are extracted
+     */
+    @Test
+    public void testCustomProperties() throws Exception {
+        Metadata metadata = new Metadata();
+
+        try (InputStream input = PowerPointParserTest.class.getResourceAsStream(
+                "/test-documents/testPPT_custom_props.ppt")) {
+            ContentHandler handler = new BodyContentHandler(-1);
+            ParseContext context = new ParseContext();
+            context.set(Locale.class, Locale.US);
+            new OfficeParser().parse(input, handler, metadata, context);
+        }
+
+        assertEquals("application/vnd.ms-powerpoint", metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("JOUVIN ETIENNE", metadata.get(TikaCoreProperties.CREATOR));
+        assertEquals("EJ04325S", metadata.get(TikaCoreProperties.MODIFIER));
+        assertEquals("EJ04325S", metadata.get(Metadata.LAST_AUTHOR));
+        assertEquals("2011-08-22T13:32:58Z", metadata.get(TikaCoreProperties.MODIFIED));
+        assertEquals("2011-08-22T13:32:58Z", metadata.get(Metadata.DATE));
+        assertEquals("2011-08-22T13:30:53Z", metadata.get(TikaCoreProperties.CREATED));
+        assertEquals("2011-08-22T13:30:53Z", metadata.get(Metadata.CREATION_DATE));
+        assertEquals("1", metadata.get(Office.SLIDE_COUNT));
+        assertEquals("3", metadata.get(Office.WORD_COUNT));
+        assertEquals("Test extraction properties pptx", metadata.get(TikaCoreProperties.TITLE));
+        assertEquals("true", metadata.get("custom:myCustomBoolean"));
+        assertEquals("3", metadata.get("custom:myCustomNumber"));
+        assertEquals("MyStringValue", metadata.get("custom:MyCustomString"));
+        assertEquals("2010-12-30T22:00:00Z", metadata.get("custom:MyCustomDate"));
+        assertEquals("2010-12-29T22:00:00Z", metadata.get("custom:myCustomSecondDate"));
+    }
+
+    // TIKA-1025
+    @Test
+    public void testEmbeddedPlacedholder() throws Exception {
+        XMLResult result = getXML("testPPT_embedded2.ppt");
+        assertContains("<div class=\"embedded\" id=\"1\" />", result.xml);
+        assertContains("<div class=\"embedded\" id=\"14\" />", result.xml);
+    }
+
+    // TIKA-817
+    @Test
+    public void testAutoDatePPT() throws Exception {
+        //decision was made in POI-52367 not to generate
+        //autodate automatically.  For pptx, where value is stored,
+        //value is extracted.  For ppt, however, no date is extracted.
+        XMLResult result = getXML("testPPT_autodate.ppt");
+        assertContains(
+                "<div class=\"slide-content\"><p>Now</p>",
+                result.xml);
+    }
+
+    @Test
+    public void testCommentAuthorship() throws Exception {
+        XMLResult r = getXML("testPPT_comment.ppt");
+        assertContains("<p class=\"slide-comment\"><b>Allison, Timothy B. (ATB)", r.xml);
+    }
+
+    @Test
+    public void testEmbeddedPDF() throws Exception {
+        List<Metadata> metadataList = getRecursiveJson("testPPT_embeddedPDF.ppt");
+        assertEquals("application/pdf", metadataList.get(1).get(Metadata.CONTENT_TYPE));
+        assertEquals("3.pdf", metadataList.get(1).get(Metadata.RESOURCE_NAME_KEY));
+        assertEquals("application/pdf", metadataList.get(2).get(Metadata.CONTENT_TYPE));
+        assertEquals("4.pdf", metadataList.get(2).get(Metadata.RESOURCE_NAME_KEY));
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/PublisherParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/PublisherParserTest.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/PublisherParserTest.java
index a3ccefc..a37e44d 100644
--- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/PublisherParserTest.java
+++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/PublisherParserTest.java
@@ -1,53 +1,53 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- * <p/>
- * http://www.apache.org/licenses/LICENSE-2.0
- * <p/>
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.microsoft;
-
-import static org.apache.tika.TikaTest.assertContains;
-import static org.junit.Assert.assertEquals;
-
-import java.io.InputStream;
-
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.sax.BodyContentHandler;
-import org.junit.Test;
-import org.xml.sax.ContentHandler;
-
-public class PublisherParserTest {
-
-    @Test
-    public void testPublisherParser() throws Exception {
-        try (InputStream input = PublisherParserTest.class.getResourceAsStream(
-                "/test-documents/testPUBLISHER.pub")) {
-            Metadata metadata = new Metadata();
-            ContentHandler handler = new BodyContentHandler();
-            new OfficeParser().parse(input, handler, metadata, new ParseContext());
-
-            assertEquals(
-                    "application/x-mspublisher",
-                    metadata.get(Metadata.CONTENT_TYPE));
-            assertEquals(null, metadata.get(TikaCoreProperties.TITLE));
-            assertEquals("Nick Burch", metadata.get(TikaCoreProperties.CREATOR));
-            assertEquals("Nick Burch", metadata.get(Metadata.AUTHOR));
-            String content = handler.toString();
-            assertContains("0123456789", content);
-            assertContains("abcdef", content);
-        }
-    }
-
-}
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ * <p/>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p/>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import static org.apache.tika.TikaTest.assertContains;
+import static org.junit.Assert.assertEquals;
+
+import java.io.InputStream;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+public class PublisherParserTest {
+
+    @Test
+    public void testPublisherParser() throws Exception {
+        try (InputStream input = PublisherParserTest.class.getResourceAsStream(
+                "/test-documents/testPUBLISHER.pub")) {
+            Metadata metadata = new Metadata();
+            ContentHandler handler = new BodyContentHandler();
+            new OfficeParser().parse(input, handler, metadata, new ParseContext());
+
+            assertEquals(
+                    "application/x-mspublisher",
+                    metadata.get(Metadata.CONTENT_TYPE));
+            assertEquals(null, metadata.get(TikaCoreProperties.TITLE));
+            assertEquals("Nick Burch", metadata.get(TikaCoreProperties.CREATOR));
+            assertEquals("Nick Burch", metadata.get(Metadata.AUTHOR));
+            String content = handler.toString();
+            assertContains("0123456789", content);
+            assertContains("abcdef", content);
+        }
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/TNEFParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/TNEFParserTest.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/TNEFParserTest.java
index 4edb5ee..8062555 100644
--- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/TNEFParserTest.java
+++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/TNEFParserTest.java
@@ -1,98 +1,98 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.microsoft;
-
-import static org.junit.Assert.assertEquals;
-
-import org.apache.tika.TikaTest.TrackingHandler;
-import org.apache.tika.detect.DefaultDetector;
-import org.apache.tika.detect.Detector;
-import org.apache.tika.extractor.ContainerExtractor;
-import org.apache.tika.extractor.ParserContainerExtractor;
-import org.apache.tika.io.TikaInputStream;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.sax.BodyContentHandler;
-import org.junit.Test;
-import org.xml.sax.ContentHandler;
-
-/**
- * Tests for the TNEF (winmail.dat) parser
- */
-public class TNEFParserTest extends AbstractPOIContainerExtractionTest {
-    private static final String file = "testWINMAIL.dat";
-
-    @Test
-    public void testBasics() throws Exception {
-        Detector detector = new DefaultDetector();
-        try (TikaInputStream stream = getTestFile(file)) {
-            assertEquals(
-                    MediaType.application("vnd.ms-tnef"),
-                    detector.detect(stream, new Metadata()));
-        }
-    }
-
-    @Test
-    public void testMetadata() throws Exception {
-        TikaInputStream stream = getTestFile(file);
-
-        Metadata metadata = new Metadata();
-        ContentHandler handler = new BodyContentHandler();
-
-        TNEFParser tnef = new TNEFParser();
-        tnef.parse(stream, handler, metadata, new ParseContext());
-
-        assertEquals("This is a test message", metadata.get(TikaCoreProperties.TITLE));
-        assertEquals("This is a test message", metadata.get(Metadata.SUBJECT));
-    }
-
-    /**
-     * Check the Rtf and Attachments are returned
-     * as expected
-     */
-    @Test
-    public void testBodyAndAttachments() throws Exception {
-        ContainerExtractor extractor = new ParserContainerExtractor();
-
-        // Process it with recursing
-        // Will have the message body RTF and the attachments
-        TrackingHandler handler = process(file, extractor, true);
-        assertEquals(6, handler.filenames.size());
-        assertEquals(6, handler.mediaTypes.size());
-
-        // We know the filenames for all of them
-        assertEquals("message.rtf", handler.filenames.get(0));
-        assertEquals(MediaType.application("rtf"), handler.mediaTypes.get(0));
-
-        assertEquals("quick.doc", handler.filenames.get(1));
-        assertEquals(MediaType.application("msword"), handler.mediaTypes.get(1));
-
-        assertEquals("quick.html", handler.filenames.get(2));
-        assertEquals(MediaType.text("html"), handler.mediaTypes.get(2));
-
-        assertEquals("quick.pdf", handler.filenames.get(3));
-        assertEquals(MediaType.application("pdf"), handler.mediaTypes.get(3));
-
-        assertEquals("quick.txt", handler.filenames.get(4));
-        assertEquals(MediaType.text("plain"), handler.mediaTypes.get(4));
-
-        assertEquals("quick.xml", handler.filenames.get(5));
-        assertEquals(MediaType.application("xml"), handler.mediaTypes.get(5));
-    }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import static org.junit.Assert.assertEquals;
+
+import org.apache.tika.TikaTest.TrackingHandler;
+import org.apache.tika.detect.DefaultDetector;
+import org.apache.tika.detect.Detector;
+import org.apache.tika.extractor.ContainerExtractor;
+import org.apache.tika.extractor.ParserContainerExtractor;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+/**
+ * Tests for the TNEF (winmail.dat) parser
+ */
+public class TNEFParserTest extends AbstractPOIContainerExtractionTest {
+    private static final String file = "testWINMAIL.dat";
+
+    @Test
+    public void testBasics() throws Exception {
+        Detector detector = new DefaultDetector();
+        try (TikaInputStream stream = getTestFile(file)) {
+            assertEquals(
+                    MediaType.application("vnd.ms-tnef"),
+                    detector.detect(stream, new Metadata()));
+        }
+    }
+
+    @Test
+    public void testMetadata() throws Exception {
+        TikaInputStream stream = getTestFile(file);
+
+        Metadata metadata = new Metadata();
+        ContentHandler handler = new BodyContentHandler();
+
+        TNEFParser tnef = new TNEFParser();
+        tnef.parse(stream, handler, metadata, new ParseContext());
+
+        assertEquals("This is a test message", metadata.get(TikaCoreProperties.TITLE));
+        assertEquals("This is a test message", metadata.get(Metadata.SUBJECT));
+    }
+
+    /**
+     * Check the Rtf and Attachments are returned
+     * as expected
+     */
+    @Test
+    public void testBodyAndAttachments() throws Exception {
+        ContainerExtractor extractor = new ParserContainerExtractor();
+
+        // Process it with recursing
+        // Will have the message body RTF and the attachments
+        TrackingHandler handler = process(file, extractor, true);
+        assertEquals(6, handler.filenames.size());
+        assertEquals(6, handler.mediaTypes.size());
+
+        // We know the filenames for all of them
+        assertEquals("message.rtf", handler.filenames.get(0));
+        assertEquals(MediaType.application("rtf"), handler.mediaTypes.get(0));
+
+        assertEquals("quick.doc", handler.filenames.get(1));
+        assertEquals(MediaType.application("msword"), handler.mediaTypes.get(1));
+
+        assertEquals("quick.html", handler.filenames.get(2));
+        assertEquals(MediaType.text("html"), handler.mediaTypes.get(2));
+
+        assertEquals("quick.pdf", handler.filenames.get(3));
+        assertEquals(MediaType.application("pdf"), handler.mediaTypes.get(3));
+
+        assertEquals("quick.txt", handler.filenames.get(4));
+        assertEquals(MediaType.text("plain"), handler.mediaTypes.get(4));
+
+        assertEquals("quick.xml", handler.filenames.get(5));
+        assertEquals(MediaType.application("xml"), handler.mediaTypes.get(5));
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/VisioParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/VisioParserTest.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/VisioParserTest.java
index 3002187..06320fe 100644
--- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/VisioParserTest.java
+++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/VisioParserTest.java
@@ -1,51 +1,51 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- * <p/>
- * http://www.apache.org/licenses/LICENSE-2.0
- * <p/>
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.microsoft;
-
-import static org.apache.tika.TikaTest.assertContains;
-import static org.junit.Assert.assertEquals;
-
-import java.io.InputStream;
-
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.sax.BodyContentHandler;
-import org.junit.Test;
-import org.xml.sax.ContentHandler;
-
-public class VisioParserTest {
-
-    @Test
-    public void testVisioParser() throws Exception {
-        try (InputStream input = VisioParserTest.class.getResourceAsStream(
-                "/test-documents/testVISIO.vsd")) {
-            Metadata metadata = new Metadata();
-            ContentHandler handler = new BodyContentHandler();
-            new OfficeParser().parse(input, handler, metadata, new ParseContext());
-
-            assertEquals(
-                    "application/vnd.visio",
-                    metadata.get(Metadata.CONTENT_TYPE));
-            assertEquals("", metadata.get(TikaCoreProperties.TITLE));
-            assertEquals("Hogwarts", metadata.get(TikaCoreProperties.CREATOR));
-            String content = handler.toString();
-            assertContains("Some random text, on a page", content);
-        }
-    }
-
-}
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ * <p/>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p/>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import static org.apache.tika.TikaTest.assertContains;
+import static org.junit.Assert.assertEquals;
+
+import java.io.InputStream;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+public class VisioParserTest {
+
+    @Test
+    public void testVisioParser() throws Exception {
+        try (InputStream input = VisioParserTest.class.getResourceAsStream(
+                "/test-documents/testVISIO.vsd")) {
+            Metadata metadata = new Metadata();
+            ContentHandler handler = new BodyContentHandler();
+            new OfficeParser().parse(input, handler, metadata, new ParseContext());
+
+            assertEquals(
+                    "application/vnd.visio",
+                    metadata.get(Metadata.CONTENT_TYPE));
+            assertEquals("", metadata.get(TikaCoreProperties.TITLE));
+            assertEquals("Hogwarts", metadata.get(TikaCoreProperties.CREATOR));
+            String content = handler.toString();
+            assertContains("Some random text, on a page", content);
+        }
+    }
+
+}