You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/06/29 11:11:23 UTC

[17/39] tika git commit: Convert new lines from windows to unix

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
index 196ffa9..4ea3fa1 100644
--- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
+++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
@@ -1,412 +1,412 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- * <p/>
- * http://www.apache.org/licenses/LICENSE-2.0
- * <p/>
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.microsoft;
-
-import java.io.InputStream;
-import java.util.List;
-import java.util.Locale;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertTrue;
-import static org.junit.Assert.fail;
-
-import org.apache.tika.TikaTest;
-import org.apache.tika.detect.DefaultDetector;
-import org.apache.tika.detect.Detector;
-import org.apache.tika.exception.EncryptedDocumentException;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.Office;
-import org.apache.tika.metadata.OfficeOpenXMLExtended;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.AutoDetectParser;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.PasswordProvider;
-import org.apache.tika.parser.microsoft.ooxml.OOXMLParser;
-import org.apache.tika.sax.BodyContentHandler;
-import org.junit.Test;
-import org.xml.sax.ContentHandler;
-
-public class ExcelParserTest extends TikaTest {
-    @Test
-    @SuppressWarnings("deprecation") // Checks legacy Tika-1.0 style metadata keys
-    public void testExcelParser() throws Exception {
-
-        ParseContext context = new ParseContext();
-        context.set(Locale.class, Locale.US);
-        XMLResult r = getXML("testEXCEL.xls", new OfficeParser(), new Metadata(), context);
-
-        assertEquals(
-                "application/vnd.ms-excel",
-                r.metadata.get(Metadata.CONTENT_TYPE));
-        assertEquals("Simple Excel document", r.metadata.get(TikaCoreProperties.TITLE));
-        assertEquals("Keith Bennett", r.metadata.get(TikaCoreProperties.CREATOR));
-        assertEquals("Keith Bennett", r.metadata.get(Metadata.AUTHOR));
-
-        // Mon Oct 01 17:13:56 BST 2007
-        assertEquals("2007-10-01T16:13:56Z", r.metadata.get(TikaCoreProperties.CREATED));
-        assertEquals("2007-10-01T16:13:56Z", r.metadata.get(Metadata.CREATION_DATE));
-
-        // Mon Oct 01 17:31:43 BST 2007
-        assertEquals("2007-10-01T16:31:43Z", r.metadata.get(TikaCoreProperties.MODIFIED));
-        assertEquals("2007-10-01T16:31:43Z", r.metadata.get(Metadata.DATE));
-
-        String content = r.xml;
-        assertContains("Sample Excel Worksheet", content);
-        assertContains("Numbers and their Squares", content);
-        assertContains("<tr>\t<td />\t<td>Number</td>\t<td>Square", content);
-        assertContains("9", content);
-        assertNotContained("9.0", content);
-        assertContains("196", content);
-        assertNotContained("196.0", content);
-
-    }
-
-    @Test
-    public void testExcelParserFormatting() throws Exception {
-        ParseContext context = new ParseContext();
-        context.set(Locale.class, Locale.US);
-        XMLResult r = getXML("testEXCEL-formats.xls", new OfficeParser(), new Metadata(), context);
-
-        assertEquals(
-                "application/vnd.ms-excel",
-                r.metadata.get(Metadata.CONTENT_TYPE));
-
-        String content = r.xml;
-
-        // Number #,##0.00
-        assertContains("1,599.99", content);
-        assertContains("-1,599.99", content);
-
-        // Currency $#,##0.00;[Red]($#,##0.00)
-        assertContains("$1,599.99", content);
-        assertContains("($1,599.99)", content);
-
-        // Scientific 0.00E+00
-        // poi <=3.8beta1 returns 1.98E08, newer versions return 1.98+E08
-        assertTrue(content.contains("1.98E08") || content.contains("1.98E+08"));
-        assertTrue(content.contains("-1.98E08") || content.contains("-1.98E+08"));
-
-        // Percentage.
-        assertContains("2.50%", content);
-        // Excel rounds up to 3%, but that requires Java 1.6 or later
-        if (System.getProperty("java.version").startsWith("1.5")) {
-            assertContains("2%", content);
-        } else {
-            assertContains("3%", content);
-        }
-
-        // Time Format: h:mm
-        assertContains("6:15", content);
-        assertContains("18:15", content);
-
-        // Date Format: d-mmm-yy
-        assertContains("17-May-07", content);
-
-        // Date Format: m/d/yy
-        assertContains("10/3/09", content);
-
-        // Date/Time Format: m/d/yy h:mm
-        assertContains("1/19/08 4:35", content);
-
-        // Fraction (2.5): # ?/?
-        assertContains("2 1/2", content);
-
-
-        // Below assertions represent outstanding formatting issues to be addressed
-        // they are included to allow the issues to be progressed with the Apache POI
-        // team - See TIKA-103.
-
-        /*************************************************************************
-         // Custom Number (0 "dollars and" .00 "cents")
-         assertContains("19 dollars and .99 cents", content);
-
-         // Custom Number ("At" h:mm AM/PM "on" dddd mmmm d"," yyyy)
-         assertContains("At 4:20 AM on Thursday May 17, 2007", content);
-         **************************************************************************/
-
-
-    }
-
-    @Test
-    public void testExcelParserPassword() throws Exception {
-        try {
-            XMLResult r = getXML("testEXCEL_protected_passtika.xls");
-            fail("Document is encrypted, shouldn't parse");
-        } catch (EncryptedDocumentException e) {
-            // Good
-        }
-
-        // Try again, this time with the password
-        ParseContext context = new ParseContext();
-        context.set(Locale.class, Locale.US);
-        context.set(PasswordProvider.class, new PasswordProvider() {
-            @Override
-            public String getPassword(Metadata metadata) {
-                return "tika";
-            }
-        });
-        XMLResult r = getXML("testEXCEL_protected_passtika.xls", new OfficeParser(), new Metadata(), context);
-
-        assertEquals(
-                "application/vnd.ms-excel",
-                r.metadata.get(Metadata.CONTENT_TYPE));
-
-        assertEquals(null, r.metadata.get(TikaCoreProperties.TITLE));
-        assertEquals("Antoni", r.metadata.get(TikaCoreProperties.CREATOR));
-        assertEquals("2011-11-25T09:52:48Z", r.metadata.get(TikaCoreProperties.CREATED));
-
-        String content = r.xml;
-        assertContains("This is an Encrypted Excel spreadsheet", content);
-        assertNotContained("9.0", content);
-
-    }
-
-    /**
-     * TIKA-214 - Ensure we extract labels etc from Charts
-     */
-    @Test
-    public void testExcelParserCharts() throws Exception {
-
-        XMLResult r = getXML("testEXCEL-charts.xls", new OfficeParser());
-        assertEquals(
-                "application/vnd.ms-excel",
-                r.metadata.get(Metadata.CONTENT_TYPE));
-
-        String content = r.xml;
-
-        // The first sheet has a pie chart
-        assertContains("charttabyodawg", content);
-        assertContains("WhamPuff", content);
-
-        // The second sheet has a bar chart and some text
-        assertContains("Sheet1", content);
-        assertContains("Test Excel Spreasheet", content);
-        assertContains("foo", content);
-        assertContains("bar", content);
-        assertContains("fizzlepuff", content);
-        assertContains("whyaxis", content);
-        assertContains("eksaxis", content);
-
-        // The third sheet has some text
-        assertContains("Sheet2", content);
-        assertContains("dingdong", content);
-
-    }
-
-    @Test
-    public void testJXL() throws Exception {
-
-        XMLResult r = getXML("jxl.xls", new OfficeParser());
-        assertEquals(
-                "application/vnd.ms-excel",
-                r.metadata.get(Metadata.CONTENT_TYPE));
-        assertContains("Number Formats", r.xml);
-
-    }
-
-    @Test
-    public void testWorksSpreadsheet70() throws Exception {
-        assertContains("Microsoft Works",
-                getXML("testWORKSSpreadsheet7.0.xlr", new OfficeParser()).xml);
-    }
-
-    /**
-     * We don't currently support the .xlsb file format 
-     *  (an OOXML container with binary blobs), but we 
-     *  shouldn't break on these files either (TIKA-826)  
-     */
-    @Test
-    public void testExcelXLSB() throws Exception {
-        Detector detector = new DefaultDetector();
-        AutoDetectParser parser = new AutoDetectParser();
-
-        Metadata m = new Metadata();
-        m.add(Metadata.RESOURCE_NAME_KEY, "excel.xlsb");
-
-        // Should be detected correctly
-        MediaType type;
-        try (InputStream input = getTestDocumentAsStream("testEXCEL.xlsb")) {
-            type = detector.detect(input, m);
-            assertEquals("application/vnd.ms-excel.sheet.binary.macroenabled.12", type.toString());
-        }
-
-        // OfficeParser won't handle it
-        assertEquals(false, (new OfficeParser()).getSupportedTypes(new ParseContext()).contains(type));
-
-        // OOXMLParser won't handle it
-        assertEquals(false, (new OOXMLParser()).getSupportedTypes(new ParseContext()).contains(type));
-
-        // AutoDetectParser doesn't break on it
-        assertContains("<body />", getXML("testEXCEL.xlsb").xml);
-
-    }
-
-    /**
-     * Excel 5 and 95 are older formats, and only get basic support
-     */
-    @Test
-    public void testExcel95() throws Exception {
-        Detector detector = new DefaultDetector();
-        AutoDetectParser parser = new AutoDetectParser();
-        MediaType type;
-        Metadata m;
-
-        // First try detection of Excel 5
-        m = new Metadata();
-        m.add(Metadata.RESOURCE_NAME_KEY, "excel_5.xls");
-        try (InputStream input = getTestDocumentAsStream("testEXCEL_5.xls")) {
-            type = detector.detect(input, m);
-            assertEquals("application/vnd.ms-excel", type.toString());
-        }
-
-        // Now Excel 95
-        m = new Metadata();
-        m.add(Metadata.RESOURCE_NAME_KEY, "excel_95.xls");
-        try (InputStream input = getTestDocumentAsStream("testEXCEL_95.xls")) {
-            type = detector.detect(input, m);
-            assertEquals("application/vnd.ms-excel", type.toString());
-        }
-
-        // OfficeParser can handle it
-        assertEquals(true, (new OfficeParser()).getSupportedTypes(new ParseContext()).contains(type));
-
-        // OOXMLParser won't handle it
-        assertEquals(false, (new OOXMLParser()).getSupportedTypes(new ParseContext()).contains(type));
-
-
-        // Parse the Excel 5 file
-        m = new Metadata();
-        try (InputStream input = getTestDocumentAsStream("testEXCEL_5.xls")) {
-            ContentHandler handler = new BodyContentHandler(-1);
-            ParseContext context = new ParseContext();
-            context.set(Locale.class, Locale.US);
-            parser.parse(input, handler, m, context);
-
-            String content = handler.toString();
-
-            // Sheet names
-            assertContains("Feuil1", content);
-            assertContains("Feuil3", content);
-
-            // Text
-            assertContains("Sample Excel", content);
-            assertContains("Number", content);
-
-            // Numbers
-            assertContains("15", content);
-            assertContains("225", content);
-
-            // Metadata was also fetched
-            assertEquals("Simple Excel document", m.get(TikaCoreProperties.TITLE));
-            assertEquals("Keith Bennett", m.get(TikaCoreProperties.CREATOR));
-        }
-
-        // Parse the Excel 95 file
-        m = new Metadata();
-        try (InputStream input = getTestDocumentAsStream("testEXCEL_95.xls")) {
-            ContentHandler handler = new BodyContentHandler(-1);
-            ParseContext context = new ParseContext();
-            context.set(Locale.class, Locale.US);
-            parser.parse(input, handler, m, context);
-
-            String content = handler.toString();
-
-            // Sheet name
-            assertContains("Foglio1", content);
-
-            // Very boring file, no actual text or numbers!
-
-            // Metadata was also fetched
-            assertEquals(null, m.get(TikaCoreProperties.TITLE));
-            assertEquals("Marco Quaranta", m.get(Office.LAST_AUTHOR));
-        }
-    }
-
-    /**
-     * Ensures that custom OLE2 (HPSF) properties are extracted
-     */
-    @Test
-    public void testCustomProperties() throws Exception {
-        ParseContext context = new ParseContext();
-        context.set(Locale.class, Locale.US);
-
-        XMLResult r = getXML("testEXCEL_custom_props.xls", new OfficeParser(), new Metadata(), context);
-        Metadata metadata = r.metadata;
-        assertEquals("application/vnd.ms-excel", metadata.get(Metadata.CONTENT_TYPE));
-        assertEquals("", metadata.get(TikaCoreProperties.CREATOR));
-        assertEquals("", metadata.get(TikaCoreProperties.MODIFIER));
-        assertEquals("2011-08-22T13:45:54Z", metadata.get(TikaCoreProperties.MODIFIED));
-        assertEquals("2006-09-12T15:06:44Z", metadata.get(TikaCoreProperties.CREATED));
-        assertEquals("Microsoft Excel", metadata.get(OfficeOpenXMLExtended.APPLICATION));
-        assertEquals("true", metadata.get("custom:myCustomBoolean"));
-        assertEquals("3", metadata.get("custom:myCustomNumber"));
-        assertEquals("MyStringValue", metadata.get("custom:MyCustomString"));
-        assertEquals("2010-12-30T22:00:00Z", metadata.get("custom:MyCustomDate"));
-        assertEquals("2010-12-29T22:00:00Z", metadata.get("custom:myCustomSecondDate"));
-    }
-
-	@Test
-    public void testHeaderAndFooterExtraction() throws Exception {
-        ParseContext context = new ParseContext();
-        context.set(Locale.class, Locale.UK);
-
-        XMLResult r = getXML("testEXCEL_headers_footers.xls", new OfficeParser(),
-                new Metadata(), context);
-
-        Metadata metadata = r.metadata;
-        assertEquals(
-                "application/vnd.ms-excel",
-                metadata.get(Metadata.CONTENT_TYPE));
-        assertEquals("Internal spreadsheet", metadata.get(TikaCoreProperties.TITLE));
-        assertEquals("Aeham Abushwashi", metadata.get(TikaCoreProperties.CREATOR));
-        assertEquals("Aeham Abushwashi", metadata.get(Metadata.AUTHOR));
-
-        String content = r.xml;
-        assertContains("John Smith1", content);
-        assertContains("John Smith50", content);
-        assertContains("1 Corporate HQ", content);
-        assertContains("Header - Corporate Spreadsheet", content);
-        assertContains("Header - For Internal Use Only", content);
-        assertContains("Header - Author: John Smith", content);
-        assertContains("Footer - Corporate Spreadsheet", content);
-        assertContains("Footer - For Internal Use Only", content);
-        assertContains("Footer - Author: John Smith", content);
-
-    }
-
-    @Test
-    public void testHyperlinksInXLS() throws Exception {
-        String xml = getXML("testEXCEL_hyperlinks.xls").xml;
-        //external url
-        assertContains("<a href=\"http://tika.apache.org/\">", xml);
-        //mail url
-        assertContains("<a href=\"mailto:user@tika.apache.org?subject=help\">", xml);
-        //external linked file
-        assertContains("<a href=\"linked_file.txt.htm\">", xml);
-
-        //TODO: not extracting these yet
-        //link on textbox
-//        assertContains("<a href=\"http://tika.apache.org/1.12/gettingstarted.html\">", xml);
-    }
-
-    @Test
-    public void testEmbeddedPDF() throws Exception {
-        List<Metadata> metadataList = getRecursiveJson("testEXCEL_embeddedPDF.xls");
-        assertEquals("application/pdf", metadataList.get(2).get(Metadata.CONTENT_TYPE));
-    }
-}
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ * <p/>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p/>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;
+
+import java.io.InputStream;
+import java.util.List;
+import java.util.Locale;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.detect.DefaultDetector;
+import org.apache.tika.detect.Detector;
+import org.apache.tika.exception.EncryptedDocumentException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Office;
+import org.apache.tika.metadata.OfficeOpenXMLExtended;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.PasswordProvider;
+import org.apache.tika.parser.microsoft.ooxml.OOXMLParser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+public class ExcelParserTest extends TikaTest {
+    @Test
+    @SuppressWarnings("deprecation") // Checks legacy Tika-1.0 style metadata keys
+    public void testExcelParser() throws Exception {
+
+        ParseContext context = new ParseContext();
+        context.set(Locale.class, Locale.US);
+        XMLResult r = getXML("testEXCEL.xls", new OfficeParser(), new Metadata(), context);
+
+        assertEquals(
+                "application/vnd.ms-excel",
+                r.metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("Simple Excel document", r.metadata.get(TikaCoreProperties.TITLE));
+        assertEquals("Keith Bennett", r.metadata.get(TikaCoreProperties.CREATOR));
+        assertEquals("Keith Bennett", r.metadata.get(Metadata.AUTHOR));
+
+        // Mon Oct 01 17:13:56 BST 2007
+        assertEquals("2007-10-01T16:13:56Z", r.metadata.get(TikaCoreProperties.CREATED));
+        assertEquals("2007-10-01T16:13:56Z", r.metadata.get(Metadata.CREATION_DATE));
+
+        // Mon Oct 01 17:31:43 BST 2007
+        assertEquals("2007-10-01T16:31:43Z", r.metadata.get(TikaCoreProperties.MODIFIED));
+        assertEquals("2007-10-01T16:31:43Z", r.metadata.get(Metadata.DATE));
+
+        String content = r.xml;
+        assertContains("Sample Excel Worksheet", content);
+        assertContains("Numbers and their Squares", content);
+        assertContains("<tr>\t<td />\t<td>Number</td>\t<td>Square", content);
+        assertContains("9", content);
+        assertNotContained("9.0", content);
+        assertContains("196", content);
+        assertNotContained("196.0", content);
+
+    }
+
+    @Test
+    public void testExcelParserFormatting() throws Exception {
+        ParseContext context = new ParseContext();
+        context.set(Locale.class, Locale.US);
+        XMLResult r = getXML("testEXCEL-formats.xls", new OfficeParser(), new Metadata(), context);
+
+        assertEquals(
+                "application/vnd.ms-excel",
+                r.metadata.get(Metadata.CONTENT_TYPE));
+
+        String content = r.xml;
+
+        // Number #,##0.00
+        assertContains("1,599.99", content);
+        assertContains("-1,599.99", content);
+
+        // Currency $#,##0.00;[Red]($#,##0.00)
+        assertContains("$1,599.99", content);
+        assertContains("($1,599.99)", content);
+
+        // Scientific 0.00E+00
+        // POI <= 3.8beta1 returns 1.98E08; newer versions return 1.98E+08
+        assertTrue(content.contains("1.98E08") || content.contains("1.98E+08"));
+        assertTrue(content.contains("-1.98E08") || content.contains("-1.98E+08"));
+
+        // Percentage.
+        assertContains("2.50%", content);
+        // Excel rounds up to 3%, but that requires Java 1.6 or later
+        if (System.getProperty("java.version").startsWith("1.5")) {
+            assertContains("2%", content);
+        } else {
+            assertContains("3%", content);
+        }
+
+        // Time Format: h:mm
+        assertContains("6:15", content);
+        assertContains("18:15", content);
+
+        // Date Format: d-mmm-yy
+        assertContains("17-May-07", content);
+
+        // Date Format: m/d/yy
+        assertContains("10/3/09", content);
+
+        // Date/Time Format: m/d/yy h:mm
+        assertContains("1/19/08 4:35", content);
+
+        // Fraction (2.5): # ?/?
+        assertContains("2 1/2", content);
+
+
+        // The assertions below cover outstanding formatting issues that still need
+        // to be addressed. They are kept here, commented out, so that the issues
+        // can be progressed with the Apache POI team - see TIKA-103.
+
+        /*************************************************************************
+         // Custom Number (0 "dollars and" .00 "cents")
+         assertContains("19 dollars and .99 cents", content);
+
+         // Custom Number ("At" h:mm AM/PM "on" dddd mmmm d"," yyyy)
+         assertContains("At 4:20 AM on Thursday May 17, 2007", content);
+         **************************************************************************/
+
+
+    }
+
+    @Test
+    public void testExcelParserPassword() throws Exception {
+        try {
+            XMLResult r = getXML("testEXCEL_protected_passtika.xls");
+            fail("Document is encrypted, shouldn't parse");
+        } catch (EncryptedDocumentException e) {
+            // Good
+        }
+
+        // Try again, this time with the password
+        ParseContext context = new ParseContext();
+        context.set(Locale.class, Locale.US);
+        context.set(PasswordProvider.class, new PasswordProvider() {
+            @Override
+            public String getPassword(Metadata metadata) {
+                return "tika";
+            }
+        });
+        XMLResult r = getXML("testEXCEL_protected_passtika.xls", new OfficeParser(), new Metadata(), context);
+
+        assertEquals(
+                "application/vnd.ms-excel",
+                r.metadata.get(Metadata.CONTENT_TYPE));
+
+        assertEquals(null, r.metadata.get(TikaCoreProperties.TITLE));
+        assertEquals("Antoni", r.metadata.get(TikaCoreProperties.CREATOR));
+        assertEquals("2011-11-25T09:52:48Z", r.metadata.get(TikaCoreProperties.CREATED));
+
+        String content = r.xml;
+        assertContains("This is an Encrypted Excel spreadsheet", content);
+        assertNotContained("9.0", content);
+
+    }
+
+    /**
+     * TIKA-214 - Ensure we extract labels etc from Charts
+     */
+    @Test
+    public void testExcelParserCharts() throws Exception {
+
+        XMLResult r = getXML("testEXCEL-charts.xls", new OfficeParser());
+        assertEquals(
+                "application/vnd.ms-excel",
+                r.metadata.get(Metadata.CONTENT_TYPE));
+
+        String content = r.xml;
+
+        // The first sheet has a pie chart
+        assertContains("charttabyodawg", content);
+        assertContains("WhamPuff", content);
+
+        // The second sheet has a bar chart and some text
+        assertContains("Sheet1", content);
+        assertContains("Test Excel Spreasheet", content);
+        assertContains("foo", content);
+        assertContains("bar", content);
+        assertContains("fizzlepuff", content);
+        assertContains("whyaxis", content);
+        assertContains("eksaxis", content);
+
+        // The third sheet has some text
+        assertContains("Sheet2", content);
+        assertContains("dingdong", content);
+
+    }
+
+    @Test
+    public void testJXL() throws Exception {
+
+        XMLResult r = getXML("jxl.xls", new OfficeParser());
+        assertEquals(
+                "application/vnd.ms-excel",
+                r.metadata.get(Metadata.CONTENT_TYPE));
+        assertContains("Number Formats", r.xml);
+
+    }
+
+    @Test
+    public void testWorksSpreadsheet70() throws Exception {
+        assertContains("Microsoft Works",
+                getXML("testWORKSSpreadsheet7.0.xlr", new OfficeParser()).xml);
+    }
+
+    /**
+     * We don't currently support the .xlsb file format
+     * (an OOXML container with binary blobs), but we
+     * shouldn't break on these files either (TIKA-826)
+     */
+    @Test
+    public void testExcelXLSB() throws Exception {
+        Detector detector = new DefaultDetector();
+        AutoDetectParser parser = new AutoDetectParser();
+
+        Metadata m = new Metadata();
+        m.add(Metadata.RESOURCE_NAME_KEY, "excel.xlsb");
+
+        // Should be detected correctly
+        MediaType type;
+        try (InputStream input = getTestDocumentAsStream("testEXCEL.xlsb")) {
+            type = detector.detect(input, m);
+            assertEquals("application/vnd.ms-excel.sheet.binary.macroenabled.12", type.toString());
+        }
+
+        // OfficeParser won't handle it
+        assertEquals(false, (new OfficeParser()).getSupportedTypes(new ParseContext()).contains(type));
+
+        // OOXMLParser won't handle it
+        assertEquals(false, (new OOXMLParser()).getSupportedTypes(new ParseContext()).contains(type));
+
+        // AutoDetectParser doesn't break on it
+        assertContains("<body />", getXML("testEXCEL.xlsb").xml);
+
+    }
+
+    /**
+     * Excel 5 and 95 are older formats, and only get basic support
+     */
+    @Test
+    public void testExcel95() throws Exception {
+        Detector detector = new DefaultDetector();
+        AutoDetectParser parser = new AutoDetectParser();
+        MediaType type;
+        Metadata m;
+
+        // First try detection of Excel 5
+        m = new Metadata();
+        m.add(Metadata.RESOURCE_NAME_KEY, "excel_5.xls");
+        try (InputStream input = getTestDocumentAsStream("testEXCEL_5.xls")) {
+            type = detector.detect(input, m);
+            assertEquals("application/vnd.ms-excel", type.toString());
+        }
+
+        // Now Excel 95
+        m = new Metadata();
+        m.add(Metadata.RESOURCE_NAME_KEY, "excel_95.xls");
+        try (InputStream input = getTestDocumentAsStream("testEXCEL_95.xls")) {
+            type = detector.detect(input, m);
+            assertEquals("application/vnd.ms-excel", type.toString());
+        }
+
+        // OfficeParser can handle it
+        assertEquals(true, (new OfficeParser()).getSupportedTypes(new ParseContext()).contains(type));
+
+        // OOXMLParser won't handle it
+        assertEquals(false, (new OOXMLParser()).getSupportedTypes(new ParseContext()).contains(type));
+
+
+        // Parse the Excel 5 file
+        m = new Metadata();
+        try (InputStream input = getTestDocumentAsStream("testEXCEL_5.xls")) {
+            ContentHandler handler = new BodyContentHandler(-1);
+            ParseContext context = new ParseContext();
+            context.set(Locale.class, Locale.US);
+            parser.parse(input, handler, m, context);
+
+            String content = handler.toString();
+
+            // Sheet names
+            assertContains("Feuil1", content);
+            assertContains("Feuil3", content);
+
+            // Text
+            assertContains("Sample Excel", content);
+            assertContains("Number", content);
+
+            // Numbers
+            assertContains("15", content);
+            assertContains("225", content);
+
+            // Metadata was also fetched
+            assertEquals("Simple Excel document", m.get(TikaCoreProperties.TITLE));
+            assertEquals("Keith Bennett", m.get(TikaCoreProperties.CREATOR));
+        }
+
+        // Parse the Excel 95 file
+        m = new Metadata();
+        try (InputStream input = getTestDocumentAsStream("testEXCEL_95.xls")) {
+            ContentHandler handler = new BodyContentHandler(-1);
+            ParseContext context = new ParseContext();
+            context.set(Locale.class, Locale.US);
+            parser.parse(input, handler, m, context);
+
+            String content = handler.toString();
+
+            // Sheet name
+            assertContains("Foglio1", content);
+
+            // Very boring file, no actual text or numbers!
+
+            // Metadata was also fetched
+            assertEquals(null, m.get(TikaCoreProperties.TITLE));
+            assertEquals("Marco Quaranta", m.get(Office.LAST_AUTHOR));
+        }
+    }
+
+    /**
+     * Ensures that custom OLE2 (HPSF) properties are extracted
+     */
+    @Test
+    public void testCustomProperties() throws Exception {
+        ParseContext context = new ParseContext();
+        context.set(Locale.class, Locale.US);
+
+        XMLResult r = getXML("testEXCEL_custom_props.xls", new OfficeParser(), new Metadata(), context);
+        Metadata metadata = r.metadata;
+        assertEquals("application/vnd.ms-excel", metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("", metadata.get(TikaCoreProperties.CREATOR));
+        assertEquals("", metadata.get(TikaCoreProperties.MODIFIER));
+        assertEquals("2011-08-22T13:45:54Z", metadata.get(TikaCoreProperties.MODIFIED));
+        assertEquals("2006-09-12T15:06:44Z", metadata.get(TikaCoreProperties.CREATED));
+        assertEquals("Microsoft Excel", metadata.get(OfficeOpenXMLExtended.APPLICATION));
+        assertEquals("true", metadata.get("custom:myCustomBoolean"));
+        assertEquals("3", metadata.get("custom:myCustomNumber"));
+        assertEquals("MyStringValue", metadata.get("custom:MyCustomString"));
+        assertEquals("2010-12-30T22:00:00Z", metadata.get("custom:MyCustomDate"));
+        assertEquals("2010-12-29T22:00:00Z", metadata.get("custom:myCustomSecondDate"));
+    }
+
+    @Test
+    public void testHeaderAndFooterExtraction() throws Exception {
+        ParseContext context = new ParseContext();
+        context.set(Locale.class, Locale.UK);
+
+        XMLResult r = getXML("testEXCEL_headers_footers.xls", new OfficeParser(),
+                new Metadata(), context);
+
+        Metadata metadata = r.metadata;
+        assertEquals(
+                "application/vnd.ms-excel",
+                metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("Internal spreadsheet", metadata.get(TikaCoreProperties.TITLE));
+        assertEquals("Aeham Abushwashi", metadata.get(TikaCoreProperties.CREATOR));
+        assertEquals("Aeham Abushwashi", metadata.get(Metadata.AUTHOR));
+
+        String content = r.xml;
+        assertContains("John Smith1", content);
+        assertContains("John Smith50", content);
+        assertContains("1 Corporate HQ", content);
+        assertContains("Header - Corporate Spreadsheet", content);
+        assertContains("Header - For Internal Use Only", content);
+        assertContains("Header - Author: John Smith", content);
+        assertContains("Footer - Corporate Spreadsheet", content);
+        assertContains("Footer - For Internal Use Only", content);
+        assertContains("Footer - Author: John Smith", content);
+
+    }
+
+    @Test
+    public void testHyperlinksInXLS() throws Exception {
+        String xml = getXML("testEXCEL_hyperlinks.xls").xml;
+        //external url
+        assertContains("<a href=\"http://tika.apache.org/\">", xml);
+        //mail url
+        assertContains("<a href=\"mailto:user@tika.apache.org?subject=help\">", xml);
+        //external linked file
+        assertContains("<a href=\"linked_file.txt.htm\">", xml);
+
+        //TODO: not extracting these yet
+        //link on textbox
+//        assertContains("<a href=\"http://tika.apache.org/1.12/gettingstarted.html\">", xml);
+    }
+
+    @Test
+    public void testEmbeddedPDF() throws Exception {
+        List<Metadata> metadataList = getRecursiveJson("testEXCEL_embeddedPDF.xls");
+        assertEquals("application/pdf", metadataList.get(2).get(Metadata.CONTENT_TYPE));
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/OfficeParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/OfficeParserTest.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/OfficeParserTest.java
index 07644dd..beffee6 100644
--- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/OfficeParserTest.java
+++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/OfficeParserTest.java
@@ -1,46 +1,46 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.microsoft;
-
-import static org.junit.Assert.assertTrue;
-
-import java.io.InputStream;
-
-import org.apache.tika.TikaTest;
-import org.apache.tika.io.TikaInputStream;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.parser.Parser;
-import org.apache.tika.parser.microsoft.ooxml.OOXMLParserTest;
-import org.junit.Test;
-
-
-public class OfficeParserTest extends TikaTest {
-
-    @Test
-    public void parseOfficeWord() throws Exception {
-        Metadata metadata = new Metadata();
-        Parser parser = new OfficeParser();
-
-        String xml = getXML(getTestDocument("test.doc"), parser, metadata).xml;
-
-        assertTrue(xml.contains("test"));
-    }
-
-    private InputStream getTestDocument(String name) {
-        return TikaInputStream.get(OOXMLParserTest.class.getResourceAsStream("/test-documents/" + name));
-    }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import static org.junit.Assert.assertTrue;
+
+import java.io.InputStream;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.microsoft.ooxml.OOXMLParserTest;
+import org.junit.Test;
+
+
+public class OfficeParserTest extends TikaTest {
+
+    @Test
+    public void parseOfficeWord() throws Exception {
+        Metadata metadata = new Metadata();
+        Parser parser = new OfficeParser();
+
+        String xml = getXML(getTestDocument("test.doc"), parser, metadata).xml;
+
+        assertTrue(xml.contains("test"));
+    }
+
+    private InputStream getTestDocument(String name) {
+        return TikaInputStream.get(OOXMLParserTest.class.getResourceAsStream("/test-documents/" + name));
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
index fbf8114..8662e65 100644
--- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
+++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
@@ -1,239 +1,239 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.microsoft;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertFalse;
-import static org.junit.Assert.assertTrue;
-
-import javax.xml.transform.OutputKeys;
-import javax.xml.transform.sax.SAXTransformerFactory;
-import javax.xml.transform.sax.TransformerHandler;
-import javax.xml.transform.stream.StreamResult;
-import java.io.InputStream;
-import java.io.StringWriter;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
-import org.apache.tika.TikaTest;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.parser.AutoDetectParser;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.Parser;
-import org.apache.tika.sax.BodyContentHandler;
-import org.junit.Test;
-import org.xml.sax.ContentHandler;
-
-/**
- * Test case for parsing Outlook files.
- */
-public class OutlookParserTest extends TikaTest {
-
-    @Test
-    public void testOutlookParsing() throws Exception {
-        Parser parser = new AutoDetectParser(); // Should auto-detect!
-        ContentHandler handler = new BodyContentHandler();
-        Metadata metadata = new Metadata();
-
-        try (InputStream stream = OutlookParserTest.class.getResourceAsStream(
-                "/test-documents/test-outlook.msg")) {
-            parser.parse(stream, handler, metadata, new ParseContext());
-        }
-
-        assertEquals(
-                "application/vnd.ms-outlook",
-                metadata.get(Metadata.CONTENT_TYPE));
-        assertEquals(
-                "Microsoft Outlook Express 6",
-                metadata.get(TikaCoreProperties.TITLE));
-        assertEquals(
-                "Nouvel utilisateur de Outlook Express",
-                metadata.get(Metadata.MESSAGE_RECIPIENT_ADDRESS));
-        assertEquals(
-                "L'\u00C9quipe Microsoft Outlook Express",
-                metadata.get(TikaCoreProperties.CREATOR));
-        assertEquals(
-                "L'\u00C9quipe Microsoft Outlook Express",
-                metadata.get(Metadata.AUTHOR));
-
-        // Stored as Thu, 5 Apr 2007 09:26:06 -0700
-        assertEquals(
-                "2007-04-05T16:26:06Z",
-                metadata.get(TikaCoreProperties.CREATED));
-
-        String content = handler.toString();
-        assertContains("Microsoft Outlook Express 6", content);
-        assertContains("L'\u00C9quipe Microsoft Outlook Express", content);
-        assertContains("Nouvel utilisateur de Outlook Express", content);
-        assertContains("Messagerie et groupes de discussion", content);
-    }
-
-    /**
-     * Test case for TIKA-197
-     *
-     * @see <a href="https://issues.apache.org/jira/browse/TIKA-197">TIKA-197</a>
-     */
-    @Test
-    public void testMultipleCopies() throws Exception {
-        Parser parser = new AutoDetectParser();
-        ContentHandler handler = new BodyContentHandler();
-        Metadata metadata = new Metadata();
-
-        try (InputStream stream = OutlookParserTest.class.getResourceAsStream(
-                "/test-documents/testMSG.msg")) {
-            parser.parse(stream, handler, metadata, new ParseContext());
-        }
-
-        assertEquals(
-                "application/vnd.ms-outlook",
-                metadata.get(Metadata.CONTENT_TYPE));
-
-        String content = handler.toString();
-        Pattern pattern = Pattern.compile("From");
-        Matcher matcher = pattern.matcher(content);
-        assertTrue(matcher.find());
-        assertFalse(matcher.find());
-    }
-
-    /**
-     * Test case for TIKA-395, to ensure parser works for new Outlook formats.
-     *
-     * @see <a href="https://issues.apache.org/jira/browse/TIKA-395">TIKA-395</a>
-     */
-    @Test
-    public void testOutlookNew() throws Exception {
-        Parser parser = new AutoDetectParser();
-        ContentHandler handler = new BodyContentHandler();
-        Metadata metadata = new Metadata();
-
-        try (InputStream stream = OutlookParserTest.class.getResourceAsStream(
-                "/test-documents/test-outlook2003.msg")) {
-            parser.parse(stream, handler, metadata, new ParseContext());
-        }
-
-        assertEquals(
-                "application/vnd.ms-outlook",
-                metadata.get(Metadata.CONTENT_TYPE));
-        assertEquals(
-                "Welcome to Microsoft Office Outlook 2003",
-                metadata.get(TikaCoreProperties.TITLE));
-
-        String content = handler.toString();
-        assertContains("Outlook 2003", content);
-        assertContains("Streamlined Mail Experience", content);
-        assertContains("Navigation Pane", content);
-    }
-
-    @Test
-    public void testOutlookHTMLVersion() throws Exception {
-        Parser parser = new AutoDetectParser();
-        Metadata metadata = new Metadata();
-
-        // Check the HTML version
-        StringWriter sw = new StringWriter();
-        SAXTransformerFactory factory = (SAXTransformerFactory)
-                SAXTransformerFactory.newInstance();
-        TransformerHandler handler = factory.newTransformerHandler();
-        handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
-        handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
-        handler.setResult(new StreamResult(sw));
-
-        try (InputStream stream = OutlookParserTest.class.getResourceAsStream(
-                "/test-documents/testMSG_chinese.msg")) {
-            parser.parse(stream, handler, metadata, new ParseContext());
-        }
-
-        // As the HTML version should have been processed, ensure
-        //  we got some of the links
-        String content = sw.toString();
-        assertContains("<dd>tests.chang@fengttt.com</dd>", content);
-        assertContains("<p>Alfresco MSG format testing", content);
-        assertContains("<li>1", content);
-        assertContains("<li>2", content);
-
-        // Make sure we don't have nested html docs
-        assertEquals(2, content.split("<body>").length);
-        assertEquals(2, content.split("<\\/body>").length);
-
-        // Make sure that the Chinese actually came through
-        assertContains("\u5F35\u6BD3\u502B", metadata.get(TikaCoreProperties.CREATOR));
-        assertContains("\u9673\u60E0\u73CD", content);
-    }
-
-    @Test
-    public void testOutlookForwarded() throws Exception {
-        Parser parser = new AutoDetectParser();
-        Metadata metadata = new Metadata();
-
-        // Check the HTML version
-        StringWriter sw = new StringWriter();
-        SAXTransformerFactory factory = (SAXTransformerFactory)
-                SAXTransformerFactory.newInstance();
-        TransformerHandler handler = factory.newTransformerHandler();
-        handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
-        handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
-        handler.setResult(new StreamResult(sw));
-
-        try (InputStream stream = OutlookParserTest.class.getResourceAsStream(
-                "/test-documents/testMSG_forwarded.msg")) {
-            parser.parse(stream, handler, metadata, new ParseContext());
-        }
-
-        // Make sure we don't have nested docs
-        String content = sw.toString();
-        assertEquals(2, content.split("<body>").length);
-        assertEquals(2, content.split("<\\/body>").length);
-    }
-
-    @Test
-    public void testOutlookHTMLfromRTF() throws Exception {
-        Parser parser = new AutoDetectParser();
-        Metadata metadata = new Metadata();
-
-        // Check the HTML version
-        StringWriter sw = new StringWriter();
-        SAXTransformerFactory factory = (SAXTransformerFactory)
-                SAXTransformerFactory.newInstance();
-        TransformerHandler handler = factory.newTransformerHandler();
-        handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
-        handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
-        handler.setResult(new StreamResult(sw));
-
-        try (InputStream stream = OutlookParserTest.class.getResourceAsStream(
-                "/test-documents/test-outlook2003.msg")) {
-            parser.parse(stream, handler, metadata, new ParseContext());
-        }
-
-        // As the HTML version should have been processed, ensure
-        //  we got some of the links
-        String content = sw.toString().replaceAll("<p>\\s+", "<p>");
-        assertContains("<dd>New Outlook User</dd>", content);
-        assertContains("designed <i>to help you", content);
-        assertContains("<p><a href=\"http://r.office.microsoft.com/r/rlidOutlookWelcomeMail10?clid=1033\">Cached Exchange Mode</a>", content);
-
-        // Link - check text around it, and the link itself
-        assertContains("sign up for a free subscription", content);
-        assertContains("Office Newsletter", content);
-        assertContains("newsletter will be sent to you", content);
-        assertContains("http://r.office.microsoft.com/r/rlidNewsletterSignUp?clid=1033", content);
-
-        // Make sure we don't have nested html docs
-        assertEquals(2, content.split("<body>").length);
-        assertEquals(2, content.split("<\\/body>").length);
-    }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+
+import javax.xml.transform.OutputKeys;
+import javax.xml.transform.sax.SAXTransformerFactory;
+import javax.xml.transform.sax.TransformerHandler;
+import javax.xml.transform.stream.StreamResult;
+import java.io.InputStream;
+import java.io.StringWriter;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+/**
+ * Test case for parsing Outlook files.
+ */
+public class OutlookParserTest extends TikaTest {
+
+    @Test
+    public void testOutlookParsing() throws Exception {
+        Parser parser = new AutoDetectParser(); // Should auto-detect!
+        ContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+
+        try (InputStream stream = OutlookParserTest.class.getResourceAsStream(
+                "/test-documents/test-outlook.msg")) {
+            parser.parse(stream, handler, metadata, new ParseContext());
+        }
+
+        assertEquals(
+                "application/vnd.ms-outlook",
+                metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals(
+                "Microsoft Outlook Express 6",
+                metadata.get(TikaCoreProperties.TITLE));
+        assertEquals(
+                "Nouvel utilisateur de Outlook Express",
+                metadata.get(Metadata.MESSAGE_RECIPIENT_ADDRESS));
+        assertEquals(
+                "L'\u00C9quipe Microsoft Outlook Express",
+                metadata.get(TikaCoreProperties.CREATOR));
+        assertEquals(
+                "L'\u00C9quipe Microsoft Outlook Express",
+                metadata.get(Metadata.AUTHOR));
+
+        // Stored as Thu, 5 Apr 2007 09:26:06 -0700
+        assertEquals(
+                "2007-04-05T16:26:06Z",
+                metadata.get(TikaCoreProperties.CREATED));
+
+        String content = handler.toString();
+        assertContains("Microsoft Outlook Express 6", content);
+        assertContains("L'\u00C9quipe Microsoft Outlook Express", content);
+        assertContains("Nouvel utilisateur de Outlook Express", content);
+        assertContains("Messagerie et groupes de discussion", content);
+    }
+
+    /**
+     * Test case for TIKA-197
+     *
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-197">TIKA-197</a>
+     */
+    @Test
+    public void testMultipleCopies() throws Exception {
+        Parser parser = new AutoDetectParser();
+        ContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+
+        try (InputStream stream = OutlookParserTest.class.getResourceAsStream(
+                "/test-documents/testMSG.msg")) {
+            parser.parse(stream, handler, metadata, new ParseContext());
+        }
+
+        assertEquals(
+                "application/vnd.ms-outlook",
+                metadata.get(Metadata.CONTENT_TYPE));
+
+        String content = handler.toString();
+        Pattern pattern = Pattern.compile("From");
+        Matcher matcher = pattern.matcher(content);
+        assertTrue(matcher.find());
+        assertFalse(matcher.find());
+    }
+
+    /**
+     * Test case for TIKA-395, to ensure parser works for new Outlook formats.
+     *
+     * @see <a href="https://issues.apache.org/jira/browse/TIKA-395">TIKA-395</a>
+     */
+    @Test
+    public void testOutlookNew() throws Exception {
+        Parser parser = new AutoDetectParser();
+        ContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+
+        try (InputStream stream = OutlookParserTest.class.getResourceAsStream(
+                "/test-documents/test-outlook2003.msg")) {
+            parser.parse(stream, handler, metadata, new ParseContext());
+        }
+
+        assertEquals(
+                "application/vnd.ms-outlook",
+                metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals(
+                "Welcome to Microsoft Office Outlook 2003",
+                metadata.get(TikaCoreProperties.TITLE));
+
+        String content = handler.toString();
+        assertContains("Outlook 2003", content);
+        assertContains("Streamlined Mail Experience", content);
+        assertContains("Navigation Pane", content);
+    }
+
+    @Test
+    public void testOutlookHTMLVersion() throws Exception {
+        Parser parser = new AutoDetectParser();
+        Metadata metadata = new Metadata();
+
+        // Check the HTML version
+        StringWriter sw = new StringWriter();
+        SAXTransformerFactory factory = (SAXTransformerFactory)
+                SAXTransformerFactory.newInstance();
+        TransformerHandler handler = factory.newTransformerHandler();
+        handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
+        handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
+        handler.setResult(new StreamResult(sw));
+
+        try (InputStream stream = OutlookParserTest.class.getResourceAsStream(
+                "/test-documents/testMSG_chinese.msg")) {
+            parser.parse(stream, handler, metadata, new ParseContext());
+        }
+
+        // As the HTML version should have been processed, ensure
+        //  we got some of the links
+        String content = sw.toString();
+        assertContains("<dd>tests.chang@fengttt.com</dd>", content);
+        assertContains("<p>Alfresco MSG format testing", content);
+        assertContains("<li>1", content);
+        assertContains("<li>2", content);
+
+        // Make sure we don't have nested html docs
+        assertEquals(2, content.split("<body>").length);
+        assertEquals(2, content.split("<\\/body>").length);
+
+        // Make sure that the Chinese actually came through
+        assertContains("\u5F35\u6BD3\u502B", metadata.get(TikaCoreProperties.CREATOR));
+        assertContains("\u9673\u60E0\u73CD", content);
+    }
+
+    @Test
+    public void testOutlookForwarded() throws Exception {
+        Parser parser = new AutoDetectParser();
+        Metadata metadata = new Metadata();
+
+        // Check the HTML version
+        StringWriter sw = new StringWriter();
+        SAXTransformerFactory factory = (SAXTransformerFactory)
+                SAXTransformerFactory.newInstance();
+        TransformerHandler handler = factory.newTransformerHandler();
+        handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
+        handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
+        handler.setResult(new StreamResult(sw));
+
+        try (InputStream stream = OutlookParserTest.class.getResourceAsStream(
+                "/test-documents/testMSG_forwarded.msg")) {
+            parser.parse(stream, handler, metadata, new ParseContext());
+        }
+
+        // Make sure we don't have nested docs
+        String content = sw.toString();
+        assertEquals(2, content.split("<body>").length);
+        assertEquals(2, content.split("<\\/body>").length);
+    }
+
+    @Test
+    public void testOutlookHTMLfromRTF() throws Exception {
+        Parser parser = new AutoDetectParser();
+        Metadata metadata = new Metadata();
+
+        // Check the HTML version
+        StringWriter sw = new StringWriter();
+        SAXTransformerFactory factory = (SAXTransformerFactory)
+                SAXTransformerFactory.newInstance();
+        TransformerHandler handler = factory.newTransformerHandler();
+        handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
+        handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
+        handler.setResult(new StreamResult(sw));
+
+        try (InputStream stream = OutlookParserTest.class.getResourceAsStream(
+                "/test-documents/test-outlook2003.msg")) {
+            parser.parse(stream, handler, metadata, new ParseContext());
+        }
+
+        // As the HTML version should have been processed, ensure
+        //  we got some of the links
+        String content = sw.toString().replaceAll("<p>\\s+", "<p>");
+        assertContains("<dd>New Outlook User</dd>", content);
+        assertContains("designed <i>to help you", content);
+        assertContains("<p><a href=\"http://r.office.microsoft.com/r/rlidOutlookWelcomeMail10?clid=1033\">Cached Exchange Mode</a>", content);
+
+        // Link - check text around it, and the link itself
+        assertContains("sign up for a free subscription", content);
+        assertContains("Office Newsletter", content);
+        assertContains("newsletter will be sent to you", content);
+        assertContains("http://r.office.microsoft.com/r/rlidNewsletterSignUp?clid=1033", content);
+
+        // Make sure we don't have nested html docs
+        assertEquals(2, content.split("<body>").length);
+        assertEquals(2, content.split("<\\/body>").length);
+    }
+}