You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/06/29 11:11:23 UTC
[17/39] tika git commit: Convert new lines from windows to unix
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
index 196ffa9..4ea3fa1 100644
--- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
+++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
@@ -1,412 +1,412 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- * <p/>
- * http://www.apache.org/licenses/LICENSE-2.0
- * <p/>
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.microsoft;
-
-import java.io.InputStream;
-import java.util.List;
-import java.util.Locale;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertTrue;
-import static org.junit.Assert.fail;
-
-import org.apache.tika.TikaTest;
-import org.apache.tika.detect.DefaultDetector;
-import org.apache.tika.detect.Detector;
-import org.apache.tika.exception.EncryptedDocumentException;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.Office;
-import org.apache.tika.metadata.OfficeOpenXMLExtended;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.AutoDetectParser;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.PasswordProvider;
-import org.apache.tika.parser.microsoft.ooxml.OOXMLParser;
-import org.apache.tika.sax.BodyContentHandler;
-import org.junit.Test;
-import org.xml.sax.ContentHandler;
-
-public class ExcelParserTest extends TikaTest {
- @Test
- @SuppressWarnings("deprecation") // Checks legacy Tika-1.0 style metadata keys
- public void testExcelParser() throws Exception {
-
- ParseContext context = new ParseContext();
- context.set(Locale.class, Locale.US);
- XMLResult r = getXML("testEXCEL.xls", new OfficeParser(), new Metadata(), context);
-
- assertEquals(
- "application/vnd.ms-excel",
- r.metadata.get(Metadata.CONTENT_TYPE));
- assertEquals("Simple Excel document", r.metadata.get(TikaCoreProperties.TITLE));
- assertEquals("Keith Bennett", r.metadata.get(TikaCoreProperties.CREATOR));
- assertEquals("Keith Bennett", r.metadata.get(Metadata.AUTHOR));
-
- // Mon Oct 01 17:13:56 BST 2007
- assertEquals("2007-10-01T16:13:56Z", r.metadata.get(TikaCoreProperties.CREATED));
- assertEquals("2007-10-01T16:13:56Z", r.metadata.get(Metadata.CREATION_DATE));
-
- // Mon Oct 01 17:31:43 BST 2007
- assertEquals("2007-10-01T16:31:43Z", r.metadata.get(TikaCoreProperties.MODIFIED));
- assertEquals("2007-10-01T16:31:43Z", r.metadata.get(Metadata.DATE));
-
- String content = r.xml;
- assertContains("Sample Excel Worksheet", content);
- assertContains("Numbers and their Squares", content);
- assertContains("<tr>\t<td />\t<td>Number</td>\t<td>Square", content);
- assertContains("9", content);
- assertNotContained("9.0", content);
- assertContains("196", content);
- assertNotContained("196.0", content);
-
- }
-
- @Test
- public void testExcelParserFormatting() throws Exception {
- ParseContext context = new ParseContext();
- context.set(Locale.class, Locale.US);
- XMLResult r = getXML("testEXCEL-formats.xls", new OfficeParser(), new Metadata(), context);
-
- assertEquals(
- "application/vnd.ms-excel",
- r.metadata.get(Metadata.CONTENT_TYPE));
-
- String content = r.xml;
-
- // Number #,##0.00
- assertContains("1,599.99", content);
- assertContains("-1,599.99", content);
-
- // Currency $#,##0.00;[Red]($#,##0.00)
- assertContains("$1,599.99", content);
- assertContains("($1,599.99)", content);
-
- // Scientific 0.00E+00
- // poi <=3.8beta1 returns 1.98E08, newer versions return 1.98+E08
- assertTrue(content.contains("1.98E08") || content.contains("1.98E+08"));
- assertTrue(content.contains("-1.98E08") || content.contains("-1.98E+08"));
-
- // Percentage.
- assertContains("2.50%", content);
- // Excel rounds up to 3%, but that requires Java 1.6 or later
- if (System.getProperty("java.version").startsWith("1.5")) {
- assertContains("2%", content);
- } else {
- assertContains("3%", content);
- }
-
- // Time Format: h:mm
- assertContains("6:15", content);
- assertContains("18:15", content);
-
- // Date Format: d-mmm-yy
- assertContains("17-May-07", content);
-
- // Date Format: m/d/yy
- assertContains("10/3/09", content);
-
- // Date/Time Format: m/d/yy h:mm
- assertContains("1/19/08 4:35", content);
-
- // Fraction (2.5): # ?/?
- assertContains("2 1/2", content);
-
-
- // Below assertions represent outstanding formatting issues to be addressed
- // they are included to allow the issues to be progressed with the Apache POI
- // team - See TIKA-103.
-
- /*************************************************************************
- // Custom Number (0 "dollars and" .00 "cents")
- assertContains("19 dollars and .99 cents", content);
-
- // Custom Number ("At" h:mm AM/PM "on" dddd mmmm d"," yyyy)
- assertContains("At 4:20 AM on Thursday May 17, 2007", content);
- **************************************************************************/
-
-
- }
-
- @Test
- public void testExcelParserPassword() throws Exception {
- try {
- XMLResult r = getXML("testEXCEL_protected_passtika.xls");
- fail("Document is encrypted, shouldn't parse");
- } catch (EncryptedDocumentException e) {
- // Good
- }
-
- // Try again, this time with the password
- ParseContext context = new ParseContext();
- context.set(Locale.class, Locale.US);
- context.set(PasswordProvider.class, new PasswordProvider() {
- @Override
- public String getPassword(Metadata metadata) {
- return "tika";
- }
- });
- XMLResult r = getXML("testEXCEL_protected_passtika.xls", new OfficeParser(), new Metadata(), context);
-
- assertEquals(
- "application/vnd.ms-excel",
- r.metadata.get(Metadata.CONTENT_TYPE));
-
- assertEquals(null, r.metadata.get(TikaCoreProperties.TITLE));
- assertEquals("Antoni", r.metadata.get(TikaCoreProperties.CREATOR));
- assertEquals("2011-11-25T09:52:48Z", r.metadata.get(TikaCoreProperties.CREATED));
-
- String content = r.xml;
- assertContains("This is an Encrypted Excel spreadsheet", content);
- assertNotContained("9.0", content);
-
- }
-
- /**
- * TIKA-214 - Ensure we extract labels etc from Charts
- */
- @Test
- public void testExcelParserCharts() throws Exception {
-
- XMLResult r = getXML("testEXCEL-charts.xls", new OfficeParser());
- assertEquals(
- "application/vnd.ms-excel",
- r.metadata.get(Metadata.CONTENT_TYPE));
-
- String content = r.xml;
-
- // The first sheet has a pie chart
- assertContains("charttabyodawg", content);
- assertContains("WhamPuff", content);
-
- // The second sheet has a bar chart and some text
- assertContains("Sheet1", content);
- assertContains("Test Excel Spreasheet", content);
- assertContains("foo", content);
- assertContains("bar", content);
- assertContains("fizzlepuff", content);
- assertContains("whyaxis", content);
- assertContains("eksaxis", content);
-
- // The third sheet has some text
- assertContains("Sheet2", content);
- assertContains("dingdong", content);
-
- }
-
- @Test
- public void testJXL() throws Exception {
-
- XMLResult r = getXML("jxl.xls", new OfficeParser());
- assertEquals(
- "application/vnd.ms-excel",
- r.metadata.get(Metadata.CONTENT_TYPE));
- assertContains("Number Formats", r.xml);
-
- }
-
- @Test
- public void testWorksSpreadsheet70() throws Exception {
- assertContains("Microsoft Works",
- getXML("testWORKSSpreadsheet7.0.xlr", new OfficeParser()).xml);
- }
-
- /**
- * We don't currently support the .xlsb file format
- * (an OOXML container with binary blobs), but we
- * shouldn't break on these files either (TIKA-826)
- */
- @Test
- public void testExcelXLSB() throws Exception {
- Detector detector = new DefaultDetector();
- AutoDetectParser parser = new AutoDetectParser();
-
- Metadata m = new Metadata();
- m.add(Metadata.RESOURCE_NAME_KEY, "excel.xlsb");
-
- // Should be detected correctly
- MediaType type;
- try (InputStream input = getTestDocumentAsStream("testEXCEL.xlsb")) {
- type = detector.detect(input, m);
- assertEquals("application/vnd.ms-excel.sheet.binary.macroenabled.12", type.toString());
- }
-
- // OfficeParser won't handle it
- assertEquals(false, (new OfficeParser()).getSupportedTypes(new ParseContext()).contains(type));
-
- // OOXMLParser won't handle it
- assertEquals(false, (new OOXMLParser()).getSupportedTypes(new ParseContext()).contains(type));
-
- // AutoDetectParser doesn't break on it
- assertContains("<body />", getXML("testEXCEL.xlsb").xml);
-
- }
-
- /**
- * Excel 5 and 95 are older formats, and only get basic support
- */
- @Test
- public void testExcel95() throws Exception {
- Detector detector = new DefaultDetector();
- AutoDetectParser parser = new AutoDetectParser();
- MediaType type;
- Metadata m;
-
- // First try detection of Excel 5
- m = new Metadata();
- m.add(Metadata.RESOURCE_NAME_KEY, "excel_5.xls");
- try (InputStream input = getTestDocumentAsStream("testEXCEL_5.xls")) {
- type = detector.detect(input, m);
- assertEquals("application/vnd.ms-excel", type.toString());
- }
-
- // Now Excel 95
- m = new Metadata();
- m.add(Metadata.RESOURCE_NAME_KEY, "excel_95.xls");
- try (InputStream input = getTestDocumentAsStream("testEXCEL_95.xls")) {
- type = detector.detect(input, m);
- assertEquals("application/vnd.ms-excel", type.toString());
- }
-
- // OfficeParser can handle it
- assertEquals(true, (new OfficeParser()).getSupportedTypes(new ParseContext()).contains(type));
-
- // OOXMLParser won't handle it
- assertEquals(false, (new OOXMLParser()).getSupportedTypes(new ParseContext()).contains(type));
-
-
- // Parse the Excel 5 file
- m = new Metadata();
- try (InputStream input = getTestDocumentAsStream("testEXCEL_5.xls")) {
- ContentHandler handler = new BodyContentHandler(-1);
- ParseContext context = new ParseContext();
- context.set(Locale.class, Locale.US);
- parser.parse(input, handler, m, context);
-
- String content = handler.toString();
-
- // Sheet names
- assertContains("Feuil1", content);
- assertContains("Feuil3", content);
-
- // Text
- assertContains("Sample Excel", content);
- assertContains("Number", content);
-
- // Numbers
- assertContains("15", content);
- assertContains("225", content);
-
- // Metadata was also fetched
- assertEquals("Simple Excel document", m.get(TikaCoreProperties.TITLE));
- assertEquals("Keith Bennett", m.get(TikaCoreProperties.CREATOR));
- }
-
- // Parse the Excel 95 file
- m = new Metadata();
- try (InputStream input = getTestDocumentAsStream("testEXCEL_95.xls")) {
- ContentHandler handler = new BodyContentHandler(-1);
- ParseContext context = new ParseContext();
- context.set(Locale.class, Locale.US);
- parser.parse(input, handler, m, context);
-
- String content = handler.toString();
-
- // Sheet name
- assertContains("Foglio1", content);
-
- // Very boring file, no actual text or numbers!
-
- // Metadata was also fetched
- assertEquals(null, m.get(TikaCoreProperties.TITLE));
- assertEquals("Marco Quaranta", m.get(Office.LAST_AUTHOR));
- }
- }
-
- /**
- * Ensures that custom OLE2 (HPSF) properties are extracted
- */
- @Test
- public void testCustomProperties() throws Exception {
- ParseContext context = new ParseContext();
- context.set(Locale.class, Locale.US);
-
- XMLResult r = getXML("testEXCEL_custom_props.xls", new OfficeParser(), new Metadata(), context);
- Metadata metadata = r.metadata;
- assertEquals("application/vnd.ms-excel", metadata.get(Metadata.CONTENT_TYPE));
- assertEquals("", metadata.get(TikaCoreProperties.CREATOR));
- assertEquals("", metadata.get(TikaCoreProperties.MODIFIER));
- assertEquals("2011-08-22T13:45:54Z", metadata.get(TikaCoreProperties.MODIFIED));
- assertEquals("2006-09-12T15:06:44Z", metadata.get(TikaCoreProperties.CREATED));
- assertEquals("Microsoft Excel", metadata.get(OfficeOpenXMLExtended.APPLICATION));
- assertEquals("true", metadata.get("custom:myCustomBoolean"));
- assertEquals("3", metadata.get("custom:myCustomNumber"));
- assertEquals("MyStringValue", metadata.get("custom:MyCustomString"));
- assertEquals("2010-12-30T22:00:00Z", metadata.get("custom:MyCustomDate"));
- assertEquals("2010-12-29T22:00:00Z", metadata.get("custom:myCustomSecondDate"));
- }
-
- @Test
- public void testHeaderAndFooterExtraction() throws Exception {
- ParseContext context = new ParseContext();
- context.set(Locale.class, Locale.UK);
-
- XMLResult r = getXML("testEXCEL_headers_footers.xls", new OfficeParser(),
- new Metadata(), context);
-
- Metadata metadata = r.metadata;
- assertEquals(
- "application/vnd.ms-excel",
- metadata.get(Metadata.CONTENT_TYPE));
- assertEquals("Internal spreadsheet", metadata.get(TikaCoreProperties.TITLE));
- assertEquals("Aeham Abushwashi", metadata.get(TikaCoreProperties.CREATOR));
- assertEquals("Aeham Abushwashi", metadata.get(Metadata.AUTHOR));
-
- String content = r.xml;
- assertContains("John Smith1", content);
- assertContains("John Smith50", content);
- assertContains("1 Corporate HQ", content);
- assertContains("Header - Corporate Spreadsheet", content);
- assertContains("Header - For Internal Use Only", content);
- assertContains("Header - Author: John Smith", content);
- assertContains("Footer - Corporate Spreadsheet", content);
- assertContains("Footer - For Internal Use Only", content);
- assertContains("Footer - Author: John Smith", content);
-
- }
-
- @Test
- public void testHyperlinksInXLS() throws Exception {
- String xml = getXML("testEXCEL_hyperlinks.xls").xml;
- //external url
- assertContains("<a href=\"http://tika.apache.org/\">", xml);
- //mail url
- assertContains("<a href=\"mailto:user@tika.apache.org?subject=help\">", xml);
- //external linked file
- assertContains("<a href=\"linked_file.txt.htm\">", xml);
-
- //TODO: not extracting these yet
- //link on textbox
-// assertContains("<a href=\"http://tika.apache.org/1.12/gettingstarted.html\">", xml);
- }
-
- @Test
- public void testEmbeddedPDF() throws Exception {
- List<Metadata> metadataList = getRecursiveJson("testEXCEL_embeddedPDF.xls");
- assertEquals("application/pdf", metadataList.get(2).get(Metadata.CONTENT_TYPE));
- }
-}
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ * <p/>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p/>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;
+
+import java.io.InputStream;
+import java.util.List;
+import java.util.Locale;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.detect.DefaultDetector;
+import org.apache.tika.detect.Detector;
+import org.apache.tika.exception.EncryptedDocumentException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Office;
+import org.apache.tika.metadata.OfficeOpenXMLExtended;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.PasswordProvider;
+import org.apache.tika.parser.microsoft.ooxml.OOXMLParser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+public class ExcelParserTest extends TikaTest {
+ @Test
+ @SuppressWarnings("deprecation") // Checks legacy Tika-1.0 style metadata keys
+ public void testExcelParser() throws Exception {
+
+ ParseContext context = new ParseContext();
+ context.set(Locale.class, Locale.US);
+ XMLResult r = getXML("testEXCEL.xls", new OfficeParser(), new Metadata(), context);
+
+ assertEquals(
+ "application/vnd.ms-excel",
+ r.metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals("Simple Excel document", r.metadata.get(TikaCoreProperties.TITLE));
+ assertEquals("Keith Bennett", r.metadata.get(TikaCoreProperties.CREATOR));
+ assertEquals("Keith Bennett", r.metadata.get(Metadata.AUTHOR));
+
+ // Mon Oct 01 17:13:56 BST 2007
+ assertEquals("2007-10-01T16:13:56Z", r.metadata.get(TikaCoreProperties.CREATED));
+ assertEquals("2007-10-01T16:13:56Z", r.metadata.get(Metadata.CREATION_DATE));
+
+ // Mon Oct 01 17:31:43 BST 2007
+ assertEquals("2007-10-01T16:31:43Z", r.metadata.get(TikaCoreProperties.MODIFIED));
+ assertEquals("2007-10-01T16:31:43Z", r.metadata.get(Metadata.DATE));
+
+ String content = r.xml;
+ assertContains("Sample Excel Worksheet", content);
+ assertContains("Numbers and their Squares", content);
+ assertContains("<tr>\t<td />\t<td>Number</td>\t<td>Square", content);
+ assertContains("9", content);
+ assertNotContained("9.0", content);
+ assertContains("196", content);
+ assertNotContained("196.0", content);
+
+ }
+
+ @Test
+ public void testExcelParserFormatting() throws Exception {
+ ParseContext context = new ParseContext();
+ context.set(Locale.class, Locale.US);
+ XMLResult r = getXML("testEXCEL-formats.xls", new OfficeParser(), new Metadata(), context);
+
+ assertEquals(
+ "application/vnd.ms-excel",
+ r.metadata.get(Metadata.CONTENT_TYPE));
+
+ String content = r.xml;
+
+ // Number #,##0.00
+ assertContains("1,599.99", content);
+ assertContains("-1,599.99", content);
+
+ // Currency $#,##0.00;[Red]($#,##0.00)
+ assertContains("$1,599.99", content);
+ assertContains("($1,599.99)", content);
+
+ // Scientific 0.00E+00
+ // poi <=3.8beta1 returns 1.98E08, newer versions return 1.98+E08
+ assertTrue(content.contains("1.98E08") || content.contains("1.98E+08"));
+ assertTrue(content.contains("-1.98E08") || content.contains("-1.98E+08"));
+
+ // Percentage.
+ assertContains("2.50%", content);
+ // Excel rounds up to 3%, but that requires Java 1.6 or later
+ if (System.getProperty("java.version").startsWith("1.5")) {
+ assertContains("2%", content);
+ } else {
+ assertContains("3%", content);
+ }
+
+ // Time Format: h:mm
+ assertContains("6:15", content);
+ assertContains("18:15", content);
+
+ // Date Format: d-mmm-yy
+ assertContains("17-May-07", content);
+
+ // Date Format: m/d/yy
+ assertContains("10/3/09", content);
+
+ // Date/Time Format: m/d/yy h:mm
+ assertContains("1/19/08 4:35", content);
+
+ // Fraction (2.5): # ?/?
+ assertContains("2 1/2", content);
+
+
+ // Below assertions represent outstanding formatting issues to be addressed
+ // they are included to allow the issues to be progressed with the Apache POI
+ // team - See TIKA-103.
+
+ /*************************************************************************
+ // Custom Number (0 "dollars and" .00 "cents")
+ assertContains("19 dollars and .99 cents", content);
+
+ // Custom Number ("At" h:mm AM/PM "on" dddd mmmm d"," yyyy)
+ assertContains("At 4:20 AM on Thursday May 17, 2007", content);
+ **************************************************************************/
+
+
+ }
+
+ @Test
+ public void testExcelParserPassword() throws Exception {
+ try {
+ XMLResult r = getXML("testEXCEL_protected_passtika.xls");
+ fail("Document is encrypted, shouldn't parse");
+ } catch (EncryptedDocumentException e) {
+ // Good
+ }
+
+ // Try again, this time with the password
+ ParseContext context = new ParseContext();
+ context.set(Locale.class, Locale.US);
+ context.set(PasswordProvider.class, new PasswordProvider() {
+ @Override
+ public String getPassword(Metadata metadata) {
+ return "tika";
+ }
+ });
+ XMLResult r = getXML("testEXCEL_protected_passtika.xls", new OfficeParser(), new Metadata(), context);
+
+ assertEquals(
+ "application/vnd.ms-excel",
+ r.metadata.get(Metadata.CONTENT_TYPE));
+
+ assertEquals(null, r.metadata.get(TikaCoreProperties.TITLE));
+ assertEquals("Antoni", r.metadata.get(TikaCoreProperties.CREATOR));
+ assertEquals("2011-11-25T09:52:48Z", r.metadata.get(TikaCoreProperties.CREATED));
+
+ String content = r.xml;
+ assertContains("This is an Encrypted Excel spreadsheet", content);
+ assertNotContained("9.0", content);
+
+ }
+
+ /**
+ * TIKA-214 - Ensure we extract labels etc from Charts
+ */
+ @Test
+ public void testExcelParserCharts() throws Exception {
+
+ XMLResult r = getXML("testEXCEL-charts.xls", new OfficeParser());
+ assertEquals(
+ "application/vnd.ms-excel",
+ r.metadata.get(Metadata.CONTENT_TYPE));
+
+ String content = r.xml;
+
+ // The first sheet has a pie chart
+ assertContains("charttabyodawg", content);
+ assertContains("WhamPuff", content);
+
+ // The second sheet has a bar chart and some text
+ assertContains("Sheet1", content);
+ assertContains("Test Excel Spreasheet", content);
+ assertContains("foo", content);
+ assertContains("bar", content);
+ assertContains("fizzlepuff", content);
+ assertContains("whyaxis", content);
+ assertContains("eksaxis", content);
+
+ // The third sheet has some text
+ assertContains("Sheet2", content);
+ assertContains("dingdong", content);
+
+ }
+
+ @Test
+ public void testJXL() throws Exception {
+
+ XMLResult r = getXML("jxl.xls", new OfficeParser());
+ assertEquals(
+ "application/vnd.ms-excel",
+ r.metadata.get(Metadata.CONTENT_TYPE));
+ assertContains("Number Formats", r.xml);
+
+ }
+
+ @Test
+ public void testWorksSpreadsheet70() throws Exception {
+ assertContains("Microsoft Works",
+ getXML("testWORKSSpreadsheet7.0.xlr", new OfficeParser()).xml);
+ }
+
+ /**
+ * We don't currently support the .xlsb file format
+ * (an OOXML container with binary blobs), but we
+ * shouldn't break on these files either (TIKA-826)
+ */
+ @Test
+ public void testExcelXLSB() throws Exception {
+ Detector detector = new DefaultDetector();
+ AutoDetectParser parser = new AutoDetectParser();
+
+ Metadata m = new Metadata();
+ m.add(Metadata.RESOURCE_NAME_KEY, "excel.xlsb");
+
+ // Should be detected correctly
+ MediaType type;
+ try (InputStream input = getTestDocumentAsStream("testEXCEL.xlsb")) {
+ type = detector.detect(input, m);
+ assertEquals("application/vnd.ms-excel.sheet.binary.macroenabled.12", type.toString());
+ }
+
+ // OfficeParser won't handle it
+ assertEquals(false, (new OfficeParser()).getSupportedTypes(new ParseContext()).contains(type));
+
+ // OOXMLParser won't handle it
+ assertEquals(false, (new OOXMLParser()).getSupportedTypes(new ParseContext()).contains(type));
+
+ // AutoDetectParser doesn't break on it
+ assertContains("<body />", getXML("testEXCEL.xlsb").xml);
+
+ }
+
+ /**
+ * Excel 5 and 95 are older formats, and only get basic support
+ */
+ @Test
+ public void testExcel95() throws Exception {
+ Detector detector = new DefaultDetector();
+ AutoDetectParser parser = new AutoDetectParser();
+ MediaType type;
+ Metadata m;
+
+ // First try detection of Excel 5
+ m = new Metadata();
+ m.add(Metadata.RESOURCE_NAME_KEY, "excel_5.xls");
+ try (InputStream input = getTestDocumentAsStream("testEXCEL_5.xls")) {
+ type = detector.detect(input, m);
+ assertEquals("application/vnd.ms-excel", type.toString());
+ }
+
+ // Now Excel 95
+ m = new Metadata();
+ m.add(Metadata.RESOURCE_NAME_KEY, "excel_95.xls");
+ try (InputStream input = getTestDocumentAsStream("testEXCEL_95.xls")) {
+ type = detector.detect(input, m);
+ assertEquals("application/vnd.ms-excel", type.toString());
+ }
+
+ // OfficeParser can handle it
+ assertEquals(true, (new OfficeParser()).getSupportedTypes(new ParseContext()).contains(type));
+
+ // OOXMLParser won't handle it
+ assertEquals(false, (new OOXMLParser()).getSupportedTypes(new ParseContext()).contains(type));
+
+
+ // Parse the Excel 5 file
+ m = new Metadata();
+ try (InputStream input = getTestDocumentAsStream("testEXCEL_5.xls")) {
+ ContentHandler handler = new BodyContentHandler(-1);
+ ParseContext context = new ParseContext();
+ context.set(Locale.class, Locale.US);
+ parser.parse(input, handler, m, context);
+
+ String content = handler.toString();
+
+ // Sheet names
+ assertContains("Feuil1", content);
+ assertContains("Feuil3", content);
+
+ // Text
+ assertContains("Sample Excel", content);
+ assertContains("Number", content);
+
+ // Numbers
+ assertContains("15", content);
+ assertContains("225", content);
+
+ // Metadata was also fetched
+ assertEquals("Simple Excel document", m.get(TikaCoreProperties.TITLE));
+ assertEquals("Keith Bennett", m.get(TikaCoreProperties.CREATOR));
+ }
+
+ // Parse the Excel 95 file
+ m = new Metadata();
+ try (InputStream input = getTestDocumentAsStream("testEXCEL_95.xls")) {
+ ContentHandler handler = new BodyContentHandler(-1);
+ ParseContext context = new ParseContext();
+ context.set(Locale.class, Locale.US);
+ parser.parse(input, handler, m, context);
+
+ String content = handler.toString();
+
+ // Sheet name
+ assertContains("Foglio1", content);
+
+ // Very boring file, no actual text or numbers!
+
+ // Metadata was also fetched
+ assertEquals(null, m.get(TikaCoreProperties.TITLE));
+ assertEquals("Marco Quaranta", m.get(Office.LAST_AUTHOR));
+ }
+ }
+
+ /**
+ * Ensures that custom OLE2 (HPSF) properties are extracted
+ */
+ @Test
+ public void testCustomProperties() throws Exception {
+ ParseContext context = new ParseContext();
+ context.set(Locale.class, Locale.US);
+
+ XMLResult r = getXML("testEXCEL_custom_props.xls", new OfficeParser(), new Metadata(), context);
+ Metadata metadata = r.metadata;
+ assertEquals("application/vnd.ms-excel", metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals("", metadata.get(TikaCoreProperties.CREATOR));
+ assertEquals("", metadata.get(TikaCoreProperties.MODIFIER));
+ assertEquals("2011-08-22T13:45:54Z", metadata.get(TikaCoreProperties.MODIFIED));
+ assertEquals("2006-09-12T15:06:44Z", metadata.get(TikaCoreProperties.CREATED));
+ assertEquals("Microsoft Excel", metadata.get(OfficeOpenXMLExtended.APPLICATION));
+ assertEquals("true", metadata.get("custom:myCustomBoolean"));
+ assertEquals("3", metadata.get("custom:myCustomNumber"));
+ assertEquals("MyStringValue", metadata.get("custom:MyCustomString"));
+ assertEquals("2010-12-30T22:00:00Z", metadata.get("custom:MyCustomDate"));
+ assertEquals("2010-12-29T22:00:00Z", metadata.get("custom:myCustomSecondDate"));
+ }
+
+ @Test
+ public void testHeaderAndFooterExtraction() throws Exception {
+ ParseContext context = new ParseContext();
+ context.set(Locale.class, Locale.UK);
+
+ XMLResult r = getXML("testEXCEL_headers_footers.xls", new OfficeParser(),
+ new Metadata(), context);
+
+ Metadata metadata = r.metadata;
+ assertEquals(
+ "application/vnd.ms-excel",
+ metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals("Internal spreadsheet", metadata.get(TikaCoreProperties.TITLE));
+ assertEquals("Aeham Abushwashi", metadata.get(TikaCoreProperties.CREATOR));
+ assertEquals("Aeham Abushwashi", metadata.get(Metadata.AUTHOR));
+
+ String content = r.xml;
+ assertContains("John Smith1", content);
+ assertContains("John Smith50", content);
+ assertContains("1 Corporate HQ", content);
+ assertContains("Header - Corporate Spreadsheet", content);
+ assertContains("Header - For Internal Use Only", content);
+ assertContains("Header - Author: John Smith", content);
+ assertContains("Footer - Corporate Spreadsheet", content);
+ assertContains("Footer - For Internal Use Only", content);
+ assertContains("Footer - Author: John Smith", content);
+
+ }
+
+ @Test
+ public void testHyperlinksInXLS() throws Exception {
+ String xml = getXML("testEXCEL_hyperlinks.xls").xml;
+ //external url
+ assertContains("<a href=\"http://tika.apache.org/\">", xml);
+ //mail url
+ assertContains("<a href=\"mailto:user@tika.apache.org?subject=help\">", xml);
+ //external linked file
+ assertContains("<a href=\"linked_file.txt.htm\">", xml);
+
+ //TODO: not extracting these yet
+ //link on textbox
+// assertContains("<a href=\"http://tika.apache.org/1.12/gettingstarted.html\">", xml);
+ }
+
+ @Test
+ public void testEmbeddedPDF() throws Exception {
+ List<Metadata> metadataList = getRecursiveJson("testEXCEL_embeddedPDF.xls");
+ assertEquals("application/pdf", metadataList.get(2).get(Metadata.CONTENT_TYPE));
+ }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/OfficeParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/OfficeParserTest.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/OfficeParserTest.java
index 07644dd..beffee6 100644
--- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/OfficeParserTest.java
+++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/OfficeParserTest.java
@@ -1,46 +1,46 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.microsoft;
-
-import static org.junit.Assert.assertTrue;
-
-import java.io.InputStream;
-
-import org.apache.tika.TikaTest;
-import org.apache.tika.io.TikaInputStream;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.parser.Parser;
-import org.apache.tika.parser.microsoft.ooxml.OOXMLParserTest;
-import org.junit.Test;
-
-
-public class OfficeParserTest extends TikaTest {
-
- @Test
- public void parseOfficeWord() throws Exception {
- Metadata metadata = new Metadata();
- Parser parser = new OfficeParser();
-
- String xml = getXML(getTestDocument("test.doc"), parser, metadata).xml;
-
- assertTrue(xml.contains("test"));
- }
-
- private InputStream getTestDocument(String name) {
- return TikaInputStream.get(OOXMLParserTest.class.getResourceAsStream("/test-documents/" + name));
- }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import static org.junit.Assert.assertTrue;
+
+import java.io.InputStream;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.microsoft.ooxml.OOXMLParserTest;
+import org.junit.Test;
+
+
+public class OfficeParserTest extends TikaTest {
+
+ @Test
+ public void parseOfficeWord() throws Exception {
+ Metadata metadata = new Metadata();
+ Parser parser = new OfficeParser();
+
+ String xml = getXML(getTestDocument("test.doc"), parser, metadata).xml;
+
+ assertTrue(xml.contains("test"));
+ }
+
+ private InputStream getTestDocument(String name) {
+ return TikaInputStream.get(OOXMLParserTest.class.getResourceAsStream("/test-documents/" + name));
+ }
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
index fbf8114..8662e65 100644
--- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
+++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
@@ -1,239 +1,239 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.microsoft;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertFalse;
-import static org.junit.Assert.assertTrue;
-
-import javax.xml.transform.OutputKeys;
-import javax.xml.transform.sax.SAXTransformerFactory;
-import javax.xml.transform.sax.TransformerHandler;
-import javax.xml.transform.stream.StreamResult;
-import java.io.InputStream;
-import java.io.StringWriter;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
-import org.apache.tika.TikaTest;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.parser.AutoDetectParser;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.Parser;
-import org.apache.tika.sax.BodyContentHandler;
-import org.junit.Test;
-import org.xml.sax.ContentHandler;
-
-/**
- * Test case for parsing Outlook files.
- */
-public class OutlookParserTest extends TikaTest {
-
- @Test
- public void testOutlookParsing() throws Exception {
- Parser parser = new AutoDetectParser(); // Should auto-detect!
- ContentHandler handler = new BodyContentHandler();
- Metadata metadata = new Metadata();
-
- try (InputStream stream = OutlookParserTest.class.getResourceAsStream(
- "/test-documents/test-outlook.msg")) {
- parser.parse(stream, handler, metadata, new ParseContext());
- }
-
- assertEquals(
- "application/vnd.ms-outlook",
- metadata.get(Metadata.CONTENT_TYPE));
- assertEquals(
- "Microsoft Outlook Express 6",
- metadata.get(TikaCoreProperties.TITLE));
- assertEquals(
- "Nouvel utilisateur de Outlook Express",
- metadata.get(Metadata.MESSAGE_RECIPIENT_ADDRESS));
- assertEquals(
- "L'\u00C9quipe Microsoft Outlook Express",
- metadata.get(TikaCoreProperties.CREATOR));
- assertEquals(
- "L'\u00C9quipe Microsoft Outlook Express",
- metadata.get(Metadata.AUTHOR));
-
- // Stored as Thu, 5 Apr 2007 09:26:06 -0700
- assertEquals(
- "2007-04-05T16:26:06Z",
- metadata.get(TikaCoreProperties.CREATED));
-
- String content = handler.toString();
- assertContains("Microsoft Outlook Express 6", content);
- assertContains("L'\u00C9quipe Microsoft Outlook Express", content);
- assertContains("Nouvel utilisateur de Outlook Express", content);
- assertContains("Messagerie et groupes de discussion", content);
- }
-
- /**
- * Test case for TIKA-197
- *
- * @see <a href="https://issues.apache.org/jira/browse/TIKA-197">TIKA-197</a>
- */
- @Test
- public void testMultipleCopies() throws Exception {
- Parser parser = new AutoDetectParser();
- ContentHandler handler = new BodyContentHandler();
- Metadata metadata = new Metadata();
-
- try (InputStream stream = OutlookParserTest.class.getResourceAsStream(
- "/test-documents/testMSG.msg")) {
- parser.parse(stream, handler, metadata, new ParseContext());
- }
-
- assertEquals(
- "application/vnd.ms-outlook",
- metadata.get(Metadata.CONTENT_TYPE));
-
- String content = handler.toString();
- Pattern pattern = Pattern.compile("From");
- Matcher matcher = pattern.matcher(content);
- assertTrue(matcher.find());
- assertFalse(matcher.find());
- }
-
- /**
- * Test case for TIKA-395, to ensure parser works for new Outlook formats.
- *
- * @see <a href="https://issues.apache.org/jira/browse/TIKA-395">TIKA-395</a>
- */
- @Test
- public void testOutlookNew() throws Exception {
- Parser parser = new AutoDetectParser();
- ContentHandler handler = new BodyContentHandler();
- Metadata metadata = new Metadata();
-
- try (InputStream stream = OutlookParserTest.class.getResourceAsStream(
- "/test-documents/test-outlook2003.msg")) {
- parser.parse(stream, handler, metadata, new ParseContext());
- }
-
- assertEquals(
- "application/vnd.ms-outlook",
- metadata.get(Metadata.CONTENT_TYPE));
- assertEquals(
- "Welcome to Microsoft Office Outlook 2003",
- metadata.get(TikaCoreProperties.TITLE));
-
- String content = handler.toString();
- assertContains("Outlook 2003", content);
- assertContains("Streamlined Mail Experience", content);
- assertContains("Navigation Pane", content);
- }
-
- @Test
- public void testOutlookHTMLVersion() throws Exception {
- Parser parser = new AutoDetectParser();
- Metadata metadata = new Metadata();
-
- // Check the HTML version
- StringWriter sw = new StringWriter();
- SAXTransformerFactory factory = (SAXTransformerFactory)
- SAXTransformerFactory.newInstance();
- TransformerHandler handler = factory.newTransformerHandler();
- handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
- handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
- handler.setResult(new StreamResult(sw));
-
- try (InputStream stream = OutlookParserTest.class.getResourceAsStream(
- "/test-documents/testMSG_chinese.msg")) {
- parser.parse(stream, handler, metadata, new ParseContext());
- }
-
- // As the HTML version should have been processed, ensure
- // we got some of the links
- String content = sw.toString();
- assertContains("<dd>tests.chang@fengttt.com</dd>", content);
- assertContains("<p>Alfresco MSG format testing", content);
- assertContains("<li>1", content);
- assertContains("<li>2", content);
-
- // Make sure we don't have nested html docs
- assertEquals(2, content.split("<body>").length);
- assertEquals(2, content.split("<\\/body>").length);
-
- // Make sure that the Chinese actually came through
- assertContains("\u5F35\u6BD3\u502B", metadata.get(TikaCoreProperties.CREATOR));
- assertContains("\u9673\u60E0\u73CD", content);
- }
-
- @Test
- public void testOutlookForwarded() throws Exception {
- Parser parser = new AutoDetectParser();
- Metadata metadata = new Metadata();
-
- // Check the HTML version
- StringWriter sw = new StringWriter();
- SAXTransformerFactory factory = (SAXTransformerFactory)
- SAXTransformerFactory.newInstance();
- TransformerHandler handler = factory.newTransformerHandler();
- handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
- handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
- handler.setResult(new StreamResult(sw));
-
- try (InputStream stream = OutlookParserTest.class.getResourceAsStream(
- "/test-documents/testMSG_forwarded.msg")) {
- parser.parse(stream, handler, metadata, new ParseContext());
- }
-
- // Make sure we don't have nested docs
- String content = sw.toString();
- assertEquals(2, content.split("<body>").length);
- assertEquals(2, content.split("<\\/body>").length);
- }
-
- @Test
- public void testOutlookHTMLfromRTF() throws Exception {
- Parser parser = new AutoDetectParser();
- Metadata metadata = new Metadata();
-
- // Check the HTML version
- StringWriter sw = new StringWriter();
- SAXTransformerFactory factory = (SAXTransformerFactory)
- SAXTransformerFactory.newInstance();
- TransformerHandler handler = factory.newTransformerHandler();
- handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
- handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
- handler.setResult(new StreamResult(sw));
-
- try (InputStream stream = OutlookParserTest.class.getResourceAsStream(
- "/test-documents/test-outlook2003.msg")) {
- parser.parse(stream, handler, metadata, new ParseContext());
- }
-
- // As the HTML version should have been processed, ensure
- // we got some of the links
- String content = sw.toString().replaceAll("<p>\\s+", "<p>");
- assertContains("<dd>New Outlook User</dd>", content);
- assertContains("designed <i>to help you", content);
- assertContains("<p><a href=\"http://r.office.microsoft.com/r/rlidOutlookWelcomeMail10?clid=1033\">Cached Exchange Mode</a>", content);
-
- // Link - check text around it, and the link itself
- assertContains("sign up for a free subscription", content);
- assertContains("Office Newsletter", content);
- assertContains("newsletter will be sent to you", content);
- assertContains("http://r.office.microsoft.com/r/rlidNewsletterSignUp?clid=1033", content);
-
- // Make sure we don't have nested html docs
- assertEquals(2, content.split("<body>").length);
- assertEquals(2, content.split("<\\/body>").length);
- }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+
+import javax.xml.transform.OutputKeys;
+import javax.xml.transform.sax.SAXTransformerFactory;
+import javax.xml.transform.sax.TransformerHandler;
+import javax.xml.transform.stream.StreamResult;
+import java.io.InputStream;
+import java.io.StringWriter;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+/**
+ * Test case for parsing Outlook files.
+ */
+public class OutlookParserTest extends TikaTest {
+
+ @Test
+ public void testOutlookParsing() throws Exception {
+ Parser parser = new AutoDetectParser(); // Should auto-detect!
+ ContentHandler handler = new BodyContentHandler();
+ Metadata metadata = new Metadata();
+
+ try (InputStream stream = OutlookParserTest.class.getResourceAsStream(
+ "/test-documents/test-outlook.msg")) {
+ parser.parse(stream, handler, metadata, new ParseContext());
+ }
+
+ assertEquals(
+ "application/vnd.ms-outlook",
+ metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals(
+ "Microsoft Outlook Express 6",
+ metadata.get(TikaCoreProperties.TITLE));
+ assertEquals(
+ "Nouvel utilisateur de Outlook Express",
+ metadata.get(Metadata.MESSAGE_RECIPIENT_ADDRESS));
+ assertEquals(
+ "L'\u00C9quipe Microsoft Outlook Express",
+ metadata.get(TikaCoreProperties.CREATOR));
+ assertEquals(
+ "L'\u00C9quipe Microsoft Outlook Express",
+ metadata.get(Metadata.AUTHOR));
+
+ // Stored as Thu, 5 Apr 2007 09:26:06 -0700
+ assertEquals(
+ "2007-04-05T16:26:06Z",
+ metadata.get(TikaCoreProperties.CREATED));
+
+ String content = handler.toString();
+ assertContains("Microsoft Outlook Express 6", content);
+ assertContains("L'\u00C9quipe Microsoft Outlook Express", content);
+ assertContains("Nouvel utilisateur de Outlook Express", content);
+ assertContains("Messagerie et groupes de discussion", content);
+ }
+
+ /**
+ * Test case for TIKA-197
+ *
+ * @see <a href="https://issues.apache.org/jira/browse/TIKA-197">TIKA-197</a>
+ */
+ @Test
+ public void testMultipleCopies() throws Exception {
+ Parser parser = new AutoDetectParser();
+ ContentHandler handler = new BodyContentHandler();
+ Metadata metadata = new Metadata();
+
+ try (InputStream stream = OutlookParserTest.class.getResourceAsStream(
+ "/test-documents/testMSG.msg")) {
+ parser.parse(stream, handler, metadata, new ParseContext());
+ }
+
+ assertEquals(
+ "application/vnd.ms-outlook",
+ metadata.get(Metadata.CONTENT_TYPE));
+
+ String content = handler.toString();
+ Pattern pattern = Pattern.compile("From");
+ Matcher matcher = pattern.matcher(content);
+ assertTrue(matcher.find());
+ assertFalse(matcher.find());
+ }
+
+ /**
+ * Test case for TIKA-395, to ensure parser works for new Outlook formats.
+ *
+ * @see <a href="https://issues.apache.org/jira/browse/TIKA-395">TIKA-395</a>
+ */
+ @Test
+ public void testOutlookNew() throws Exception {
+ Parser parser = new AutoDetectParser();
+ ContentHandler handler = new BodyContentHandler();
+ Metadata metadata = new Metadata();
+
+ try (InputStream stream = OutlookParserTest.class.getResourceAsStream(
+ "/test-documents/test-outlook2003.msg")) {
+ parser.parse(stream, handler, metadata, new ParseContext());
+ }
+
+ assertEquals(
+ "application/vnd.ms-outlook",
+ metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals(
+ "Welcome to Microsoft Office Outlook 2003",
+ metadata.get(TikaCoreProperties.TITLE));
+
+ String content = handler.toString();
+ assertContains("Outlook 2003", content);
+ assertContains("Streamlined Mail Experience", content);
+ assertContains("Navigation Pane", content);
+ }
+
+ @Test
+ public void testOutlookHTMLVersion() throws Exception {
+ Parser parser = new AutoDetectParser();
+ Metadata metadata = new Metadata();
+
+ // Check the HTML version
+ StringWriter sw = new StringWriter();
+ SAXTransformerFactory factory = (SAXTransformerFactory)
+ SAXTransformerFactory.newInstance();
+ TransformerHandler handler = factory.newTransformerHandler();
+ handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
+ handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
+ handler.setResult(new StreamResult(sw));
+
+ try (InputStream stream = OutlookParserTest.class.getResourceAsStream(
+ "/test-documents/testMSG_chinese.msg")) {
+ parser.parse(stream, handler, metadata, new ParseContext());
+ }
+
+ // As the HTML version should have been processed, ensure
+ // we got some of the links
+ String content = sw.toString();
+ assertContains("<dd>tests.chang@fengttt.com</dd>", content);
+ assertContains("<p>Alfresco MSG format testing", content);
+ assertContains("<li>1", content);
+ assertContains("<li>2", content);
+
+ // Make sure we don't have nested html docs
+ assertEquals(2, content.split("<body>").length);
+ assertEquals(2, content.split("<\\/body>").length);
+
+ // Make sure that the Chinese actually came through
+ assertContains("\u5F35\u6BD3\u502B", metadata.get(TikaCoreProperties.CREATOR));
+ assertContains("\u9673\u60E0\u73CD", content);
+ }
+
+ @Test
+ public void testOutlookForwarded() throws Exception {
+ Parser parser = new AutoDetectParser();
+ Metadata metadata = new Metadata();
+
+ // Check the HTML version
+ StringWriter sw = new StringWriter();
+ SAXTransformerFactory factory = (SAXTransformerFactory)
+ SAXTransformerFactory.newInstance();
+ TransformerHandler handler = factory.newTransformerHandler();
+ handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
+ handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
+ handler.setResult(new StreamResult(sw));
+
+ try (InputStream stream = OutlookParserTest.class.getResourceAsStream(
+ "/test-documents/testMSG_forwarded.msg")) {
+ parser.parse(stream, handler, metadata, new ParseContext());
+ }
+
+ // Make sure we don't have nested docs
+ String content = sw.toString();
+ assertEquals(2, content.split("<body>").length);
+ assertEquals(2, content.split("<\\/body>").length);
+ }
+
+ @Test
+ public void testOutlookHTMLfromRTF() throws Exception {
+ Parser parser = new AutoDetectParser();
+ Metadata metadata = new Metadata();
+
+ // Check the HTML version
+ StringWriter sw = new StringWriter();
+ SAXTransformerFactory factory = (SAXTransformerFactory)
+ SAXTransformerFactory.newInstance();
+ TransformerHandler handler = factory.newTransformerHandler();
+ handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
+ handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
+ handler.setResult(new StreamResult(sw));
+
+ try (InputStream stream = OutlookParserTest.class.getResourceAsStream(
+ "/test-documents/test-outlook2003.msg")) {
+ parser.parse(stream, handler, metadata, new ParseContext());
+ }
+
+ // As the HTML version should have been processed, ensure
+ // we got some of the links
+ String content = sw.toString().replaceAll("<p>\\s+", "<p>");
+ assertContains("<dd>New Outlook User</dd>", content);
+ assertContains("designed <i>to help you", content);
+ assertContains("<p><a href=\"http://r.office.microsoft.com/r/rlidOutlookWelcomeMail10?clid=1033\">Cached Exchange Mode</a>", content);
+
+ // Link - check text around it, and the link itself
+ assertContains("sign up for a free subscription", content);
+ assertContains("Office Newsletter", content);
+ assertContains("newsletter will be sent to you", content);
+ assertContains("http://r.office.microsoft.com/r/rlidNewsletterSignUp?clid=1033", content);
+
+ // Make sure we don't have nested html docs
+ assertEquals(2, content.split("<body>").length);
+ assertEquals(2, content.split("<\\/body>").length);
+ }
+}