You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2011/09/23 22:54:37 UTC
svn commit: r1175014 - in /tika/trunk/tika-parsers/src:
main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
Author: nick
Date: Fri Sep 23 20:54:36 2011
New Revision: 1175014
URL: http://svn.apache.org/viewvc?rev=1175014&view=rev
Log:
Add a disabled Outlook RTF-related test, pending a fix for TIKA-632. (We're nearly there with the recent RTF improvements, but not quite...)
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java?rev=1175014&r1=1175013&r2=1175014&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java Fri Sep 23 20:54:36 2011
@@ -198,8 +198,8 @@ public class OutlookExtractor extends Ab
MAPIRtfAttribute rtf = new MAPIRtfAttribute(
MAPIProperty.RTF_COMPRESSED, Types.BINARY, chunk.getValue()
);
- RTFParser rtfParser = new RTFParser();
// Disabled pending a fix to TIKA-632
+// RTFParser rtfParser = new RTFParser();
// rtfParser.parse(
// new ByteArrayInputStream(rtf.getData()),
// xhtml, new Metadata(), new ParseContext()
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java?rev=1175014&r1=1175013&r2=1175014&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java Fri Sep 23 20:54:36 2011
@@ -172,4 +172,47 @@ public class OutlookParserTest extends T
assertEquals(2, content.split("<body>").length);
//assertEquals(2, content.split("<\\/body>").length); // TODO Fix
}
+
+ /**
+ * Disabled pending a fix for TIKA-632
+ */
+ public void DISABLEDtestOutlookHTMLfromRTF() throws Exception {
+ Parser parser = new AutoDetectParser();
+ Metadata metadata = new Metadata();
+
+ // Check the HTML version
+ StringWriter sw = new StringWriter();
+ SAXTransformerFactory factory = (SAXTransformerFactory)
+ SAXTransformerFactory.newInstance();
+ TransformerHandler handler = factory.newTransformerHandler();
+ handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
+ handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
+ handler.setResult(new StreamResult(sw));
+
+ InputStream stream = OutlookParserTest.class.getResourceAsStream(
+ "/test-documents/test-outlook2003.msg");
+ try {
+ parser.parse(stream, handler, metadata, new ParseContext());
+ } finally {
+ stream.close();
+ }
+
+ // As the HTML version should have been processed, ensure
+ // we got some of the links
+ String content = sw.toString().replaceAll("<p>\\s+","<p>");
+//System.err.println(content);
+ assertTrue(content.contains("<dd>New Outlook User</dd>"));
+ assertTrue(content.contains("designed <i>to help you"));
+ assertTrue(content.contains("<p>Cached Exchange Mode"));
+
+ // Link - check text around it, and the link itself
+ assertTrue(content.contains("sign up for a free subscription"));
+ assertTrue(content.contains("Office Newsletter"));
+ assertTrue(content.contains("newsletter will be sent to you"));
+ assertTrue(content.contains("http://r.office.microsoft.com/r/rlidNewsletterSignUp?clid=1033"));
+
+ // Make sure we don't have nested html docs
+ assertEquals(2, content.split("<body>").length);
+ //assertEquals(2, content.split("<\\/body>").length); // TODO Fix
+ }
}