You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2011/09/23 22:54:37 UTC

svn commit: r1175014 - in /tika/trunk/tika-parsers/src: main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java

Author: nick
Date: Fri Sep 23 20:54:36 2011
New Revision: 1175014

URL: http://svn.apache.org/viewvc?rev=1175014&view=rev
Log:
Add a disabled Outlook RTF-related test, pending a fix for TIKA-632. (We're nearly there with the recent RTF improvements, but not quite...)

Modified:
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java?rev=1175014&r1=1175013&r2=1175014&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java Fri Sep 23 20:54:36 2011
@@ -198,8 +198,8 @@ public class OutlookExtractor extends Ab
               MAPIRtfAttribute rtf = new MAPIRtfAttribute(
                     MAPIProperty.RTF_COMPRESSED, Types.BINARY, chunk.getValue()
               );
-              RTFParser rtfParser = new RTFParser();
               // Disabled pending a fix to TIKA-632
+//              RTFParser rtfParser = new RTFParser();
 //              rtfParser.parse(
 //                    new ByteArrayInputStream(rtf.getData()),
 //                    xhtml, new Metadata(), new ParseContext()

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java?rev=1175014&r1=1175013&r2=1175014&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java Fri Sep 23 20:54:36 2011
@@ -172,4 +172,47 @@ public class OutlookParserTest extends T
         assertEquals(2, content.split("<body>").length);
         //assertEquals(2, content.split("<\\/body>").length); // TODO Fix
     }
+    
+    /**
+     * Disabled pending a fix for TIKA-632
+     */
+    public void DISABLEDtestOutlookHTMLfromRTF() throws Exception {
+        Parser parser = new AutoDetectParser();
+        Metadata metadata = new Metadata();
+       
+        // Check the HTML version
+        StringWriter sw = new StringWriter();
+        SAXTransformerFactory factory = (SAXTransformerFactory)
+                 SAXTransformerFactory.newInstance();
+        TransformerHandler handler = factory.newTransformerHandler();
+        handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
+        handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
+        handler.setResult(new StreamResult(sw));
+
+        InputStream stream = OutlookParserTest.class.getResourceAsStream(
+                "/test-documents/test-outlook2003.msg");
+        try {
+           parser.parse(stream, handler, metadata, new ParseContext());
+        } finally {
+           stream.close();
+        }
+         
+        // As the HTML version should have been processed, ensure
+        //  we got some of the links
+        String content = sw.toString().replaceAll("<p>\\s+","<p>");
+//System.err.println(content);
+        assertTrue(content.contains("<dd>New Outlook User</dd>"));
+        assertTrue(content.contains("designed <i>to help you"));
+        assertTrue(content.contains("<p>Cached Exchange Mode"));
+        
+        // Link - check text around it, and the link itself
+        assertTrue(content.contains("sign up for a free subscription"));
+        assertTrue(content.contains("Office Newsletter"));
+        assertTrue(content.contains("newsletter will be sent to you"));
+        assertTrue(content.contains("http://r.office.microsoft.com/r/rlidNewsletterSignUp?clid=1033"));
+        
+        // Make sure we don't have nested html docs
+        assertEquals(2, content.split("<body>").length);
+        //assertEquals(2, content.split("<\\/body>").length); // TODO Fix
+    }
 }