You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2008/07/10 01:59:04 UTC

svn commit: r675384 - in /incubator/tika/trunk/src: main/java/org/apache/tika/parser/microsoft/ main/resources/ main/resources/mime/ test/java/org/apache/tika/ test/resources/test-documents/

Author: jukka
Date: Wed Jul  9 16:59:03 2008
New Revision: 675384

URL: http://svn.apache.org/viewvc?rev=675384&view=rev
Log:
TIKA-54: Outlook msg parser
    - Patch by Dave Meikle
    - Test file by Rida Benjelloun

Added:
    incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OutlookMessageParser.java
    incubator/tika/trunk/src/test/resources/test-documents/test-outlook.msg   (with props)
Modified:
    incubator/tika/trunk/src/main/resources/mime/tika-mimetypes.xml
    incubator/tika/trunk/src/main/resources/tika-config.xml
    incubator/tika/trunk/src/test/java/org/apache/tika/TestParsers.java

Added: incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OutlookMessageParser.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OutlookMessageParser.java?rev=675384&view=auto
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OutlookMessageParser.java (added)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OutlookMessageParser.java Wed Jul  9 16:59:03 2008
@@ -0,0 +1,55 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.poi.hsmf.MAPIMessage;
+import org.apache.poi.hsmf.exceptions.ChunkNotFoundException;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import java.io.InputStream;
+import java.io.IOException;
+
+/**
+ * Outlook message parser built on POI's {@link MAPIMessage}: copies sender, recipients, subject,
+ * message class and conversation topic into the metadata and emits the text body as a paragraph.
+ */
+public class OutlookMessageParser extends AbstractParser {
+    public void parse(InputStream stream, ContentHandler handler, Metadata metadata)
+            throws IOException, TikaException, SAXException {
+        try {
+            XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+            xhtml.startDocument();
+            MAPIMessage msg = new MAPIMessage(stream);
+            metadata.add("from", msg.getDisplayFrom());
+            metadata.add("to", msg.getDisplayTo());
+            metadata.add(Metadata.SUBJECT, msg.getSubject());
+            metadata.add("messageClass", msg.getMessageClass());
+            metadata.add("conversationTopic", msg.getConversationTopic());
+
+            xhtml.element("p", msg.getTextBody());
+            xhtml.endDocument();
+        } catch (ChunkNotFoundException ex) {
+            // Chain the POI exception as the cause so the underlying failure is not lost.
+            throw new TikaException("Error parsing message.", ex);
+        }
+    }
+}

Modified: incubator/tika/trunk/src/main/resources/mime/tika-mimetypes.xml
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/resources/mime/tika-mimetypes.xml?rev=675384&r1=675383&r2=675384&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/resources/mime/tika-mimetypes.xml (original)
+++ incubator/tika/trunk/src/main/resources/mime/tika-mimetypes.xml Wed Jul  9 16:59:03 2008
@@ -184,6 +184,10 @@
     <alias type="application/msexcel" />
   </mime-type>
 
+  <mime-type type="application/vnd.ms-outlook">
+    <glob pattern="*.msg" />
+  </mime-type>
+
   <!-- ===================================================================== -->
   <!-- Open Document Format for Office Applications (OpenDocument) v1.0      -->
   <!-- http://www.oasis-open.org/specs/index.php#opendocumentv1.0            -->

Modified: incubator/tika/trunk/src/main/resources/tika-config.xml
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/resources/tika-config.xml?rev=675384&r1=675383&r2=675384&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/resources/tika-config.xml (original)
+++ incubator/tika/trunk/src/main/resources/tika-config.xml Wed Jul  9 16:59:03 2008
@@ -36,6 +36,10 @@
                 <mime>application/vnd.visio</mime>
         </parser>
 
+        <parser name="parse-outlook" class="org.apache.tika.parser.microsoft.OutlookMessageParser">
+                <mime>application/vnd.ms-outlook</mime>
+        </parser>
+
         <parser name="parse-html" class="org.apache.tika.parser.html.HtmlParser">
                 <mime>text/html</mime>
                 <mime>application/x-asp</mime>

Modified: incubator/tika/trunk/src/test/java/org/apache/tika/TestParsers.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/java/org/apache/tika/TestParsers.java?rev=675384&r1=675383&r2=675384&view=diff
==============================================================================
--- incubator/tika/trunk/src/test/java/org/apache/tika/TestParsers.java (original)
+++ incubator/tika/trunk/src/test/java/org/apache/tika/TestParsers.java Wed Jul  9 16:59:03 2008
@@ -153,6 +153,14 @@
         assertEquals(s1, s2);
     }
 
+    /** Extracting the sample .msg without and with an explicit MIME type must give equal text. */
+    public void testOutlookExtraction() throws Exception {
+        File msgFile = getTestFile("test-outlook.msg");
+        String withoutType = ParseUtils.getStringContent(msgFile, tc);
+        String withType = ParseUtils.getStringContent(msgFile, tc, "application/vnd.ms-outlook");
+        assertEquals(withoutType, withType);
+    }
+
     public void testHTMLExtraction() throws Exception {
         File file = getTestFile("testHTML.html");
         String s1 = ParseUtils.getStringContent(file, tc);

Added: incubator/tika/trunk/src/test/resources/test-documents/test-outlook.msg
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/resources/test-documents/test-outlook.msg?rev=675384&view=auto
==============================================================================
Binary file - no diff available.

Propchange: incubator/tika/trunk/src/test/resources/test-documents/test-outlook.msg
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream