You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2008/07/10 01:59:04 UTC
svn commit: r675384 - in /incubator/tika/trunk/src:
main/java/org/apache/tika/parser/microsoft/ main/resources/
main/resources/mime/ test/java/org/apache/tika/
test/resources/test-documents/
Author: jukka
Date: Wed Jul 9 16:59:03 2008
New Revision: 675384
URL: http://svn.apache.org/viewvc?rev=675384&view=rev
Log:
TIKA-54: Outlook msg parser
- Patch by Dave Meikle
- Test file by Rida Benjelloun
Added:
incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OutlookMessageParser.java
incubator/tika/trunk/src/test/resources/test-documents/test-outlook.msg (with props)
Modified:
incubator/tika/trunk/src/main/resources/mime/tika-mimetypes.xml
incubator/tika/trunk/src/main/resources/tika-config.xml
incubator/tika/trunk/src/test/java/org/apache/tika/TestParsers.java
Added: incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OutlookMessageParser.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OutlookMessageParser.java?rev=675384&view=auto
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OutlookMessageParser.java (added)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/microsoft/OutlookMessageParser.java Wed Jul 9 16:59:03 2008
@@ -0,0 +1,55 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.poi.hsmf.MAPIMessage;
+import org.apache.poi.hsmf.exceptions.ChunkNotFoundException;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import java.io.InputStream;
+import java.io.IOException;
+
+/**
+ * Outlook Message Parser.
+ */
+public class OutlookMessageParser extends AbstractParser {
+ public void parse(InputStream stream, ContentHandler handler, Metadata metadata)
+ throws IOException, TikaException, SAXException {
+ try {
+ XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+ xhtml.startDocument();
+
+ MAPIMessage msg = new MAPIMessage(stream);
+ metadata.add("from", msg.getDisplayFrom());
+ metadata.add("to", msg.getDisplayTo());
+ metadata.add(Metadata.SUBJECT, msg.getSubject());
+ metadata.add("messageClass", msg.getMessageClass());
+ metadata.add("conversationTopic", msg.getConversationTopic());
+
+ xhtml.element("p", msg.getTextBody());
+ xhtml.endDocument();
+ }
+ catch (ChunkNotFoundException ex) {
+ throw new TikaException("Error parsing message.");
+ }
+ }
+}
Modified: incubator/tika/trunk/src/main/resources/mime/tika-mimetypes.xml
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/resources/mime/tika-mimetypes.xml?rev=675384&r1=675383&r2=675384&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/resources/mime/tika-mimetypes.xml (original)
+++ incubator/tika/trunk/src/main/resources/mime/tika-mimetypes.xml Wed Jul 9 16:59:03 2008
@@ -184,6 +184,10 @@
<alias type="application/msexcel" />
</mime-type>
+ <mime-type type="application/vnd.ms-outlook">
+ <glob pattern="*.msg" />
+ </mime-type>
+
<!-- ===================================================================== -->
<!-- Open Document Format for Office Applications (OpenDocument) v1.0 -->
<!-- http://www.oasis-open.org/specs/index.php#opendocumentv1.0 -->
Modified: incubator/tika/trunk/src/main/resources/tika-config.xml
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/resources/tika-config.xml?rev=675384&r1=675383&r2=675384&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/resources/tika-config.xml (original)
+++ incubator/tika/trunk/src/main/resources/tika-config.xml Wed Jul 9 16:59:03 2008
@@ -36,6 +36,10 @@
<mime>application/vnd.visio</mime>
</parser>
+ <parser name="parse-outlook" class="org.apache.tika.parser.microsoft.OutlookMessageParser">
+ <mime>application/vnd.ms-outlook</mime>
+ </parser>
+
<parser name="parse-html" class="org.apache.tika.parser.html.HtmlParser">
<mime>text/html</mime>
<mime>application/x-asp</mime>
Modified: incubator/tika/trunk/src/test/java/org/apache/tika/TestParsers.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/java/org/apache/tika/TestParsers.java?rev=675384&r1=675383&r2=675384&view=diff
==============================================================================
--- incubator/tika/trunk/src/test/java/org/apache/tika/TestParsers.java (original)
+++ incubator/tika/trunk/src/test/java/org/apache/tika/TestParsers.java Wed Jul 9 16:59:03 2008
@@ -153,6 +153,14 @@
assertEquals(s1, s2);
}
+ /** Text extracted without a MIME type argument must equal text extracted with the Outlook type. */
+ public void testOutlookExtraction() throws Exception {
+ File msgFile = getTestFile("test-outlook.msg");
+ String withoutType = ParseUtils.getStringContent(msgFile, tc);
+ String withType = ParseUtils.getStringContent(msgFile, tc, "application/vnd.ms-outlook");
+ assertEquals(withoutType, withType);
+ }
+
public void testHTMLExtraction() throws Exception {
File file = getTestFile("testHTML.html");
String s1 = ParseUtils.getStringContent(file, tc);
Added: incubator/tika/trunk/src/test/resources/test-documents/test-outlook.msg
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/resources/test-documents/test-outlook.msg?rev=675384&view=auto
==============================================================================
Binary file - no diff available.
Propchange: incubator/tika/trunk/src/test/resources/test-documents/test-outlook.msg
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream