You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2011/04/01 17:41:52 UTC

svn commit: r1087772 - /tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java

Author: nick
Date: Fri Apr  1 15:41:52 2011
New Revision: 1087772

URL: http://svn.apache.org/viewvc?rev=1087772&view=rev
Log:
TIKA-631 - Stub out the work for improving the outlook parsing WRT html body content and better encoding detection

Modified:
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java?rev=1087772&r1=1087771&r2=1087772&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java Fri Apr  1 15:41:52 2011
@@ -16,16 +16,24 @@
  */
 package org.apache.tika.parser.microsoft;
 
+import java.io.ByteArrayInputStream;
 import java.io.IOException;
 
 import org.apache.poi.hsmf.MAPIMessage;
 import org.apache.poi.hsmf.datatypes.AttachmentChunks;
+import org.apache.poi.hsmf.datatypes.ByteChunk;
+import org.apache.poi.hsmf.datatypes.Chunk;
+import org.apache.poi.hsmf.datatypes.MAPIProperty;
+import org.apache.poi.hsmf.datatypes.StringChunk;
+import org.apache.poi.hsmf.datatypes.Types;
 import org.apache.poi.hsmf.exceptions.ChunkNotFoundException;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.html.HtmlParser;
+import org.apache.tika.parser.txt.CharsetDetector;
 import org.apache.tika.sax.XHTMLContentHandler;
 import org.xml.sax.SAXException;
 
@@ -49,7 +57,34 @@ public class OutlookExtractor extends Ab
             throws TikaException, SAXException, IOException {
         try {
            msg.setReturnNullOnMissingChunk(true);
-          
+           
+           // If the message contains strings that aren't stored
+           //  as Unicode, try to sort out an encoding for them
+           // TODO Use new method
+           boolean hasNonUnicodeStrings = false;
+           for(Chunk chunk : msg.getMainChunks().getAll()) {
+              if(chunk instanceof StringChunk) {
+                 StringChunk sc = (StringChunk)chunk;
+                 if(sc.getType() == Types.ASCII_STRING) {
+                    hasNonUnicodeStrings = true;
+                    break;
+                 }
+              }
+           }
+           
+           if(hasNonUnicodeStrings) {
+              if(msg.getHeaders() != null) {
+                 // There's normally something in the headers
+                 msg.guess7BitEncoding();
+              } else {
+                 // Nothing in the header, try encoding detection
+                 //  on the message body
+                 CharsetDetector detector = new CharsetDetector();
+                 // TODO detect and use this
+              }
+           }
+           
+           // Start with the metadata
            String subject = msg.getSubject();
            String from = msg.getDisplayFrom();
    
@@ -81,7 +116,7 @@ public class OutlookExtractor extends Ab
                  if(headers != null && headers.length > 0) {
                      for(String header: headers) {
                         if(header.toLowerCase().startsWith("date:")) {
-                    	String date = header.substring(header.indexOf(':')+1);
+                            String date = header.substring(header.indexOf(':')+1);
                             metadata.set(Metadata.EDIT_TIME, date);
                             metadata.set(Metadata.LAST_SAVED, date);
                             break;
@@ -110,8 +145,47 @@ public class OutlookExtractor extends Ab
            } catch(ChunkNotFoundException e) {}
            xhtml.endElement("dl");
    
-           xhtml.element("p", msg.getTextBody());
+           // Get the message body. Preference order is: html, rtf, text
+           Chunk htmlChunk = null;
+           Chunk rtfChunk = null;
+           Chunk textChunk = null;
+           for(Chunk chunk : msg.getMainChunks().getAll()) {
+              if(chunk.getChunkId() == MAPIProperty.BODY_HTML.id) {
+                 htmlChunk = chunk;
+              }
+              if(chunk.getChunkId() == MAPIProperty.RTF_COMPRESSED.id) {
+                 rtfChunk = chunk;
+              }
+              if(chunk.getChunkId() == MAPIProperty.BODY.id) {
+                 textChunk = chunk;
+              }
+           }
+           
+           boolean doneBody = false;
+           if(htmlChunk != null) {
+              byte[] data = null;
+              if(htmlChunk instanceof ByteChunk) {
+                 data = ((ByteChunk)htmlChunk).getValue();
+              } else if(htmlChunk instanceof StringChunk) {
+                 // TODO Needs POI 3.8 beta 3
+              }
+              if(data != null) {
+                 HtmlParser htmlParser = new HtmlParser();
+                 htmlParser.parse(
+                       new ByteArrayInputStream(data),
+                       xhtml, new Metadata(), new ParseContext()
+                 );
+                 doneBody = true;
+              }
+           }
+           if(rtfChunk != null && !doneBody) {
+              // TODO Needs POI 3.8 beta 2 for TNEF support
+           }
+           if(textChunk != null && !doneBody) {
+              xhtml.element("p", ((StringChunk)textChunk).getValue());
+           }
            
+           // Process the attachments
            for (AttachmentChunks attachment : msg.getAttachmentFiles()) {
                xhtml.startElement("div", "class", "attachment-entry");