You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@poi.apache.org by ni...@apache.org on 2011/04/01 16:51:45 UTC

svn commit: r1087726 - in /poi/trunk: src/documentation/content/xdocs/ src/scratchpad/src/org/apache/poi/hsmf/ src/scratchpad/src/org/apache/poi/hsmf/datatypes/ src/scratchpad/testcases/org/apache/poi/hsmf/ test-data/hsmf/

Author: nick
Date: Fri Apr  1 14:51:45 2011
New Revision: 1087726

URL: http://svn.apache.org/viewvc?rev=1087726&view=rev
Log:
Improve HSMF encoding guessing for 7 bit fields, and allow HSMF access to the HTML body contents in MAPIMessage

Added:
    poi/trunk/test-data/hsmf/chinese-traditional.msg   (with props)
Modified:
    poi/trunk/src/documentation/content/xdocs/status.xml
    poi/trunk/src/scratchpad/src/org/apache/poi/hsmf/MAPIMessage.java
    poi/trunk/src/scratchpad/src/org/apache/poi/hsmf/datatypes/Chunks.java
    poi/trunk/src/scratchpad/src/org/apache/poi/hsmf/datatypes/StringChunk.java
    poi/trunk/src/scratchpad/testcases/org/apache/poi/hsmf/TestBasics.java

Modified: poi/trunk/src/documentation/content/xdocs/status.xml
URL: http://svn.apache.org/viewvc/poi/trunk/src/documentation/content/xdocs/status.xml?rev=1087726&r1=1087725&r2=1087726&view=diff
==============================================================================
--- poi/trunk/src/documentation/content/xdocs/status.xml (original)
+++ poi/trunk/src/documentation/content/xdocs/status.xml Fri Apr  1 14:51:45 2011
@@ -33,6 +33,10 @@
     </developers>
 
     <changes>
+        <release version="3.8-beta3" date="2011-??-??">
+           <action dev="poi-developers" type="add">Improve HSMF encoding guessing for 7 bit fields in MAPIMessage</action>
+           <action dev="poi-developers" type="add">Allow HSMF access to the HTML body contents in MAPIMessage</action>
+        </release>
         <release version="3.8-beta2" date="2011-??-??">
            <action dev="poi-developers" type="add">Implement the load method on MemoryPackagePart</action>
            <action dev="poi-developers" type="add">50967 - Support for continued ExtSSTRecords</action>

Modified: poi/trunk/src/scratchpad/src/org/apache/poi/hsmf/MAPIMessage.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/src/org/apache/poi/hsmf/MAPIMessage.java?rev=1087726&r1=1087725&r2=1087726&view=diff
==============================================================================
--- poi/trunk/src/scratchpad/src/org/apache/poi/hsmf/MAPIMessage.java (original)
+++ poi/trunk/src/scratchpad/src/org/apache/poi/hsmf/MAPIMessage.java Fri Apr  1 14:51:45 2011
@@ -177,6 +177,16 @@ public class MAPIMessage extends POIDocu
    }
 
    /**
+    * Gets the html body of this Outlook Message, if this email
+    *  contains a html version.
+    * @return The string representation of the 'html' version of the body, if available.
+    * @throws ChunkNotFoundException
+    */
+   public String getHmtlBody() throws ChunkNotFoundException {
+      return getStringFromChunk(mainChunks.htmlBodyChunk);
+   }
+
+   /**
     * Gets the subject line of the Outlook Message
     * @throws ChunkNotFoundException
     */
@@ -331,28 +341,59 @@ public class MAPIMessage extends POIDocu
                if(m.matches()) {
                   // Found it! Tell all the string chunks
                   String charset = m.group(1);
-                  
-                  for(Chunk c : mainChunks.getAll()) {
-                     if(c instanceof StringChunk) {
-                        ((StringChunk)c).set7BitEncoding(charset);
-                     }
-                  }
-                  for(Chunk c : nameIdChunks.getAll()) {
-                     if(c instanceof StringChunk) {
-                        ((StringChunk)c).set7BitEncoding(charset);
-                     }
-                  }
-                  for(RecipientChunks rc : recipientChunks) {
-                     for(Chunk c : rc.getAll()) {
-                        if(c instanceof StringChunk) {
-                           ((StringChunk)c).set7BitEncoding(charset);
-                        }
-                     }
-                  }
+                  set7BitEncoding(charset);
+                  return;
                }
             }
          }
       } catch(ChunkNotFoundException e) {}
+      
+      // Nothing suitable in the headers, try HTML
+      try {
+         String html = getHmtlBody();
+         
+         // Look for a content type in the meta headers
+         Pattern p = Pattern.compile(
+               "<META\\s+HTTP-EQUIV=\"Content-Type\"\\s+CONTENT=\"text/html;\\s+charset=(.*?)\""
+         );
+         Matcher m = p.matcher(html);
+         if(m.find()) {
+            // Found it! Tell all the string chunks
+            String charset = m.group(1);
+            set7BitEncoding(charset);
+            return;
+         }
+      } catch(ChunkNotFoundException e) {}
+   }
+
+   /**
+    * Many messages store their strings as unicode, which is
+    *  nice and easy. Some use one-byte encodings for their
+    *  strings, but don't easily store the encoding anywhere
+    *  in the file!
+    * If you know what the encoding is of your file, you can
+    *  use this method to set the 7 bit encoding for all
+    *  the non unicode strings in the file.
+    * @see #guess7BitEncoding()
+    */
+   public void set7BitEncoding(String charset) {
+      for(Chunk c : mainChunks.getAll()) {
+         if(c instanceof StringChunk) {
+            ((StringChunk)c).set7BitEncoding(charset);
+         }
+      }
+      for(Chunk c : nameIdChunks.getAll()) {
+         if(c instanceof StringChunk) {
+            ((StringChunk)c).set7BitEncoding(charset);
+         }
+      }
+      for(RecipientChunks rc : recipientChunks) {
+         for(Chunk c : rc.getAll()) {
+            if(c instanceof StringChunk) {
+               ((StringChunk)c).set7BitEncoding(charset);
+            }
+         }
+      }
    }
    
    /**

Modified: poi/trunk/src/scratchpad/src/org/apache/poi/hsmf/datatypes/Chunks.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/src/org/apache/poi/hsmf/datatypes/Chunks.java?rev=1087726&r1=1087725&r2=1087726&view=diff
==============================================================================
--- poi/trunk/src/scratchpad/src/org/apache/poi/hsmf/datatypes/Chunks.java (original)
+++ poi/trunk/src/scratchpad/src/org/apache/poi/hsmf/datatypes/Chunks.java Fri Apr  1 14:51:45 2011
@@ -37,6 +37,8 @@ public final class Chunks implements Chu
    public StringChunk messageClass;
    /** BODY Chunk, for plain/text messages */
    public StringChunk textBodyChunk;
+   /** BODY Html Chunk, for html messages */
+   public StringChunk htmlBodyChunk;
    /** Subject link chunk, in plain/text */
    public StringChunk subjectChunk;
    /** Value that is in the TO field (not actually the addresses as they are stored in recip directory nodes */
@@ -117,6 +119,10 @@ public final class Chunks implements Chu
       else if(chunk.getChunkId() == MAPIProperty.BODY.id) {
          textBodyChunk = (StringChunk)chunk;
       }
+      else if(chunk.getChunkId() == MAPIProperty.BODY_HTML.id && 
+              chunk instanceof StringChunk) {
+         htmlBodyChunk = (StringChunk)chunk;
+      }
       
       // And add to the main list
       allChunks.add(chunk);

Modified: poi/trunk/src/scratchpad/src/org/apache/poi/hsmf/datatypes/StringChunk.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/src/org/apache/poi/hsmf/datatypes/StringChunk.java?rev=1087726&r1=1087725&r2=1087726&view=diff
==============================================================================
--- poi/trunk/src/scratchpad/src/org/apache/poi/hsmf/datatypes/StringChunk.java (original)
+++ poi/trunk/src/scratchpad/src/org/apache/poi/hsmf/datatypes/StringChunk.java Fri Apr  1 14:51:45 2011
@@ -30,8 +30,11 @@ import org.apache.poi.util.StringUtil;
  * A Chunk made up of a single string.
  */
 public class StringChunk extends Chunk {
-	private String value;
-	private String encoding7Bit = "CP1252";
+   private static final String DEFAULT_ENCODING = "CP1252"; 
+   private String encoding7Bit = DEFAULT_ENCODING;
+   private String value;
+   /** Only kept around for 7 bit strings */
+   private byte[] rawValue;
 
 	/**
 	 * Creates a String Chunk.
@@ -56,7 +59,7 @@ public class StringChunk extends Chunk {
 	public String get7BitEncoding() {
 	   return encoding7Bit;
 	}
-	
+
 	/**
 	 * Sets the Encoding that will be used to
 	 *  decode any "7 bit" (non unicode) data.
@@ -66,25 +69,33 @@ public class StringChunk extends Chunk {
 	 */
 	public void set7BitEncoding(String encoding) {
 	   this.encoding7Bit = encoding;
+
+	   // Re-read the String if we're a 7 bit one
+	   if(type == Types.ASCII_STRING) {
+	      parseString(rawValue);
+	   }
 	}
-	
+
 	public void readValue(InputStream value) throws IOException {
-      String tmpValue;
-      byte[] data = IOUtils.toByteArray(value);
-      
+	   byte[] data = IOUtils.toByteArray(value);
+	   parseString(data);
+	}
+	private void parseString(byte[] data) {
+	   String tmpValue;
 	   switch(type) {
 	   case Types.ASCII_STRING:
 	      tmpValue = parseAs7BitData(data, encoding7Bit);
-         break;
+	      this.rawValue = data;
+	      break;
 	   case Types.UNICODE_STRING:
 	      tmpValue = StringUtil.getFromUnicodeLE(data);
 	      break;
 	   default:
 	      throw new IllegalArgumentException("Invalid type " + type + " for String Chunk");
 	   }
-	   
+
 	   // Clean up
-		this.value = tmpValue.replace("\0", "");
+	   this.value = tmpValue.replace("\0", "");
 	}
 	
 	public void writeValue(OutputStream out) throws IOException {
@@ -121,7 +132,7 @@ public class StringChunk extends Chunk {
     *  and returns the string that that yields.
     */
    protected static String parseAs7BitData(byte[] data) {
-      return parseAs7BitData(data, "CP1252");
+      return parseAs7BitData(data, DEFAULT_ENCODING);
    }
    /**
     * Parses as non-unicode, supposedly 7 bit data

Modified: poi/trunk/src/scratchpad/testcases/org/apache/poi/hsmf/TestBasics.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/testcases/org/apache/poi/hsmf/TestBasics.java?rev=1087726&r1=1087725&r2=1087726&view=diff
==============================================================================
--- poi/trunk/src/scratchpad/testcases/org/apache/poi/hsmf/TestBasics.java (original)
+++ poi/trunk/src/scratchpad/testcases/org/apache/poi/hsmf/TestBasics.java Fri Apr  1 14:51:45 2011
@@ -35,6 +35,7 @@ public final class TestBasics extends Te
    private MAPIMessage attachments;
    private MAPIMessage noRecipientAddress;
    private MAPIMessage cyrillic;
+   private MAPIMessage chinese;
 
 	/**
 	 * Initialize this test, load up the blank.msg mapi message.
@@ -48,6 +49,7 @@ public final class TestBasics extends Te
       attachments = new MAPIMessage(samples.openResourceAsStream("attachment_test_msg.msg"));
       noRecipientAddress = new MAPIMessage(samples.openResourceAsStream("no_recipient_address.msg"));
       cyrillic = new MAPIMessage(samples.openResourceAsStream("cyrillic_message.msg"));
+      chinese = new MAPIMessage(samples.openResourceAsStream("chinese-traditional.msg"));
 	}
 	
 	/**
@@ -195,5 +197,27 @@ public final class TestBasics extends Te
       
       assertEquals("Cp1251", cyrillic.getRecipientDetailsChunks()[0].recipientDisplayNameChunk.get7BitEncoding());
       assertEquals("Cp1251", cyrillic.getRecipientDetailsChunks()[1].recipientDisplayNameChunk.get7BitEncoding());
+      
+      // Override it, check it's taken
+      cyrillic.set7BitEncoding("UTF-8");
+      assertEquals("UTF-8", cyrillic.getRecipientDetailsChunks()[0].recipientDisplayNameChunk.get7BitEncoding());
+      assertEquals("UTF-8", cyrillic.getRecipientDetailsChunks()[1].recipientDisplayNameChunk.get7BitEncoding());
+      
+      
+      // Check with a file that has no headers
+      try {
+         chinese.getHeaders();
+         fail("File doesn't have headers!");
+      } catch(ChunkNotFoundException e) {}
+      
+      String html = chinese.getHmtlBody();
+      assertTrue("Charset not found:\n" + html, html.contains("text/html; charset=big5"));
+      
+      // Defaults to CP1252
+      assertEquals("CP1252", chinese.getRecipientDetailsChunks()[0].recipientDisplayNameChunk.get7BitEncoding());
+      
+      // But after guessing goes to the correct one, Big 5
+      chinese.guess7BitEncoding();
+      assertEquals("big5", chinese.getRecipientDetailsChunks()[0].recipientDisplayNameChunk.get7BitEncoding());
    }
 }

Added: poi/trunk/test-data/hsmf/chinese-traditional.msg
URL: http://svn.apache.org/viewvc/poi/trunk/test-data/hsmf/chinese-traditional.msg?rev=1087726&view=auto
==============================================================================
Binary file - no diff available.

Propchange: poi/trunk/test-data/hsmf/chinese-traditional.msg
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream



---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@poi.apache.org
For additional commands, e-mail: commits-help@poi.apache.org