You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@poi.apache.org by ni...@apache.org on 2010/08/03 18:06:21 UTC

svn commit: r981947 - in /poi/trunk/src: documentation/content/xdocs/status.xml scratchpad/src/org/apache/poi/hsmf/MAPIMessage.java scratchpad/src/org/apache/poi/hsmf/datatypes/StringChunk.java scratchpad/testcases/org/apache/poi/hsmf/TestBasics.java

Author: nick
Date: Tue Aug  3 16:06:21 2010
New Revision: 981947

URL: http://svn.apache.org/viewvc?rev=981947&view=rev
Log:
Fix bug #49441 - Allow overriding and guessing of HSMF non-unicode string encodings

Modified:
    poi/trunk/src/documentation/content/xdocs/status.xml
    poi/trunk/src/scratchpad/src/org/apache/poi/hsmf/MAPIMessage.java
    poi/trunk/src/scratchpad/src/org/apache/poi/hsmf/datatypes/StringChunk.java
    poi/trunk/src/scratchpad/testcases/org/apache/poi/hsmf/TestBasics.java

Modified: poi/trunk/src/documentation/content/xdocs/status.xml
URL: http://svn.apache.org/viewvc/poi/trunk/src/documentation/content/xdocs/status.xml?rev=981947&r1=981946&r2=981947&view=diff
==============================================================================
--- poi/trunk/src/documentation/content/xdocs/status.xml (original)
+++ poi/trunk/src/documentation/content/xdocs/status.xml Tue Aug  3 16:06:21 2010
@@ -34,6 +34,7 @@
 
     <changes>
         <release version="3.7-beta2" date="2010-??-??">
+           <action dev="POI-DEVELOPERS" type="add">49441 - Allow overriding and guessing of HSMF non-unicode string encodings</action>
            <action dev="POI-DEVELOPERS" type="fix">49689 - Allow the setting of user style names on newly created HSSF cell styles</action>
            <action dev="POI-DEVELOPERS" type="add">Make it easier to tell which content types each POIXMLTextExtractor handles</action>
            <action dev="POI-DEVELOPERS" type="fix">49649 - Added clone support for UserSView* and Feat* families of records</action>

Modified: poi/trunk/src/scratchpad/src/org/apache/poi/hsmf/MAPIMessage.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/src/org/apache/poi/hsmf/MAPIMessage.java?rev=981947&r1=981946&r2=981947&view=diff
==============================================================================
--- poi/trunk/src/scratchpad/src/org/apache/poi/hsmf/MAPIMessage.java (original)
+++ poi/trunk/src/scratchpad/src/org/apache/poi/hsmf/MAPIMessage.java Tue Aug  3 16:06:21 2010
@@ -25,10 +25,13 @@ import java.io.OutputStream;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Calendar;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
 
 import org.apache.poi.POIDocument;
 import org.apache.poi.hsmf.datatypes.AttachmentChunks;
 import org.apache.poi.hsmf.datatypes.AttachmentChunks.AttachmentChunksSorter;
+import org.apache.poi.hsmf.datatypes.Chunk;
 import org.apache.poi.hsmf.datatypes.ChunkGroup;
 import org.apache.poi.hsmf.datatypes.Chunks;
 import org.apache.poi.hsmf.datatypes.NameIdChunks;
@@ -286,10 +289,58 @@ public class MAPIMessage extends POIDocu
 
       return names;
    }
+   
+   /**
+    * Many messages store their strings as unicode, which is
+    *  nice and easy. Some use one-byte encodings for their
+    *  strings, but don't easily store the encoding anywhere
+    *  in the file!
+    * This method looks for a charset on the Content-Type
+    *  header and, if the JVM supports it, sets it on every
+    *  string chunk. Otherwise nothing is changed.
+    * Bug #49441 has more on why this is needed
+    */
+   public void guess7BitEncoding() {
+      String[] headers;
+      try {
+         headers = getHeaders();
+      } catch(ChunkNotFoundException e) {
+         return; // No headers stored, so nothing to guess from
+      }
+      if(headers == null || headers.length == 0) {
+         return;
+      }
 
+      // Capture only the charset token, so quotes and any later
+      //  parameters (eg "; format=flowed") are left out of it
+      Pattern p = Pattern.compile("Content-Type:.*?charset=[\"']?([^\"';\\s]+)");
+      for(String header : headers) {
+         Matcher m = p.matcher(header);
+         if(! m.lookingAt()) continue;
+         String charset = m.group(1);
+         try {
+            // An unusable name would break later string conversion
+            if(! java.nio.charset.Charset.isSupported(charset)) continue;
+         } catch(IllegalArgumentException e) {
+            continue; // Not even a legal charset name
+         }
+         // Found it! Tell all the string chunks
+         for(Chunk c : mainChunks.getAll()) {
+            if(c instanceof StringChunk) ((StringChunk)c).set7BitEncoding(charset);
+         }
+         for(Chunk c : nameIdChunks.getAll()) {
+            if(c instanceof StringChunk) ((StringChunk)c).set7BitEncoding(charset);
+         }
+         for(RecipientChunks rc : recipientChunks) {
+            for(Chunk c : rc.getAll()) {
+               if(c instanceof StringChunk) ((StringChunk)c).set7BitEncoding(charset);
+            }
+         }
+      }
+   }
    
    /**
-    * 
+    * Returns all the headers, one entry per line
     */
    public String[] getHeaders() throws ChunkNotFoundException {
       String headers = getStringFromChunk(mainChunks.messageHeaders);

Modified: poi/trunk/src/scratchpad/src/org/apache/poi/hsmf/datatypes/StringChunk.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/src/org/apache/poi/hsmf/datatypes/StringChunk.java?rev=981947&r1=981946&r2=981947&view=diff
==============================================================================
--- poi/trunk/src/scratchpad/src/org/apache/poi/hsmf/datatypes/StringChunk.java (original)
+++ poi/trunk/src/scratchpad/src/org/apache/poi/hsmf/datatypes/StringChunk.java Tue Aug  3 16:06:21 2010
@@ -30,8 +30,8 @@ import org.apache.poi.util.StringUtil;
  * A Chunk made up of a single string.
  */
 public class StringChunk extends Chunk {
-
 	private String value;
+	private String encoding7Bit = "CP1252";
 
 	/**
 	 * Creates a String Chunk.
@@ -48,13 +48,33 @@ public class StringChunk extends Chunk {
 	   super(chunkId, type);
 	}
 	
+	/**
+	 * Reports which encoding this chunk uses when
+	 *  decoding "7 bit" (non unicode) string data.
+	 * Unless changed, this is CP1252
+	 */
+	public String get7BitEncoding() {
+	   return this.encoding7Bit;
+	}
+	
+	/**
+	 * Changes the Encoding used for "7 bit" (non
+	 *  unicode) data, replacing the CP1252 default.
+	 * The file doesn't appear to record this in
+	 *  any specific place, so callers may have to
+	 *  guess it, eg from the message headers
+	 */
+	public void set7BitEncoding(String encoding) {
+	   encoding7Bit = encoding;
+	}
+	
 	public void readValue(InputStream value) throws IOException {
       String tmpValue;
       byte[] data = IOUtils.toByteArray(value);
       
 	   switch(type) {
 	   case Types.ASCII_STRING:
-	      tmpValue = parseAs7BitData(data);
+	      tmpValue = parseAs7BitData(data, encoding7Bit);
          break;
 	   case Types.UNICODE_STRING:
 	      tmpValue = StringUtil.getFromUnicodeLE(data);
@@ -73,9 +93,9 @@ public class StringChunk extends Chunk {
       switch(type) {
       case Types.ASCII_STRING:
          try {
-            data = value.getBytes("CP1252");
+            data = value.getBytes(encoding7Bit);
          } catch (UnsupportedEncodingException e) {
-            throw new RuntimeException("Core encoding not found, JVM broken?", e);
+            throw new RuntimeException("Encoding not found - " + encoding7Bit, e);
          }
          break;
       case Types.UNICODE_STRING:
@@ -101,10 +121,17 @@ public class StringChunk extends Chunk {
     *  and returns the string that that yields.
     */
    protected static String parseAs7BitData(byte[] data) {
+      return parseAs7BitData(data, "CP1252");
+   }
+   /**
+    * Parses as non-unicode, supposedly 7 bit data
+    *  and returns the string that that yields.
+    */
+   protected static String parseAs7BitData(byte[] data, String encoding) {
       try {
-         return new String(data, "CP1252");
+         return new String(data, encoding);
       } catch (UnsupportedEncodingException e) {
-         throw new RuntimeException("Core encoding not found, JVM broken?", e);
+         throw new RuntimeException("Encoding not found - " + encoding, e);
       }
    }
 }

Modified: poi/trunk/src/scratchpad/testcases/org/apache/poi/hsmf/TestBasics.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/testcases/org/apache/poi/hsmf/TestBasics.java?rev=981947&r1=981946&r2=981947&view=diff
==============================================================================
--- poi/trunk/src/scratchpad/testcases/org/apache/poi/hsmf/TestBasics.java (original)
+++ poi/trunk/src/scratchpad/testcases/org/apache/poi/hsmf/TestBasics.java Tue Aug  3 16:06:21 2010
@@ -34,6 +34,7 @@ public final class TestBasics extends Te
    private MAPIMessage outlook30;
    private MAPIMessage attachments;
    private MAPIMessage noRecipientAddress;
+   private MAPIMessage cyrillic;
 
 	/**
 	 * Initialize this test, load up the blank.msg mapi message.
@@ -46,6 +47,7 @@ public final class TestBasics extends Te
       outlook30  = new MAPIMessage(samples.openResourceAsStream("outlook_30_msg.msg"));
       attachments = new MAPIMessage(samples.openResourceAsStream("attachment_test_msg.msg"));
       noRecipientAddress = new MAPIMessage(samples.openResourceAsStream("no_recipient_address.msg"));
+      cyrillic = new MAPIMessage(samples.openResourceAsStream("cyrillic_message.msg"));
 	}
 	
 	/**
@@ -177,4 +179,21 @@ public final class TestBasics extends Te
       
       noRecipientAddress.setReturnNullOnMissingChunk(false);
 	}
+	
+   /**
+    * String chunks start out on the CP1252 default, and
+    *  guessing from the headers can then improve on it.
+    * The person who submitted this file in bug #49441
+    *  reports that it is really CP1251
+    */
+   public void testEncoding() throws Exception {
+      assertEquals(2, cyrillic.getRecipientDetailsChunks().length);
+      for(int i=0; i<2; i++) {
+         assertEquals("CP1252", cyrillic.getRecipientDetailsChunks()[i].recipientDisplayNameChunk.get7BitEncoding());
+      }
+      cyrillic.guess7BitEncoding();
+      for(int i=0; i<2; i++) {
+         assertEquals("Cp1251", cyrillic.getRecipientDetailsChunks()[i].recipientDisplayNameChunk.get7BitEncoding());
+      }
+   }
 }



---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@poi.apache.org
For additional commands, e-mail: commits-help@poi.apache.org