You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@poi.apache.org by fa...@apache.org on 2019/05/26 09:43:59 UTC

svn commit: r1860043 - in /poi/trunk: src/java/org/apache/poi/util/ src/scratchpad/src/org/apache/poi/hsmf/ src/scratchpad/src/org/apache/poi/hsmf/datatypes/ src/scratchpad/testcases/org/apache/poi/hsmf/ test-data/hsmf/

Author: fanningpj
Date: Sun May 26 09:43:59 2019
New Revision: 1860043

URL: http://svn.apache.org/viewvc?rev=1860043&view=rev
Log:
[github-149] improve MAPIMessage.guess7BitEncoding, improve MAPIMessage.getHtmlBody. Thanks to Dominik Hölzl. This closes #149

Added:
    poi/trunk/src/scratchpad/testcases/org/apache/poi/hsmf/Test7BitCodepage.java   (with props)
    poi/trunk/test-data/hsmf/ASCII_CP1251_LCID1049.msg   (with props)
    poi/trunk/test-data/hsmf/ASCII_UTF-8_CP1252_LCID1031.msg   (with props)
    poi/trunk/test-data/hsmf/ASCII_UTF-8_CP1252_LCID1031_HTML.msg   (with props)
    poi/trunk/test-data/hsmf/HTMLBodyBinary_CP1251.msg   (with props)
    poi/trunk/test-data/hsmf/HTMLBodyBinary_UTF-8.msg   (with props)
Modified:
    poi/trunk/src/java/org/apache/poi/util/LocaleUtil.java
    poi/trunk/src/scratchpad/src/org/apache/poi/hsmf/MAPIMessage.java
    poi/trunk/src/scratchpad/src/org/apache/poi/hsmf/datatypes/MAPIProperty.java
    poi/trunk/src/scratchpad/testcases/org/apache/poi/hsmf/AllHSMFTests.java

Modified: poi/trunk/src/java/org/apache/poi/util/LocaleUtil.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/java/org/apache/poi/util/LocaleUtil.java?rev=1860043&r1=1860042&r2=1860043&view=diff
==============================================================================
--- poi/trunk/src/java/org/apache/poi/util/LocaleUtil.java (original)
+++ poi/trunk/src/java/org/apache/poi/util/LocaleUtil.java Sun May 26 09:43:59 2019
@@ -616,5 +616,473 @@ public final class LocaleUtil {
         }
     }
     
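+    /**
+     * Get the default code page for an LCID; only the low 16 bits (the language ID) are used.
+     *
+     * @param lcid the locale identifier (LCID) value
+     * @return the default code page, or 0 if no default code page is known for the language ID
+     */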
+    public static int getDefaultCodePageFromLCID(int lcid) {
+        int languageId = lcid & 0xFFFF;
+        switch (languageId) {
+        case 0x0001: return 1256;
+        case 0x0002: return 1251;
+        case 0x0003: return 1252;
+        case 0x0004: return 936;
+        case 0x0005: return 1250;
+        case 0x0006: return 1252;
+        case 0x0007: return 1252;
+        case 0x0008: return 1253;
+        case 0x0009: return 1252;
+        case 0x000a: return 1252;
+        case 0x000b: return 1252;
+        case 0x000c: return 1252;
+        case 0x000d: return 1255;
+        case 0x000e: return 1250;
+        case 0x000f: return 1252;
+        case 0x0010: return 1252;
+        case 0x0011: return 932;
+        case 0x0012: return 949;
+        case 0x0013: return 1252;
+        case 0x0014: return 1252;
+        case 0x0015: return 1250;
+        case 0x0016: return 1252;
+        case 0x0017: return 1252;
+        case 0x0018: return 1250;
+        case 0x0019: return 1251;
+        case 0x001a: return 1250;
+        case 0x001b: return 1250;
+        case 0x001c: return 1250;
+        case 0x001d: return 1252;
+        case 0x001e: return 874;
+        case 0x001f: return 1254;
+        case 0x0020: return 1256;
+        case 0x0021: return 1252;
+        case 0x0022: return 1251;
+        case 0x0023: return 1251;
+        case 0x0024: return 1250;
+        case 0x0025: return 1257;
+        case 0x0026: return 1257;
+        case 0x0027: return 1257;
+        case 0x0028: return 1251;
+        case 0x0029: return 1256;
+        case 0x002a: return 1258;
+        case 0x002b: return 0;
+        case 0x002c: return 1254;
+        case 0x002d: return 1252;
+        case 0x002e: return 1252;
+        case 0x002f: return 1251;
+        case 0x0030: return 0;
+        case 0x0031: return 0;
+        case 0x0032: return 1252;
+        case 0x0033: return 32759;
+        case 0x0034: return 1252;
+        case 0x0035: return 1252;
+        case 0x0036: return 1252;
+        case 0x0037: return 0;
+        case 0x0038: return 1252;
+        case 0x0039: return 0;
+        case 0x003a: return 0;
+        case 0x003b: return 1252;
+        case 0x003c: return 1252;
+        case 0x003d: return 32759;
+        case 0x003e: return 1252;
+        case 0x003f: return 0;
+        case 0x0040: return 1251;
+        case 0x0041: return 1252;
+        case 0x0042: return 1250;
+        case 0x0043: return 1254;
+        case 0x0044: return 1251;
+        case 0x0045: return 0;
+        case 0x0046: return 0;
+        case 0x0047: return 0;
+        case 0x0048: return 0;
+        case 0x0049: return 0;
+        case 0x004a: return 0;
+        case 0x004b: return 0;
+        case 0x004c: return 0;
+        case 0x004d: return 0;
+        case 0x004e: return 0;
+        case 0x004f: return 0;
+        case 0x0050: return 1251;
+        case 0x0051: return 0;
+        case 0x0052: return 1252;
+        case 0x0053: return 0;
+        case 0x0054: return 0;
+        case 0x0055: return 0;
+        case 0x0056: return 1252;
+        case 0x0057: return 0;
+        case 0x0058: return 32759;
+        case 0x0059: return 1256;
+        case 0x005a: return 0;
+        case 0x005b: return 0;
+        case 0x005c: return 0;
+        case 0x005d: return 1252;
+        case 0x005e: return 0;
+        case 0x005f: return 1252;
+        case 0x0060: return 32759;
+        case 0x0061: return 0;
+        case 0x0062: return 1252;
+        case 0x0063: return 0;
+        case 0x0064: return 1252;
+        case 0x0065: return 0;
+        case 0x0066: return 32759;
+        case 0x0067: return 1252;
+        case 0x0068: return 1252;
+        case 0x0069: return 32759;
+        case 0x006a: return 1252;
+        case 0x006b: return 1252;
+        case 0x006c: return 1252;
+        case 0x006d: return 1251;
+        case 0x006e: return 1252;
+        case 0x006f: return 1252;
+        case 0x0070: return 1252;
+        case 0x0071: return 32759;
+        case 0x0072: return 0;
+        case 0x0073: return 0;
+        case 0x0074: return 1252;
+        case 0x0075: return 1252;
+        case 0x0076: return 32759;
+        case 0x0077: return 0;
+        case 0x0078: return 0;
+        case 0x0079: return 32759;
+        case 0x007a: return 1252;
+        case 0x007b: return 32759;
+        case 0x007c: return 1252;
+        case 0x007d: return 32759;
+        case 0x007e: return 1252;
+        case 0x007f: return 1252;
+        case 0x0080: return 1256;
+        case 0x0081: return 0;
+        case 0x0082: return 1252;
+        case 0x0083: return 1252;
+        case 0x0084: return 1252;
+        case 0x0085: return 1251;
+        case 0x0086: return 1252;
+        case 0x0087: return 1252;
+        case 0x0088: return 1252;
+        case 0x0089: return 32759;
+        case 0x008a: return 32759;
+        case 0x008b: return 32759;
+        case 0x008c: return 1256;
+        case 0x008d: return 32759;
+        case 0x008e: return 32759;
+        case 0x008f: return 32759;
+        case 0x0090: return 32759;
+        case 0x0091: return 1252;
+        case 0x0092: return 1256;
+        case 0x0093: return 32759;
+        case 0x0401: return 1256;
+        case 0x0402: return 1251;
+        case 0x0403: return 1252;
+        case 0x0404: return 950;
+        case 0x0405: return 1250;
+        case 0x0406: return 1252;
+        case 0x0407: return 1252;
+        case 0x0408: return 1253;
+        case 0x0409: return 1252;
+        case 0x040a: return 1252;
+        case 0x040b: return 1252;
+        case 0x040c: return 1252;
+        case 0x040d: return 1255;
+        case 0x040e: return 1250;
+        case 0x040f: return 1252;
+        case 0x0410: return 1252;
+        case 0x0411: return 932;
+        case 0x0412: return 949;
+        case 0x0413: return 1252;
+        case 0x0414: return 1252;
+        case 0x0415: return 1250;
+        case 0x0416: return 1252;
+        case 0x0417: return 1252;
+        case 0x0418: return 1250;
+        case 0x0419: return 1251;
+        case 0x041a: return 1250;
+        case 0x041b: return 1250;
+        case 0x041c: return 1250;
+        case 0x041d: return 1252;
+        case 0x041e: return 874;
+        case 0x041f: return 1254;
+        case 0x0420: return 1256;
+        case 0x0421: return 1252;
+        case 0x0422: return 1251;
+        case 0x0423: return 1251;
+        case 0x0424: return 1250;
+        case 0x0425: return 1257;
+        case 0x0426: return 1257;
+        case 0x0427: return 1257;
+        case 0x0428: return 1251;
+        case 0x0429: return 1256;
+        case 0x042a: return 1258;
+        case 0x042b: return 0;
+        case 0x042c: return 1254;
+        case 0x042d: return 1252;
+        case 0x042e: return 1252;
+        case 0x042f: return 1251;
+        case 0x0430: return 0;
+        case 0x0431: return 0;
+        case 0x0432: return 1252;
+        case 0x0433: return 32759;
+        case 0x0434: return 1252;
+        case 0x0435: return 1252;
+        case 0x0436: return 1252;
+        case 0x0437: return 0;
+        case 0x0438: return 1252;
+        case 0x0439: return 0;
+        case 0x043a: return 0;
+        case 0x043b: return 1252;
+        case 0x043d: return 32759;
+        case 0x043e: return 1252;
+        case 0x043f: return 0;
+        case 0x0440: return 1251;
+        case 0x0441: return 1252;
+        case 0x0442: return 1250;
+        case 0x0443: return 1254;
+        case 0x0444: return 1251;
+        case 0x0445: return 0;
+        case 0x0446: return 0;
+        case 0x0447: return 0;
+        case 0x0448: return 0;
+        case 0x0449: return 0;
+        case 0x044a: return 0;
+        case 0x044b: return 0;
+        case 0x044c: return 0;
+        case 0x044d: return 0;
+        case 0x044e: return 0;
+        case 0x044f: return 0;
+        case 0x0450: return 1251;
+        case 0x0451: return 0;
+        case 0x0452: return 1252;
+        case 0x0453: return 0;
+        case 0x0454: return 0;
+        case 0x0455: return 0;
+        case 0x0456: return 1252;
+        case 0x0457: return 0;
+        case 0x0458: return 32759;
+        case 0x0459: return 32759;
+        case 0x045a: return 0;
+        case 0x045b: return 0;
+        case 0x045c: return 0;
+        case 0x045d: return 0;
+        case 0x045e: return 0;
+        case 0x045f: return 32759;
+        case 0x0460: return 32759;
+        case 0x0461: return 0;
+        case 0x0462: return 1252;
+        case 0x0463: return 0;
+        case 0x0464: return 1252;
+        case 0x0465: return 0;
+        case 0x0466: return 32759;
+        case 0x0467: return 32759;
+        case 0x0468: return 1252;
+        case 0x0469: return 32759;
+        case 0x046a: return 1252;
+        case 0x046b: return 1252;
+        case 0x046c: return 1252;
+        case 0x046d: return 1251;
+        case 0x046e: return 1252;
+        case 0x046f: return 1252;
+        case 0x0470: return 1252;
+        case 0x0471: return 32759;
+        case 0x0472: return 0;
+        case 0x0473: return 0;
+        case 0x0474: return 1252;
+        case 0x0475: return 1252;
+        case 0x0476: return 32759;
+        case 0x0477: return 0;
+        case 0x0478: return 0;
+        case 0x0479: return 32759;
+        case 0x047a: return 1252;
+        case 0x047c: return 1252;
+        case 0x047e: return 1252;
+        case 0x0480: return 1256;
+        case 0x0481: return 0;
+        case 0x0482: return 1252;
+        case 0x0483: return 1252;
+        case 0x0484: return 1252;
+        case 0x0485: return 1251;
+        case 0x0486: return 1252;
+        case 0x0487: return 1252;
+        case 0x0488: return 1252;
+        case 0x048c: return 1256;
+        case 0x048d: return 32759;
+        case 0x048e: return 32759;
+        case 0x048f: return 32759;
+        case 0x0490: return 32759;
+        case 0x0491: return 1252;
+        case 0x0492: return 1256;
+        case 0x0493: return 32759;
+        case 0x0501: return 1250;
+        case 0x05fe: return 932;
+        case 0x0801: return 1256;
+        case 0x0803: return 1252;
+        case 0x0804: return 936;
+        case 0x0807: return 1252;
+        case 0x0809: return 1252;
+        case 0x080a: return 1252;
+        case 0x080c: return 1252;
+        case 0x0810: return 1252;
+        case 0x0811: return 32759;
+        case 0x0813: return 1252;
+        case 0x0814: return 1252;
+        case 0x0816: return 1252;
+        case 0x0818: return 0;
+        case 0x0819: return 32759;
+        case 0x081a: return 1250;
+        case 0x081d: return 1252;
+        case 0x0820: return 0;
+        case 0x0827: return 32759;
+        case 0x082c: return 1251;
+        case 0x082e: return 1252;
+        case 0x0832: return 1252;
+        case 0x083b: return 1252;
+        case 0x083c: return 1252;
+        case 0x083e: return 1252;
+        case 0x0843: return 1251;
+        case 0x0845: return 0;
+        case 0x0846: return 1256;
+        case 0x0849: return 0;
+        case 0x0850: return 0;
+        case 0x0851: return 32759;
+        case 0x0859: return 1256;
+        case 0x085d: return 1252;
+        case 0x085f: return 1252;
+        case 0x0860: return 32759;
+        case 0x0861: return 0;
+        case 0x0867: return 1252;
+        case 0x086b: return 1252;
+        case 0x0873: return 0;
+        case 0x09ff: return 1256;
+        case 0x0c01: return 1256;
+        case 0x0c04: return 950;
+        case 0x0c07: return 1252;
+        case 0x0c09: return 1252;
+        case 0x0c0a: return 1252;
+        case 0x0c0c: return 1252;
+        case 0x0c1a: return 1251;
+        case 0x0c3b: return 1252;
+        case 0x0c5f: return 32759;
+        case 0x0c6b: return 1252;
+        case 0x1001: return 1256;
+        case 0x1004: return 936;
+        case 0x1007: return 1252;
+        case 0x1009: return 1252;
+        case 0x100a: return 1252;
+        case 0x100c: return 1252;
+        case 0x101a: return 1250;
+        case 0x103b: return 1252;
+        case 0x1401: return 1256;
+        case 0x1404: return 950;
+        case 0x1407: return 1252;
+        case 0x1409: return 1252;
+        case 0x140a: return 1252;
+        case 0x140c: return 1252;
+        case 0x141a: return 1250;
+        case 0x143b: return 1252;
+        case 0x1801: return 1256;
+        case 0x1809: return 1252;
+        case 0x180a: return 1252;
+        case 0x180c: return 1252;
+        case 0x181a: return 1250;
+        case 0x183b: return 1252;
+        case 0x1c01: return 1256;
+        case 0x1c09: return 1252;
+        case 0x1c0a: return 1252;
+        case 0x1c0c: return 32759;
+        case 0x1c1a: return 1251;
+        case 0x1c3b: return 1252;
+        case 0x2001: return 1256;
+        case 0x2008: return 32759;
+        case 0x2009: return 1252;
+        case 0x200a: return 1252;
+        case 0x200c: return 0;
+        case 0x201a: return 1251;
+        case 0x203b: return 1252;
+        case 0x2401: return 1256;
+        case 0x2409: return 1252;
+        case 0x240a: return 1252;
+        case 0x240c: return 0;
+        case 0x241a: return 1250;
+        case 0x243b: return 1252;
+        case 0x2801: return 1256;
+        case 0x2809: return 1252;
+        case 0x280a: return 1252;
+        case 0x280c: return 0;
+        case 0x281a: return 1251;
+        case 0x2c01: return 1256;
+        case 0x2c09: return 1252;
+        case 0x2c0a: return 1252;
+        case 0x2c0c: return 0;
+        case 0x2c1a: return 1250;
+        case 0x3001: return 1256;
+        case 0x3009: return 1252;
+        case 0x300a: return 1252;
+        case 0x300c: return 0;
+        case 0x301a: return 1251;
+        case 0x3401: return 1256;
+        case 0x3409: return 1252;
+        case 0x340a: return 1252;
+        case 0x340c: return 0;
+        case 0x3801: return 1256;
+        case 0x3809: return 32759;
+        case 0x380a: return 1252;
+        case 0x380c: return 0;
+        case 0x3c01: return 1256;
+        case 0x3c09: return 0;
+        case 0x3c0a: return 1252;
+        case 0x3c0c: return 0;
+        case 0x4001: return 1256;
+        case 0x4009: return 1252;
+        case 0x400a: return 1252;
+        case 0x4401: return 32759;
+        case 0x4409: return 1252;
+        case 0x440a: return 1252;
+        case 0x4801: return 32759;
+        case 0x4809: return 1252;
+        case 0x480a: return 1252;
+        case 0x4c09: return 32759;
+        case 0x4c0a: return 1252;
+        case 0x5009: return 32759;
+        case 0x500a: return 1252;
+        case 0x5409: return 32759;
+        case 0x540a: return 1252;
+        case 0x5809: return 32759;
+        case 0x5c09: return 32759;
+        case 0x6009: return 32759;
+        case 0x6409: return 32759;
+        case 0x641a: return 1251;
+        case 0x681a: return 1250;
+        case 0x6c1a: return 1251;
+        case 0x701a: return 1250;
+        case 0x703b: return 1252;
+        case 0x742c: return 1251;
+        case 0x743b: return 1252;
+        case 0x7804: return 936;
+        case 0x7814: return 1252;
+        case 0x781a: return 1250;
+        case 0x782c: return 1254;
+        case 0x783b: return 1252;
+        case 0x7843: return 1251;
+        case 0x7850: return 1251;
+        case 0x785d: return 0;
+        case 0x7c04: return 950;
+        case 0x7c14: return 1252;
+        case 0x7c1a: return 1250;
+        case 0x7c28: return 1251;
+        case 0x7c2e: return 1252;
+        case 0x7c3b: return 1252;
+        case 0x7c43: return 1254;
+        case 0x7c46: return 1256;
+        case 0x7c50: return 0;
+        case 0x7c59: return 1256;
+        case 0x7c5c: return 0;
+        case 0x7c5d: return 1252;
+        case 0x7c5f: return 1252;
+        case 0x7c67: return 1252;
+        case 0x7c68: return 1252;
+        case 0x7c92: return 1256;
+        default: return 0;
+        }
+    }
 }
 

Modified: poi/trunk/src/scratchpad/src/org/apache/poi/hsmf/MAPIMessage.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/src/org/apache/poi/hsmf/MAPIMessage.java?rev=1860043&r1=1860042&r2=1860043&view=diff
==============================================================================
--- poi/trunk/src/scratchpad/src/org/apache/poi/hsmf/MAPIMessage.java (original)
+++ poi/trunk/src/scratchpad/src/org/apache/poi/hsmf/MAPIMessage.java Sun May 26 09:43:59 2019
@@ -50,6 +50,7 @@ import org.apache.poi.hsmf.parsers.POIFS
 import org.apache.poi.poifs.filesystem.DirectoryNode;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
 import org.apache.poi.util.CodePageUtil;
+import org.apache.poi.util.LocaleUtil;
 import org.apache.poi.util.POILogFactory;
 import org.apache.poi.util.POILogger;
 
@@ -210,8 +211,21 @@ public class MAPIMessage extends POIRead
     *       returnNullOnMissingChunk is set
     */
    public String getHtmlBody() throws ChunkNotFoundException {
-      if(mainChunks.getHtmlBodyChunkBinary() != null) {
-         return mainChunks.getHtmlBodyChunkBinary().getAs7bitString();
+      ByteChunk htmlBodyBinaryChunk = mainChunks.getHtmlBodyChunkBinary();
+      if (htmlBodyBinaryChunk != null) {
+         List<PropertyValue> cpid = mainChunks.getProperties().get(MAPIProperty.INTERNET_CPID);
+         if (cpid != null && cpid.size() > 0) {
+            int codepage = ((LongPropertyValue) cpid.get(0)).getValue();
+            try {
+               String encoding = CodePageUtil.codepageToEncoding(codepage, true);
+               byte[] htmlBodyBinary = htmlBodyBinaryChunk.getValue();
+               return new String(htmlBodyBinary, encoding);
+            } catch (UnsupportedEncodingException e) {
+               logger.log(POILogger.WARN, "HTML body binary: Invalid codepage ID ", codepage, " set for the message via ",
+                  MAPIProperty.INTERNET_CPID, ", ignoring");
+            }
+         }
+         return htmlBodyBinaryChunk.getAs7bitString();
       }
       return getStringFromChunk(mainChunks.getHtmlBodyChunkString());
    }
@@ -391,67 +405,86 @@ public class MAPIMessage extends POIRead
     * <p>Bug #49441 has more on why this is needed</p>
     */
    public void guess7BitEncoding() {
-      // First choice is a codepage property
-      for (MAPIProperty prop : new MAPIProperty[] {
-               MAPIProperty.MESSAGE_CODEPAGE,
-               MAPIProperty.INTERNET_CPID
-      }) {
-        List<PropertyValue> val = mainChunks.getProperties().get(prop);
-        if (val != null && val.size() > 0) {
-           int codepage = ((LongPropertyValue)val.get(0)).getValue();
-           try {
-               String encoding = CodePageUtil.codepageToEncoding(codepage, true);
-               set7BitEncoding(encoding);
-               return;
-            } catch(UnsupportedEncodingException e) {
-               logger.log(POILogger.WARN, "Invalid codepage ID ", codepage, 
-                          " set for the message via ", prop, ", ignoring");
-            }
-         }
-      }
-     
-       
-      // Second choice is a charset on a content type header
-      try {
+     String generalcodepage = null;
+     String htmlbodycodepage = null;
+     String bodycodepage = null;
+     //
+     // General codepage: Message codepage property.
+     //
+     List<PropertyValue> val = mainChunks.getProperties().get(MAPIProperty.MESSAGE_CODEPAGE);
+     if (val != null && val.size() > 0) {
+       int codepage = ((LongPropertyValue) val.get(0)).getValue();
+       try {
+         String encoding = CodePageUtil.codepageToEncoding(codepage, true);
+         generalcodepage = encoding;
+       } catch (UnsupportedEncodingException e) {
+         logger.log(POILogger.WARN, "Invalid codepage ID ", codepage, " set for the message via ",
+             MAPIProperty.MESSAGE_CODEPAGE, ", ignoring");
+       }
+     }
+     //
+     // General codepage fallback: Message locale ID property.
+     //
+     if (generalcodepage == null) {
+       val = mainChunks.getProperties().get(MAPIProperty.MESSAGE_LOCALE_ID);
+       if (val != null && val.size() > 0) {
+         int lcid = ((LongPropertyValue) val.get(0)).getValue();
+         int codepage = LocaleUtil.getDefaultCodePageFromLCID(lcid);
+         try {
+           if (codepage != 0) {
+             String encoding = CodePageUtil.codepageToEncoding(codepage, true);
+             generalcodepage = encoding;
+           }
+         } catch (UnsupportedEncodingException e) {
+           logger.log(POILogger.WARN, "Invalid codepage ID ", codepage, " from locale ID ", lcid, " set for the message via ",
+               MAPIProperty.MESSAGE_LOCALE_ID, ", ignoring");
+         }
+       }
+     }
+     //
+     // General codepage fallback: Charset on a content type header.
+     //
+     if (generalcodepage == null) {
+       try {
          String[] headers = getHeaders();
-         if(headers != null && headers.length > 0) {
-            // Look for a content type with a charset
-            Pattern p = Pattern.compile("Content-Type:.*?charset=[\"']?([^;'\"]+)[\"']?", Pattern.CASE_INSENSITIVE);
-
-            for(String header : headers) {
-               if(header.startsWith("Content-Type")) {
-                  Matcher m = p.matcher(header);
-                  if(m.matches()) {
-                     // Found it! Tell all the string chunks
-                     String charset = m.group(1);
-
-                     if (!charset.equalsIgnoreCase("utf-8")) { 
-                        set7BitEncoding(charset);
-                     }
-                     return;
-                  }
+         if (headers != null && headers.length > 0) {
+           Pattern p = Pattern.compile("content-type:.*?charset=[\"']?([^;'\"]+)[\"']?", Pattern.CASE_INSENSITIVE);
+           for (String header : headers) {
+             if (header.toLowerCase().startsWith("content-type")) {
+               Matcher m = p.matcher(header);
+               if (m.matches()) {
+                 String encoding = m.group(1);
+                 generalcodepage = encoding;
                }
-            }
-         }
-      } catch(ChunkNotFoundException e) {}
-      
-      // Nothing suitable in the headers, try HTML
-      try {
-         String html = getHtmlBody();
-         if(html != null && html.length() > 0) {
-            // Look for a content type in the meta headers
-            Pattern p = Pattern.compile(
-                  "<META\\s+HTTP-EQUIV=\"Content-Type\"\\s+CONTENT=\"text/html;\\s+charset=(.*?)\""
-            );
-            Matcher m = p.matcher(html);
-            if(m.find()) {
-               // Found it! Tell all the string chunks
-               String charset = m.group(1);
-               set7BitEncoding(charset);
-            }
+             }
+           }
          }
-      } catch(ChunkNotFoundException e) {}
-   }
+       } catch (ChunkNotFoundException e) {
+       }
+     }
+     //
+     // HTML and text body encoding: Internet CPID property.
+     // UTF-8 is ignored for text body. This seems to be a special Outlook behavior.
+     //
+     val = mainChunks.getProperties().get(MAPIProperty.INTERNET_CPID);
+     if (val != null && val.size() > 0) {
+       int codepage = ((LongPropertyValue) val.get(0)).getValue();
+       try {
+         String encoding = CodePageUtil.codepageToEncoding(codepage, true);
+         htmlbodycodepage = encoding;
+         if (!encoding.equalsIgnoreCase("utf-8")) {
+           bodycodepage = encoding;
+         }
+       } catch (UnsupportedEncodingException e) {
+         logger.log(POILogger.WARN, "Invalid codepage ID ", codepage, " set for the message via ",
+             MAPIProperty.INTERNET_CPID, ", ignoring");
+       }
+     }
+     //
+     // Apply encoding
+     //
+     set7BitEncoding(generalcodepage, htmlbodycodepage, bodycodepage);
+  }
 
    /**
     * Many messages store their strings as unicode, which is
@@ -464,26 +497,41 @@ public class MAPIMessage extends POIRead
     * @see #guess7BitEncoding()
     */
    public void set7BitEncoding(String charset) {
+     set7BitEncoding(charset, charset, charset);
+   }
+   public void set7BitEncoding(String generalcharset, String htmlbodycharset, String bodycharset) {
       for(Chunk c : mainChunks.getChunks()) {
          if(c instanceof StringChunk) {
-            ((StringChunk)c).set7BitEncoding(charset);
-         }
-      }
-
-      if (nameIdChunks!=null) {
-         for(Chunk c : nameIdChunks.getChunks()) {
-            if(c instanceof StringChunk) {
-                ((StringChunk)c).set7BitEncoding(charset);
-            }
-         }
-      }
-
-      for(RecipientChunks rc : recipientChunks) {
-         for(Chunk c : rc.getAll()) {
-            if(c instanceof StringChunk) {
-               ((StringChunk)c).set7BitEncoding(charset);
-            }
-         }
+           if (c.getChunkId() == MAPIProperty.BODY_HTML.id) {
+             if (htmlbodycharset != null) {
+               ((StringChunk)c).set7BitEncoding(htmlbodycharset);
+             }
+           }
+           else if (c.getChunkId() == MAPIProperty.BODY.id) {
+             if (bodycharset != null) {
+               ((StringChunk)c).set7BitEncoding(bodycharset);
+             }
+           }
+           else if (generalcharset != null) {
+             ((StringChunk)c).set7BitEncoding(generalcharset);
+           }
+         }
+      }
+      if (generalcharset != null) {
+        if (nameIdChunks!=null) {
+           for(Chunk c : nameIdChunks.getChunks()) {
+              if(c instanceof StringChunk) {
+                  ((StringChunk)c).set7BitEncoding(generalcharset);
+              }
+           }
+        }
+        for(RecipientChunks rc : recipientChunks) {
+           for(Chunk c : rc.getAll()) {
+              if(c instanceof StringChunk) {
+                 ((StringChunk)c).set7BitEncoding(generalcharset);
+              }
+           }
+        }
       }
    }
    

Modified: poi/trunk/src/scratchpad/src/org/apache/poi/hsmf/datatypes/MAPIProperty.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/src/org/apache/poi/hsmf/datatypes/MAPIProperty.java?rev=1860043&r1=1860042&r2=1860043&view=diff
==============================================================================
--- poi/trunk/src/scratchpad/src/org/apache/poi/hsmf/datatypes/MAPIProperty.java (original)
+++ poi/trunk/src/scratchpad/src/org/apache/poi/hsmf/datatypes/MAPIProperty.java Sun May 26 09:43:59 2019
@@ -512,6 +512,8 @@ public class MAPIProperty {
         new MAPIProperty(0x1a, ASCII_STRING, "MessageClass", "PR_MESSAGE_CLASS");
     public static final MAPIProperty MESSAGE_CODEPAGE =
         new MAPIProperty(0x3ffd, Types.LONG,  "MessageCodepage", "PR_MESSAGE_CODEPAGE");
+    public static final MAPIProperty MESSAGE_LOCALE_ID =
+        new MAPIProperty(0x3ff1, Types.LONG,  "MessageLocaleId", "PR_MESSAGE_LOCALE_ID");
     public static final MAPIProperty MESSAGE_DELIVERY_ID =
         new MAPIProperty(0x1b, BINARY, "MessageDeliveryId", "PR_MESSAGE_DELIVERY_ID");
     public static final MAPIProperty MESSAGE_DELIVERY_TIME =

Modified: poi/trunk/src/scratchpad/testcases/org/apache/poi/hsmf/AllHSMFTests.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/testcases/org/apache/poi/hsmf/AllHSMFTests.java?rev=1860043&r1=1860042&r2=1860043&view=diff
==============================================================================
--- poi/trunk/src/scratchpad/testcases/org/apache/poi/hsmf/AllHSMFTests.java (original)
+++ poi/trunk/src/scratchpad/testcases/org/apache/poi/hsmf/AllHSMFTests.java Sun May 26 09:43:59 2019
@@ -39,7 +39,8 @@ import org.junit.runners.Suite;
     TestPOIFSChunkParser.class,
     TestMessageSubmissionChunkY2KRead.class,
     TestMessageSubmissionChunk.class,
-    TestExtractEmbeddedMSG.class
+    TestExtractEmbeddedMSG.class,
+    Test7BitCodepage.class
 })
 public class AllHSMFTests {
 }

Added: poi/trunk/src/scratchpad/testcases/org/apache/poi/hsmf/Test7BitCodepage.java
URL: http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/testcases/org/apache/poi/hsmf/Test7BitCodepage.java?rev=1860043&view=auto
==============================================================================
--- poi/trunk/src/scratchpad/testcases/org/apache/poi/hsmf/Test7BitCodepage.java (added)
+++ poi/trunk/src/scratchpad/testcases/org/apache/poi/hsmf/Test7BitCodepage.java Sun May 26 09:43:59 2019
@@ -0,0 +1,85 @@
+/* ====================================================================
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+
+package org.apache.poi.hsmf;
+
+import java.io.IOException;
+
+import junit.framework.TestCase;
+
+import org.apache.poi.POIDataSamples;
+
+/**
+ * Tests to verify if code page for general properties like subject,
+ * text body and html body is evaluated correctly.
+ */
+public final class Test7BitCodepage extends TestCase {
+   private final MAPIMessage ascii_cp1251_lcid1049;
+   private final MAPIMessage ascii_utf_8_cp1252_lcid1031;
+   private final MAPIMessage ascii_utf_8_cp1252_lcid1031_html;
+   private final MAPIMessage htmlbodybinary_cp1251;
+   private final MAPIMessage htmlbodybinary_utf_8;
+
+   /**
+    * Initialize this test, load up the messages.
+    * @throws IOException if one of the sample messages cannot be loaded
+    */
+   public Test7BitCodepage() throws IOException {
+       POIDataSamples samples = POIDataSamples.getHSMFInstance();
+       ascii_cp1251_lcid1049 = new MAPIMessage(samples.openResourceAsStream("ASCII_CP1251_LCID1049.msg"));
+       ascii_utf_8_cp1252_lcid1031  = new MAPIMessage(samples.openResourceAsStream("ASCII_UTF-8_CP1252_LCID1031.msg"));
+       ascii_utf_8_cp1252_lcid1031_html  = new MAPIMessage(samples.openResourceAsStream("ASCII_UTF-8_CP1252_LCID1031_HTML.msg"));
+       htmlbodybinary_cp1251 = new MAPIMessage(samples.openResourceAsStream("HTMLBodyBinary_CP1251.msg"));
+       htmlbodybinary_utf_8 = new MAPIMessage(samples.openResourceAsStream("HTMLBodyBinary_UTF-8.msg"));
+   }
+
+   /**
+    * Guess the 7-bit encoding of each sample message, then check that the subject, text body and HTML body are decoded correctly.
+    */
+   public void test7BitEncoding() throws Exception {
+       ascii_cp1251_lcid1049.guess7BitEncoding();
+       ascii_cp1251_lcid1049.setReturnNullOnMissingChunk(true);
+       ascii_utf_8_cp1252_lcid1031.guess7BitEncoding();
+       ascii_utf_8_cp1252_lcid1031.setReturnNullOnMissingChunk(true);
+       ascii_utf_8_cp1252_lcid1031_html.guess7BitEncoding();
+       ascii_utf_8_cp1252_lcid1031_html.setReturnNullOnMissingChunk(true);
+       htmlbodybinary_cp1251.guess7BitEncoding();
+       htmlbodybinary_cp1251.setReturnNullOnMissingChunk(true);
+       htmlbodybinary_utf_8.guess7BitEncoding();
+       htmlbodybinary_utf_8.setReturnNullOnMissingChunk(true);
+       
+       assertEquals("Subject автоматически Subject", ascii_cp1251_lcid1049.getSubject());
+       assertEquals("Body автоматически Body", ascii_cp1251_lcid1049.getTextBody());
+       assertEquals("<!DOCTYPE html><html><meta charset=\\\"windows-1251\\\"><body>HTML автоматически</body></html>", ascii_cp1251_lcid1049.getHtmlBody());
+      
+       assertEquals("Subject öäü Subject", ascii_utf_8_cp1252_lcid1031.getSubject());
+       assertEquals("Body öäü Body", ascii_utf_8_cp1252_lcid1031.getTextBody());
+       assertNull(ascii_utf_8_cp1252_lcid1031.getHtmlBody());
+       
+       assertEquals("Subject öäü Subject", ascii_utf_8_cp1252_lcid1031_html.getSubject());
+       assertEquals("Body öäü Body", ascii_utf_8_cp1252_lcid1031_html.getTextBody());
+       assertEquals("<!DOCTYPE html><html><meta charset=\\\"utf-8\\\"><body>HTML öäü</body></html>", ascii_utf_8_cp1252_lcid1031_html.getHtmlBody());
+       
+       assertEquals("Subject öäü Subject", htmlbodybinary_cp1251.getSubject());
+       assertNull(htmlbodybinary_cp1251.getTextBody());
+       assertEquals("<!DOCTYPE html><html><meta charset=\\\"utf-8\\\"><body>HTML автоматически</body></html>", htmlbodybinary_cp1251.getHtmlBody());
+       
+       assertEquals("Subject öäü Subject", htmlbodybinary_utf_8.getSubject());
+       assertNull(htmlbodybinary_utf_8.getTextBody());
+       assertEquals("<!DOCTYPE html><html><meta charset=\\\"utf-8\\\"><body>HTML öäü</body></html>", htmlbodybinary_utf_8.getHtmlBody());
+   }
+}

Propchange: poi/trunk/src/scratchpad/testcases/org/apache/poi/hsmf/Test7BitCodepage.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: poi/trunk/test-data/hsmf/ASCII_CP1251_LCID1049.msg
URL: http://svn.apache.org/viewvc/poi/trunk/test-data/hsmf/ASCII_CP1251_LCID1049.msg?rev=1860043&view=auto
==============================================================================
Binary file - no diff available.

Propchange: poi/trunk/test-data/hsmf/ASCII_CP1251_LCID1049.msg
------------------------------------------------------------------------------
    svn:mime-type = application/vnd.ms-outlook

Added: poi/trunk/test-data/hsmf/ASCII_UTF-8_CP1252_LCID1031.msg
URL: http://svn.apache.org/viewvc/poi/trunk/test-data/hsmf/ASCII_UTF-8_CP1252_LCID1031.msg?rev=1860043&view=auto
==============================================================================
Binary file - no diff available.

Propchange: poi/trunk/test-data/hsmf/ASCII_UTF-8_CP1252_LCID1031.msg
------------------------------------------------------------------------------
    svn:mime-type = application/vnd.ms-outlook

Added: poi/trunk/test-data/hsmf/ASCII_UTF-8_CP1252_LCID1031_HTML.msg
URL: http://svn.apache.org/viewvc/poi/trunk/test-data/hsmf/ASCII_UTF-8_CP1252_LCID1031_HTML.msg?rev=1860043&view=auto
==============================================================================
Binary file - no diff available.

Propchange: poi/trunk/test-data/hsmf/ASCII_UTF-8_CP1252_LCID1031_HTML.msg
------------------------------------------------------------------------------
    svn:mime-type = application/vnd.ms-outlook

Added: poi/trunk/test-data/hsmf/HTMLBodyBinary_CP1251.msg
URL: http://svn.apache.org/viewvc/poi/trunk/test-data/hsmf/HTMLBodyBinary_CP1251.msg?rev=1860043&view=auto
==============================================================================
Binary file - no diff available.

Propchange: poi/trunk/test-data/hsmf/HTMLBodyBinary_CP1251.msg
------------------------------------------------------------------------------
    svn:mime-type = application/vnd.ms-outlook

Added: poi/trunk/test-data/hsmf/HTMLBodyBinary_UTF-8.msg
URL: http://svn.apache.org/viewvc/poi/trunk/test-data/hsmf/HTMLBodyBinary_UTF-8.msg?rev=1860043&view=auto
==============================================================================
Binary file - no diff available.

Propchange: poi/trunk/test-data/hsmf/HTMLBodyBinary_UTF-8.msg
------------------------------------------------------------------------------
    svn:mime-type = application/vnd.ms-outlook



---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@poi.apache.org
For additional commands, e-mail: commits-help@poi.apache.org