You are viewing a plain text version of this content. The canonical link for it is here.
Posted to mime4j-dev@james.apache.org by mw...@apache.org on 2009/08/29 23:33:14 UTC
svn commit: r809204 - in /james/mime4j/trunk/core/src: main/java/org/apache/james/mime4j/codec/DecoderUtil.java test/java/org/apache/james/mime4j/codec/DecoderUtilTest.java

Author: mwiederkehr
Date: Sat Aug 29 21:33:13 2009
New Revision: 809204

URL: http://svn.apache.org/viewvc?rev=809204&view=rev
Log:
use regular expression to match encoded words (MIME4J-138)

Modified:
    james/mime4j/trunk/core/src/main/java/org/apache/james/mime4j/codec/DecoderUtil.java
    james/mime4j/trunk/core/src/test/java/org/apache/james/mime4j/codec/DecoderUtilTest.java

Modified: james/mime4j/trunk/core/src/main/java/org/apache/james/mime4j/codec/DecoderUtil.java
URL: http://svn.apache.org/viewvc/james/mime4j/trunk/core/src/main/java/org/apache/james/mime4j/codec/DecoderUtil.java?rev=809204&r1=809203&r2=809204&view=diff
==============================================================================
--- james/mime4j/trunk/core/src/main/java/org/apache/james/mime4j/codec/DecoderUtil.java (original)
+++ james/mime4j/trunk/core/src/main/java/org/apache/james/mime4j/codec/DecoderUtil.java Sat Aug 29 21:33:13 2009
@@ -23,6 +23,8 @@
 import java.io.ByteArrayOutputStream;
 import java.io.IOException;
 import java.io.UnsupportedEncodingException;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
@@ -33,7 +35,10 @@
  */
 public class DecoderUtil {
     private static Log log = LogFactory.getLog(DecoderUtil.class);
-    
+
+    private static final Pattern PATTERN_ENCODED_WORD = Pattern.compile(
+            "(.*?)=\\?([^\\?]+?)\\?(\\w)\\?([^\\?]+?)\\?=", Pattern.DOTALL);
+
     /**
      * Decodes a string containing quoted-printable encoded data. 
      * 
@@ -123,71 +128,57 @@
         byte[] decodedBytes = decodeQuotedPrintable(encodedText);
         return new String(decodedBytes, charset);
     }
-    
+
     /**
-     * Decodes a string containing encoded words as defined by RFC 2047.
-     * Encoded words in have the form 
-     * =?charset?enc?Encoded word?= where enc is either 'Q' or 'q' for 
-     * quoted-printable and 'B' or 'b' for Base64.
+     * Decodes a string containing encoded words as defined by RFC 2047. Encoded
+     * words have the form =?charset?enc?encoded-text?= where enc is either 'Q'
+     * or 'q' for quoted-printable and 'B' or 'b' for base64.
      * 
      * @param body the string to decode.
      * @return the decoded string.
      */
     public static String decodeEncodedWords(String body) {
-        int previousEnd = 0;
-        boolean previousWasEncoded = false;
+        int tailIndex = 0;
+        boolean lastMatchValid = false;
 
         StringBuilder sb = new StringBuilder();
 
-        while (true) {
-            int begin = body.indexOf("=?", previousEnd);
-            int end = begin == -1 ? -1 : body.indexOf("?=", begin + 2);
-            if (end == -1) {
-                if (previousEnd == 0)
-                    return body;
+        for (Matcher matcher = PATTERN_ENCODED_WORD.matcher(body); matcher.find();) {
+            String separator = matcher.group(1);
+            String mimeCharset = matcher.group(2);
+            String encoding = matcher.group(3);
+            String encodedText = matcher.group(4);
 
-                sb.append(body.substring(previousEnd));
-                return sb.toString();
-            }
-            end += 2;
-
-            String sep = body.substring(previousEnd, begin);
-
-            String decoded = decodeEncodedWord(body, begin, end);
+            String decoded = tryDecodeEncodedWord(mimeCharset, encoding, encodedText);
             if (decoded == null) {
-                sb.append(sep);
-                sb.append(body.substring(begin, end));
+                sb.append(matcher.group(0));
             } else {
-                if (!previousWasEncoded || !CharsetUtil.isWhitespace(sep)) {
-                    sb.append(sep);
+                if (!lastMatchValid || !CharsetUtil.isWhitespace(separator)) {
+                    sb.append(separator);
                 }
                 sb.append(decoded);
             }
 
-            previousEnd = end;
-            previousWasEncoded = decoded != null;
+            tailIndex = matcher.end();
+            lastMatchValid = decoded != null;
+        }
+
+        if (tailIndex == 0) {
+            return body;
+        } else {
+            sb.append(body.substring(tailIndex));
+            return sb.toString();
         }
     }
 
     // return null on error
-    private static String decodeEncodedWord(String body, int begin, int end) {
-        int qm1 = body.indexOf('?', begin + 2);
-        if (qm1 == end - 2)
-            return null;
-
-        int qm2 = body.indexOf('?', qm1 + 1);
-        if (qm2 == end - 2)
-            return null;
-
-        String mimeCharset = body.substring(begin + 2, qm1);
-        String encoding = body.substring(qm1 + 1, qm2);
-        String encodedText = body.substring(qm2 + 1, end - 2);
-
+    private static String tryDecodeEncodedWord(final String mimeCharset,
+            final String encoding, final String encodedText) {
         String charset = CharsetUtil.toJavaCharset(mimeCharset);
         if (charset == null) {
             if (log.isWarnEnabled()) {
                 log.warn("MIME charset '" + mimeCharset + "' in encoded word '"
-                        + body.substring(begin, end) + "' doesn't have a "
+                        + recombine(mimeCharset, encoding, encodedText) + "' doesn't have a "
                         + "corresponding Java charset");
             }
             return null;
@@ -195,7 +186,7 @@
             if (log.isWarnEnabled()) {
                 log.warn("Current JDK doesn't support decoding of charset '"
                         + charset + "' (MIME charset '" + mimeCharset
-                        + "' in encoded word '" + body.substring(begin, end)
+                        + "' in encoded word '" + recombine(mimeCharset, encoding, encodedText)
                         + "')");
             }
             return null;
@@ -204,7 +195,7 @@
         if (encodedText.length() == 0) {
             if (log.isWarnEnabled()) {
                 log.warn("Missing encoded text in encoded word: '"
-                        + body.substring(begin, end) + "'");
+                        + recombine(mimeCharset, encoding, encodedText) + "'");
             }
             return null;
         }
@@ -217,7 +208,7 @@
             } else {
                 if (log.isWarnEnabled()) {
                     log.warn("Warning: Unknown encoding in encoded word '"
-                            + body.substring(begin, end) + "'");
+                            + recombine(mimeCharset, encoding, encodedText) + "'");
                 }
                 return null;
             }
@@ -225,18 +216,23 @@
             // should not happen because of isDecodingSupported check above
             if (log.isWarnEnabled()) {
                 log.warn("Unsupported encoding in encoded word '"
-                        + body.substring(begin, end) + "'", e);
+                        + recombine(mimeCharset, encoding, encodedText) + "'", e);
             }
             return null;
         } catch (RuntimeException e) {
             if (log.isWarnEnabled()) {
                 log.warn("Could not decode encoded word '"
-                        + body.substring(begin, end) + "'", e);
+                        + recombine(mimeCharset, encoding, encodedText) + "'", e);
             }
             return null;
         }
     }
 
+    private static String recombine(final String mimeCharset,
+            final String encoding, final String encodedText) {
+        return "=?" + mimeCharset + "?" + encoding + "?" + encodedText + "?=";
+    }
+
     // Replace _ with =20
     private static String replaceUnderscores(String str) {
         // probably faster than String#replace(CharSequence, CharSequence)

Modified: james/mime4j/trunk/core/src/test/java/org/apache/james/mime4j/codec/DecoderUtilTest.java
URL: http://svn.apache.org/viewvc/james/mime4j/trunk/core/src/test/java/org/apache/james/mime4j/codec/DecoderUtilTest.java?rev=809204&r1=809203&r2=809204&view=diff
==============================================================================
--- james/mime4j/trunk/core/src/test/java/org/apache/james/mime4j/codec/DecoderUtilTest.java (original)
+++ james/mime4j/trunk/core/src/test/java/org/apache/james/mime4j/codec/DecoderUtilTest.java Sat Aug 29 21:33:13 2009
@@ -100,6 +100,16 @@
         assertEquals("a b", DecoderUtil.decodeEncodedWords("=?ISO-8859-1?Q?a?= =?ISO-8859-2?Q?_b?="));
     }
 
+    // see MIME4J-138
+    public void testEncodedTextMayStartWithAnEqualsSign() {
+        assertEquals(" foo", DecoderUtil.decodeEncodedWords("=?utf-8?Q?=20foo?="));
+        assertEquals("Re: How to place a view at the bottom with a 100% width",
+            DecoderUtil.decodeEncodedWords("=?utf-8?Q?Re:=20How=20to=20place=20a=20view=20at=20the=20bottom?= "
+                    + "=?utf-8?Q?=20with=20a=20100%=20width?="));
+        assertEquals("Test \u00fc and more",
+            DecoderUtil.decodeEncodedWords("Test =?ISO-8859-1?Q?=FC_?= =?ISO-8859-1?Q?and_more?="));
+    }
+
     public void testNonWhiteSpaceBetweenEncodedWordsIsRetained() {
         assertEquals("a b c", DecoderUtil.decodeEncodedWords("=?ISO-8859-1?Q?a?= b =?ISO-8859-1?Q?c?="));
         assertEquals("a\rb\nc", DecoderUtil.decodeEncodedWords("=?ISO-8859-1?Q?a?=\rb\n=?ISO-8859-1?Q?c?="));