You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@qpid.apache.org by ro...@apache.org on 2014/09/23 15:31:23 UTC

svn commit: r1627019 - in /qpid/proton/trunk/proton-j/src: main/java/org/apache/qpid/proton/codec/EncoderImpl.java main/java/org/apache/qpid/proton/codec/StringType.java test/java/org/apache/qpid/proton/codec/StringTypeTest.java

Author: robbie
Date: Tue Sep 23 13:31:23 2014
New Revision: 1627019

URL: http://svn.apache.org/r1627019
Log:
PROTON-576: update String UTF-8 encoding to handle high range unicode characters / surrogate pairs

Applied patch from Dominic Evans with modifications by Rob Godfrey

Added:
    qpid/proton/trunk/proton-j/src/test/java/org/apache/qpid/proton/codec/StringTypeTest.java
Modified:
    qpid/proton/trunk/proton-j/src/main/java/org/apache/qpid/proton/codec/EncoderImpl.java
    qpid/proton/trunk/proton-j/src/main/java/org/apache/qpid/proton/codec/StringType.java

Modified: qpid/proton/trunk/proton-j/src/main/java/org/apache/qpid/proton/codec/EncoderImpl.java
URL: http://svn.apache.org/viewvc/qpid/proton/trunk/proton-j/src/main/java/org/apache/qpid/proton/codec/EncoderImpl.java?rev=1627019&r1=1627018&r2=1627019&view=diff
==============================================================================
--- qpid/proton/trunk/proton-j/src/main/java/org/apache/qpid/proton/codec/EncoderImpl.java (original)
+++ qpid/proton/trunk/proton-j/src/main/java/org/apache/qpid/proton/codec/EncoderImpl.java Tue Sep 23 13:31:23 2014
@@ -21,7 +21,11 @@
 package org.apache.qpid.proton.codec;
 
 import java.nio.ByteBuffer;
-import java.util.*;
+import java.util.Date;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.UUID;
 
 import org.apache.qpid.proton.amqp.Binary;
 import org.apache.qpid.proton.amqp.Decimal128;
@@ -770,31 +774,60 @@ public final class EncoderImpl implement
     void writeRaw(String string)
     {
         final int length = string.length();
-        char c;
+        int c;
 
         for (int i = 0; i < length; i++)
         {
             c = string.charAt(i);
-            if ((c >= 0x0001) && (c <= 0x007F))
+            if ((c & 0xFF80) == 0)          /* U+0000..U+007F */
             {
                 _buffer.put((byte) c);
-
             }
-            else if (c > 0x07FF)
+            else if ((c & 0xF800) == 0)     /* U+0080..U+07FF */
             {
-                _buffer.put((byte) (0xE0 | ((c >> 12) & 0x0F)));
-                _buffer.put((byte) (0x80 | ((c >>  6) & 0x3F)));
-                _buffer.put((byte) (0x80 | (c & 0x3F)));
+                _buffer.put((byte)(0xC0 | ((c >> 6) & 0x1F)));
+                _buffer.put((byte)(0x80 | (c & 0x3F)));
             }
-            else
+            else if ((c & 0xD800) != 0xD800)     /* U+0800..U+FFFF - excluding surrogate pairs */
             {
-                _buffer.put((byte) (0xC0 | ((c >>  6) & 0x1F)));
-                _buffer.put((byte) (0x80 | (c & 0x3F)));
+                _buffer.put((byte)(0xE0 | ((c >> 12) & 0x0F)));
+                _buffer.put((byte)(0x80 | ((c >> 6) & 0x3F)));
+                _buffer.put((byte)(0x80 | (c & 0x3F)));
             }
-        }
+            else
+            {
+                int low;
 
-    }
+                if(((c & 0xDC00) == 0xDC00) || (++i == length) || ((low = string.charAt(i)) & 0xDC00) != 0xDC00)
+                {
+                    throw new IllegalArgumentException("String contains invalid Unicode code points");
+                }
 
+                c = 0x010000 + ((c & 0x03FF) << 10) + (low & 0x03FF);
 
 
+                if (c <= 0x3FFFF)     /* U+10000..U+3FFFF */
+                {
+                    _buffer.put((byte) 0xF0);
+                    _buffer.put((byte)(0x90 | ((c >> 12) & 0x2F)));
+                    _buffer.put((byte)(0x80 | ((c >> 6) & 0x3F)));
+                    _buffer.put((byte)(0x80 | (c & 0x3F)));
+                }
+                else if (c <= 0xFFFFF)     /* U+40000..U+FFFFF */
+                {
+                    _buffer.put((byte)(0xF0 | ((c >> 18) & 0x03)));
+                    _buffer.put((byte)(0x80 | ((c >> 12) & 0x3F)));
+                    _buffer.put((byte)(0x80 | ((c >> 6) & 0x3F)));
+                    _buffer.put((byte)(0x80 | (c & 0x3F)));
+                }
+                else                      /* U+100000..U+10FFFF */
+                {
+                    _buffer.put((byte)(0xF4));
+                    _buffer.put((byte)(0x80 | ((c >> 12) & 0x3F)));
+                    _buffer.put((byte)(0x80 | ((c >> 6) & 0x3F)));
+                    _buffer.put((byte)(0x80 | (c & 0x3F)));
+                }
+            }
+        }
+    }
 }

Modified: qpid/proton/trunk/proton-j/src/main/java/org/apache/qpid/proton/codec/StringType.java
URL: http://svn.apache.org/viewvc/qpid/proton/trunk/proton-j/src/main/java/org/apache/qpid/proton/codec/StringType.java?rev=1627019&r1=1627018&r2=1627019&view=diff
==============================================================================
--- qpid/proton/trunk/proton-j/src/main/java/org/apache/qpid/proton/codec/StringType.java (original)
+++ qpid/proton/trunk/proton-j/src/main/java/org/apache/qpid/proton/codec/StringType.java Tue Sep 23 13:31:23 2014
@@ -83,29 +83,22 @@ public class StringType extends Abstract
         return encoding;
     }
 
-    private static int calculateUTF8Length(final String s)
+    static int calculateUTF8Length(final String s)
     {
         int len = s.length();
-        int i = 0;
-        final int length = s.length();
-        while(i < length)
+        final int length = len;
+        for (int i = 0; i < length; i++)
         {
-            char c = s.charAt(i);
-            if(c > 127)
+            int c = s.charAt(i);
+            if ((c & 0xFF80) != 0)         /* U+0080..    */
             {
                 len++;
-                if(c > 0x07ff)
+                // surrogate pairs should always combine to create a code point with a 4 octet representation
+                if(((c & 0xF800) != 0) && ((c & 0xD800) != 0xD800))     /* U+0800..  excluding surrogate pairs  */
                 {
                     len++;
-                    if(c >= 0xD800 && c <= 0xDBFF)
-                    {
-                        i++;
-                        len++;
-                    }
                 }
             }
-            i++;
-
         }
         return len;
     }

Added: qpid/proton/trunk/proton-j/src/test/java/org/apache/qpid/proton/codec/StringTypeTest.java
URL: http://svn.apache.org/viewvc/qpid/proton/trunk/proton-j/src/test/java/org/apache/qpid/proton/codec/StringTypeTest.java?rev=1627019&view=auto
==============================================================================
--- qpid/proton/trunk/proton-j/src/test/java/org/apache/qpid/proton/codec/StringTypeTest.java (added)
+++ qpid/proton/trunk/proton-j/src/test/java/org/apache/qpid/proton/codec/StringTypeTest.java Tue Sep 23 13:31:23 2014
@@ -0,0 +1,140 @@
+/*
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ */
+package org.apache.qpid.proton.codec;
+
+import static org.junit.Assert.assertEquals;
+
+import java.lang.Character.UnicodeBlock;
+import java.nio.ByteBuffer;
+import java.nio.charset.StandardCharsets;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.junit.Test;
+
+import org.apache.qpid.proton.amqp.messaging.AmqpValue;
+
+/**
+ * Test the encoding and decoding of {@link StringType} values.
+ */
+public class StringTypeTest
+{
+    /**
+     * Loop over all the code points in a given {@link UnicodeBlock} and return a
+     * {@code Set<String>} containing every code point in the block, each
+     * represented as its own {@link String} value.
+     *
+     * @param block the {@link UnicodeBlock} to loop over
+     * @return a {@code Set<String>} containing all the code points in the block as
+     * {@link String} values
+     */
+    private static Set<String> getAllStringsFromUnicodeBlock(final UnicodeBlock block)
+    {
+        final Set<String> strings = new HashSet<String>();
+        for (int codePoint = 0; codePoint <= Character.MAX_CODE_POINT; codePoint++)
+        {
+            if (UnicodeBlock.of(codePoint) == block)
+            {
+                final int charCount = Character.charCount(codePoint);
+                final StringBuilder sb = new StringBuilder(
+                        charCount);
+                if (charCount == 1)
+                {
+                    sb.append(String.valueOf((char) codePoint));
+                }
+                else if (charCount == 2)
+                {
+                    sb.append(Character.highSurrogate(codePoint));
+                    sb.append(Character.lowSurrogate(codePoint));
+                }
+                else
+                {
+                    throw new IllegalArgumentException("Character.charCount of "
+                                                       + charCount + " not supported.");
+                }
+                strings.add(sb.toString());
+            }
+        }
+        return strings;
+    }
+
+
+    /**
+     * Test that the UTF-8 length calculated for various Unicode characters, including
+     * those held as UTF-16 "surrogate pairs" in a Java String, matches the JDK's encoder
+     */
+    @Test
+    public void calculateUTF8Length()
+    {
+        for (final String input : generateTestData())
+        {
+            assertEquals("Incorrect string length calculated for string '"+input+"'",input.getBytes(StandardCharsets.UTF_8).length, StringType.calculateUTF8Length(input));
+        }
+    }
+
+    /**
+     * Test the encoding and decoding of various Unicode characters
+     */
+    @Test
+    public void encodeDecodeStrings()
+    {
+        final DecoderImpl decoder = new DecoderImpl();
+        final EncoderImpl encoder = new EncoderImpl(decoder);
+        AMQPDefinedTypes.registerAllTypes(decoder, encoder);
+        final ByteBuffer bb = ByteBuffer.allocate(16);
+
+        for (final String input : generateTestData())
+        {
+            bb.clear();
+            final AmqpValue inputValue = new AmqpValue(input);
+            encoder.setByteBuffer(bb);
+            encoder.writeObject(inputValue);
+            bb.clear();
+            decoder.setByteBuffer(bb);
+            final AmqpValue outputValue = (AmqpValue) decoder.readObject();
+            assertEquals("Failed to round trip String correctly: ", input, outputValue.getValue());
+        }
+    }
+
+    // build up some test data with a set of suitable Unicode characters
+    private Set<String> generateTestData()
+    {
+        return new HashSet<String>()
+            {
+                private static final long serialVersionUID = 7331717267070233454L;
+
+                {
+                    // non-surrogate pair blocks
+                    addAll(getAllStringsFromUnicodeBlock(UnicodeBlock.BASIC_LATIN));
+                    addAll(getAllStringsFromUnicodeBlock(UnicodeBlock.LATIN_1_SUPPLEMENT));
+                    addAll(getAllStringsFromUnicodeBlock(UnicodeBlock.GREEK));
+                    addAll(getAllStringsFromUnicodeBlock(UnicodeBlock.LETTERLIKE_SYMBOLS));
+                    // blocks with surrogate pairs
+                    addAll(getAllStringsFromUnicodeBlock(UnicodeBlock.MISCELLANEOUS_SYMBOLS_AND_PICTOGRAPHS));
+                    addAll(getAllStringsFromUnicodeBlock(UnicodeBlock.MUSICAL_SYMBOLS));
+                    addAll(getAllStringsFromUnicodeBlock(UnicodeBlock.EMOTICONS));
+                    addAll(getAllStringsFromUnicodeBlock(UnicodeBlock.PLAYING_CARDS));
+                    addAll(getAllStringsFromUnicodeBlock(UnicodeBlock.SUPPLEMENTARY_PRIVATE_USE_AREA_A));
+                    addAll(getAllStringsFromUnicodeBlock(UnicodeBlock.SUPPLEMENTARY_PRIVATE_USE_AREA_B));
+                }
+            };
+    }
+}



---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@qpid.apache.org
For additional commands, e-mail: commits-help@qpid.apache.org