You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@qpid.apache.org by ro...@apache.org on 2014/09/23 15:31:23 UTC
svn commit: r1627019 - in /qpid/proton/trunk/proton-j/src:
main/java/org/apache/qpid/proton/codec/EncoderImpl.java
main/java/org/apache/qpid/proton/codec/StringType.java
test/java/org/apache/qpid/proton/codec/StringTypeTest.java
Author: robbie
Date: Tue Sep 23 13:31:23 2014
New Revision: 1627019
URL: http://svn.apache.org/r1627019
Log:
PROTON-576: update String UTF-8 encoding to handle high range unicode characters / surrogate pairs
Applied patch from Dominic Evans with modifications by Rob Godfrey
Added:
qpid/proton/trunk/proton-j/src/test/java/org/apache/qpid/proton/codec/StringTypeTest.java
Modified:
qpid/proton/trunk/proton-j/src/main/java/org/apache/qpid/proton/codec/EncoderImpl.java
qpid/proton/trunk/proton-j/src/main/java/org/apache/qpid/proton/codec/StringType.java
Modified: qpid/proton/trunk/proton-j/src/main/java/org/apache/qpid/proton/codec/EncoderImpl.java
URL: http://svn.apache.org/viewvc/qpid/proton/trunk/proton-j/src/main/java/org/apache/qpid/proton/codec/EncoderImpl.java?rev=1627019&r1=1627018&r2=1627019&view=diff
==============================================================================
--- qpid/proton/trunk/proton-j/src/main/java/org/apache/qpid/proton/codec/EncoderImpl.java (original)
+++ qpid/proton/trunk/proton-j/src/main/java/org/apache/qpid/proton/codec/EncoderImpl.java Tue Sep 23 13:31:23 2014
@@ -21,7 +21,11 @@
package org.apache.qpid.proton.codec;
import java.nio.ByteBuffer;
-import java.util.*;
+import java.util.Date;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.UUID;
import org.apache.qpid.proton.amqp.Binary;
import org.apache.qpid.proton.amqp.Decimal128;
@@ -770,31 +774,60 @@ public final class EncoderImpl implement
void writeRaw(String string)
{
final int length = string.length();
- char c;
+ int c;
for (int i = 0; i < length; i++)
{
c = string.charAt(i);
- if ((c >= 0x0001) && (c <= 0x007F))
+ if ((c & 0xFF80) == 0) /* U+0000..U+007F */
{
_buffer.put((byte) c);
-
}
- else if (c > 0x07FF)
+ else if ((c & 0xF800) == 0) /* U+0080..U+07FF */
{
- _buffer.put((byte) (0xE0 | ((c >> 12) & 0x0F)));
- _buffer.put((byte) (0x80 | ((c >> 6) & 0x3F)));
- _buffer.put((byte) (0x80 | (c & 0x3F)));
+ _buffer.put((byte)(0xC0 | ((c >> 6) & 0x1F)));
+ _buffer.put((byte)(0x80 | (c & 0x3F)));
}
- else
+ else if ((c & 0xD800) != 0xD800) /* U+0800..U+FFFF - excluding surrogate pairs */
{
- _buffer.put((byte) (0xC0 | ((c >> 6) & 0x1F)));
- _buffer.put((byte) (0x80 | (c & 0x3F)));
+ _buffer.put((byte)(0xE0 | ((c >> 12) & 0x0F)));
+ _buffer.put((byte)(0x80 | ((c >> 6) & 0x3F)));
+ _buffer.put((byte)(0x80 | (c & 0x3F)));
}
- }
+ else
+ {
+ int low;
- }
+ if(((c & 0xDC00) == 0xDC00) || (++i == length) || ((low = string.charAt(i)) & 0xDC00) != 0xDC00)
+ {
+ throw new IllegalArgumentException("String contains invalid Unicode code points");
+ }
+ c = 0x010000 + ((c & 0x03FF) << 10) + (low & 0x03FF);
+ if (c <= 0x3FFFF) /* U+10000..U+3FFFF */
+ {
+ _buffer.put((byte) 0xF0);
+ _buffer.put((byte)(0x90 | ((c >> 12) & 0x2F)));
+ _buffer.put((byte)(0x80 | ((c >> 6) & 0x3F)));
+ _buffer.put((byte)(0x80 | (c & 0x3F)));
+ }
+ else if (c <= 0xFFFFF) /* U+40000..U+FFFFF */
+ {
+ _buffer.put((byte)(0xF0 | ((c >> 18) & 0x03)));
+ _buffer.put((byte)(0x80 | ((c >> 12) & 0x3F)));
+ _buffer.put((byte)(0x80 | ((c >> 6) & 0x3F)));
+ _buffer.put((byte)(0x80 | (c & 0x3F)));
+ }
+ else /* U+100000..U+10FFFF */
+ {
+ _buffer.put((byte)(0xF4));
+ _buffer.put((byte)(0x80 | ((c >> 12) & 0x3F)));
+ _buffer.put((byte)(0x80 | ((c >> 6) & 0x3F)));
+ _buffer.put((byte)(0x80 | (c & 0x3F)));
+ }
+ }
+ }
+ }
}
Modified: qpid/proton/trunk/proton-j/src/main/java/org/apache/qpid/proton/codec/StringType.java
URL: http://svn.apache.org/viewvc/qpid/proton/trunk/proton-j/src/main/java/org/apache/qpid/proton/codec/StringType.java?rev=1627019&r1=1627018&r2=1627019&view=diff
==============================================================================
--- qpid/proton/trunk/proton-j/src/main/java/org/apache/qpid/proton/codec/StringType.java (original)
+++ qpid/proton/trunk/proton-j/src/main/java/org/apache/qpid/proton/codec/StringType.java Tue Sep 23 13:31:23 2014
@@ -83,29 +83,22 @@ public class StringType extends Abstract
return encoding;
}
- private static int calculateUTF8Length(final String s)
+ static int calculateUTF8Length(final String s)
{
int len = s.length();
- int i = 0;
- final int length = s.length();
- while(i < length)
+ final int length = len;
+ for (int i = 0; i < length; i++)
{
- char c = s.charAt(i);
- if(c > 127)
+ int c = s.charAt(i);
+ if ((c & 0xFF80) != 0) /* U+0080.. */
{
len++;
- if(c > 0x07ff)
+ // each half of a surrogate pair is counted as 2 octets here, so a valid pair totals the 4 octets of its combined code point
+ if(((c & 0xF800) != 0) && ((c & 0xD800) != 0xD800)) /* U+0800.. excluding surrogate pairs */
{
len++;
- if(c >= 0xD800 && c <= 0xDBFF)
- {
- i++;
- len++;
- }
}
}
- i++;
-
}
return len;
}
Added: qpid/proton/trunk/proton-j/src/test/java/org/apache/qpid/proton/codec/StringTypeTest.java
URL: http://svn.apache.org/viewvc/qpid/proton/trunk/proton-j/src/test/java/org/apache/qpid/proton/codec/StringTypeTest.java?rev=1627019&view=auto
==============================================================================
--- qpid/proton/trunk/proton-j/src/test/java/org/apache/qpid/proton/codec/StringTypeTest.java (added)
+++ qpid/proton/trunk/proton-j/src/test/java/org/apache/qpid/proton/codec/StringTypeTest.java Tue Sep 23 13:31:23 2014
@@ -0,0 +1,140 @@
+/*
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ */
+package org.apache.qpid.proton.codec;
+
+import static org.junit.Assert.assertEquals;
+
+import java.lang.Character.UnicodeBlock;
+import java.nio.ByteBuffer;
+import java.nio.charset.StandardCharsets;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.junit.Test;
+
+import org.apache.qpid.proton.amqp.messaging.AmqpValue;
+
+/**
+ * Test the encoding and decoding of {@link StringType} values.
+ */
+public class StringTypeTest
+{
+ /**
+ * Loop over all the code points in a given {@link UnicodeBlock} and return a
+ * {@link Set} of {@link String}s, one for each code point in the block
+ * (supplementary code points are represented as a surrogate pair).
+ *
+ * @param block the {@link UnicodeBlock} to loop over
+ * @return a {@link Set} containing every code point in the block as a
+ * {@link String} value
+ */
+ private static Set<String> getAllStringsFromUnicodeBlock(final UnicodeBlock block)
+ {
+ final Set<String> strings = new HashSet<String>();
+ for (int codePoint = 0; codePoint <= Character.MAX_CODE_POINT; codePoint++)
+ {
+ if (UnicodeBlock.of(codePoint) == block)
+ {
+ final int charCount = Character.charCount(codePoint);
+ final StringBuilder sb = new StringBuilder(
+ charCount);
+ if (charCount == 1)
+ {
+ sb.append(String.valueOf((char) codePoint));
+ }
+ else if (charCount == 2)
+ {
+ sb.append(Character.highSurrogate(codePoint));
+ sb.append(Character.lowSurrogate(codePoint));
+ }
+ else
+ {
+ throw new IllegalArgumentException("Character.charCount of "
+ + charCount + " not supported.");
+ }
+ strings.add(sb.toString());
+ }
+ }
+ return strings;
+ }
+
+
+ /**
+ * Test that the calculated UTF-8 length matches the JDK's UTF-8 encoding for various
+ * Unicode characters, including those held as "surrogate pairs" in a Java String
+ */
+ @Test
+ public void calculateUTF8Length()
+ {
+ for (final String input : generateTestData())
+ {
+ assertEquals("Incorrect string length calculated for string '"+input+"'",input.getBytes(StandardCharsets.UTF_8).length, StringType.calculateUTF8Length(input));
+ }
+ }
+
+ /**
+ * Test the encoding and decoding of various Unicode characters
+ */
+ @Test
+ public void encodeDecodeStrings()
+ {
+ final DecoderImpl decoder = new DecoderImpl();
+ final EncoderImpl encoder = new EncoderImpl(decoder);
+ AMQPDefinedTypes.registerAllTypes(decoder, encoder);
+ final ByteBuffer bb = ByteBuffer.allocate(16);
+
+ for (final String input : generateTestData())
+ {
+ bb.clear();
+ final AmqpValue inputValue = new AmqpValue(input);
+ encoder.setByteBuffer(bb);
+ encoder.writeObject(inputValue);
+ bb.clear();
+ decoder.setByteBuffer(bb);
+ final AmqpValue outputValue = (AmqpValue) decoder.readObject();
+ assertEquals("Failed to round trip String correctly: ", input, outputValue.getValue());
+ }
+ }
+
+ // build up some test data with a set of suitable Unicode characters
+ private Set<String> generateTestData()
+ {
+ return new HashSet<String>()
+ {
+ private static final long serialVersionUID = 7331717267070233454L;
+
+ {
+ // non-surrogate pair blocks
+ addAll(getAllStringsFromUnicodeBlock(UnicodeBlock.BASIC_LATIN));
+ addAll(getAllStringsFromUnicodeBlock(UnicodeBlock.LATIN_1_SUPPLEMENT));
+ addAll(getAllStringsFromUnicodeBlock(UnicodeBlock.GREEK));
+ addAll(getAllStringsFromUnicodeBlock(UnicodeBlock.LETTERLIKE_SYMBOLS));
+ // blocks with surrogate pairs
+ addAll(getAllStringsFromUnicodeBlock(UnicodeBlock.MISCELLANEOUS_SYMBOLS_AND_PICTOGRAPHS));
+ addAll(getAllStringsFromUnicodeBlock(UnicodeBlock.MUSICAL_SYMBOLS));
+ addAll(getAllStringsFromUnicodeBlock(UnicodeBlock.EMOTICONS));
+ addAll(getAllStringsFromUnicodeBlock(UnicodeBlock.PLAYING_CARDS));
+ addAll(getAllStringsFromUnicodeBlock(UnicodeBlock.SUPPLEMENTARY_PRIVATE_USE_AREA_A));
+ addAll(getAllStringsFromUnicodeBlock(UnicodeBlock.SUPPLEMENTARY_PRIVATE_USE_AREA_B));
+ }
+ };
+ }
+}
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@qpid.apache.org
For additional commands, e-mail: commits-help@qpid.apache.org