You are viewing a plain text version of this content. The canonical link for it is here.
Posted to notifications@asterixdb.apache.org by AsterixDB Code Review <do...@asterix-gerrit.ics.uci.edu> on 2022/09/16 20:01:05 UTC

Change in asterixdb[master]: [ASTERIXDB-2129][RT] Fix normalizing 3-byte chars

From Wail Alkowaileet <wa...@gmail.com>:

Wail Alkowaileet has uploaded this change for review. ( https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/17230 )


Change subject: [ASTERIXDB-2129][RT] Fix normalizing 3-byte chars
......................................................................

[ASTERIXDB-2129][RT] Fix normalizing 3-byte chars

- user model changes: no
- storage format changes: no
- interface changes: no

Details:
Single char strings with a 3-byte char can go out of the
string's buffer boundry

Change-Id: Ic169d5ff20f9bf5ce2ca36bab4ebd241bbc50dca
---
M hyracks-fullstack/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/string/UTF8StringSample.java
M hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java
M hyracks-fullstack/hyracks/hyracks-dataflow-common/src/main/java/org/apache/hyracks/dataflow/common/data/normalizers/UTF8StringNormalizedKeyComputerFactory.java
M hyracks-fullstack/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/string/UTF8StringUtilTest.java
4 files changed, 39 insertions(+), 23 deletions(-)



  git pull ssh://asterix-gerrit.ics.uci.edu:29418/asterixdb refs/changes/30/17230/1

diff --git a/hyracks-fullstack/hyracks/hyracks-dataflow-common/src/main/java/org/apache/hyracks/dataflow/common/data/normalizers/UTF8StringNormalizedKeyComputerFactory.java b/hyracks-fullstack/hyracks/hyracks-dataflow-common/src/main/java/org/apache/hyracks/dataflow/common/data/normalizers/UTF8StringNormalizedKeyComputerFactory.java
index fefcebd..c48ad7f 100644
--- a/hyracks-fullstack/hyracks/hyracks-dataflow-common/src/main/java/org/apache/hyracks/dataflow/common/data/normalizers/UTF8StringNormalizedKeyComputerFactory.java
+++ b/hyracks-fullstack/hyracks/hyracks-dataflow-common/src/main/java/org/apache/hyracks/dataflow/common/data/normalizers/UTF8StringNormalizedKeyComputerFactory.java
@@ -46,7 +46,7 @@
         return new INormalizedKeyComputer() {
             @Override
             public void normalize(byte[] bytes, int start, int length, int[] normalizedKeys, int keyStart) {
-                normalizedKeys[keyStart] = UTF8StringUtil.normalize(bytes, start);
+                normalizedKeys[keyStart] = UTF8StringUtil.normalize(bytes, start, length);
             }
 
             @Override
diff --git a/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java b/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java
index 3eb8687..b461fc4 100644
--- a/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java
+++ b/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java
@@ -243,13 +243,13 @@
      * the actual bytes only if the normalized key is equal. Thus this normalized key must be
      * consistent with the comparison result.
      */
-    public static int normalize(byte[] bytes, int start) {
+    public static int normalize(byte[] bytes, int start, int length) {
         int len = getUTFLength(bytes, start);
         long nk = 0;
         int offset = start + getNumBytesToStoreLength(len);
         for (int i = 0; i < 2; ++i) {
             nk <<= 16;
-            if (i < len) {
+            if (offset < length) {
                 nk += (charAt(bytes, offset)) & 0xffff;
                 offset += charSize(bytes, offset);
             }
@@ -498,19 +498,15 @@
      * are exactly the same as for the <code>readUTF</code>
      * method of <code>DataInput</code>.
      *
-     * @param in
-     *            a data input stream.
+     * @param in a data input stream.
      * @return a Unicode string.
-     * @throws EOFException
-     *             if the input stream reaches the end
-     *             before all the bytes.
-     * @throws IOException
-     *             the stream has been closed and the contained
-     *             input stream does not support reading after close, or
-     *             another I/O error occurs.
-     * @throws UTFDataFormatException
-     *             if the bytes do not represent a
-     *             valid modified UTF-8 encoding of a Unicode string.
+     * @throws EOFException           if the input stream reaches the end
+     *                                before all the bytes.
+     * @throws IOException            the stream has been closed and the contained
+     *                                input stream does not support reading after close, or
+     *                                another I/O error occurs.
+     * @throws UTFDataFormatException if the bytes do not represent a
+     *                                valid modified UTF-8 encoding of a Unicode string.
      * @see java.io.DataInputStream#readUnsignedShort()
      */
     public static String readUTF8(DataInput in) throws IOException {
@@ -602,10 +598,8 @@
     /**
      * Write a UTF8 String <code>str</code> into the DataOutput <code>out</code>
      *
-     * @param str,
-     *            a Unicode string;
-     * @param out,
-     *            a Data output stream.
+     * @param str, a Unicode string;
+     * @param out, a Data output stream.
      * @throws IOException
      */
     public static void writeUTF8(CharSequence str, DataOutput out) throws IOException {
diff --git a/hyracks-fullstack/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/string/UTF8StringSample.java b/hyracks-fullstack/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/string/UTF8StringSample.java
index b114351..123d497 100644
--- a/hyracks-fullstack/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/string/UTF8StringSample.java
+++ b/hyracks-fullstack/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/string/UTF8StringSample.java
@@ -35,7 +35,8 @@
 
     public static final String STRING_LEN_3 = "xyz";
     public static final String STRING_UTF8_3 = "锟斤拷";
-    public static final String STRING_UTF8_MIX = "\uD841\uDF0E\uD841\uDF31锟X斤Y拷Zà"; // one, two, three, and four bytes
+    public static final String STRING_UTF8_MIX = "\uD841\uDF0E\uD841\uDF31锟X斤Y拷Zà";
+    // one, two, three, and four bytes
     public static final String STRING_UTF8_MIX_LOWERCASE = "\uD841\uDF0E\uD841\uDF31锟x斤y拷zà";
     public static final String STRING_NEEDS_2_JAVA_CHARS_1 = "\uD83D\uDE22\uD83D\uDE22\uD83D\uDC89\uD83D\uDC89";
     public static final String STRING_NEEDS_2_JAVA_CHARS_2 = "😢😢💉💉";
@@ -44,6 +45,8 @@
     public static final String STRING_EMOJI_FAMILY_OF_2 = "\uD83D\uDC68\u200D\uD83D\uDC66";
     public static final String EMOJI_BASKETBALL = "\uD83C\uDFC0";
 
+    public static final String THREE_BYTES_UTF8_CHAR = "ह";
+
     public static final String STRING_LEN_127 = generateStringRepeatBy(ONE_ASCII_CHAR, 127);
     public static final String STRING_LEN_128 = generateStringRepeatBy(ONE_ASCII_CHAR, 128);
 
diff --git a/hyracks-fullstack/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/string/UTF8StringUtilTest.java b/hyracks-fullstack/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/string/UTF8StringUtilTest.java
index c7468d2..bc6bb3f 100644
--- a/hyracks-fullstack/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/string/UTF8StringUtilTest.java
+++ b/hyracks-fullstack/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/string/UTF8StringUtilTest.java
@@ -25,6 +25,7 @@
 import static org.apache.hyracks.util.string.UTF8StringSample.STRING_UTF8_3;
 import static org.apache.hyracks.util.string.UTF8StringSample.STRING_UTF8_MIX;
 import static org.apache.hyracks.util.string.UTF8StringSample.STRING_UTF8_MIX_LOWERCASE;
+import static org.apache.hyracks.util.string.UTF8StringSample.THREE_BYTES_UTF8_CHAR;
 import static org.apache.hyracks.util.string.UTF8StringUtil.charAt;
 import static org.apache.hyracks.util.string.UTF8StringUtil.charSize;
 import static org.apache.hyracks.util.string.UTF8StringUtil.compareTo;
@@ -77,10 +78,11 @@
     }
 
     @Test
-    public void testCompareToAndNormolize() throws Exception {
+    public void testCompareToAndNormalize() throws Exception {
         testCompare(STRING_UTF8_MIX, STRING_UTF8_MIX, OPTION.STANDARD);
         testCompare(STRING_UTF8_3, STRING_UTF8_MIX, OPTION.STANDARD);
         testCompare(STRING_LEN_MEDIUM, STRING_UTF8_MIX, OPTION.STANDARD);
+        testCompare(THREE_BYTES_UTF8_CHAR, THREE_BYTES_UTF8_CHAR, OPTION.STANDARD);
     }
 
     public boolean isSameSign(int r1, int r2) {
@@ -106,8 +108,8 @@
         switch (option) {
             case STANDARD:
                 assertEquals(str1.compareTo(str2), compareTo(buffer1, 0, buffer2, 0));
-                int n1 = normalize(buffer1, 0);
-                int n2 = normalize(buffer2, 0);
+                int n1 = normalize(buffer1, 0, buffer1.length);
+                int n2 = normalize(buffer2, 0, buffer2.length);
                 assertTrue(isSameSign(str1.compareTo(str2), n1 - n2));
                 break;
             case RAW_BYTE:

-- 
To view, visit https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/17230
To unsubscribe, or for help writing mail filters, visit https://asterix-gerrit.ics.uci.edu/settings

Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Change-Id: Ic169d5ff20f9bf5ce2ca36bab4ebd241bbc50dca
Gerrit-Change-Number: 17230
Gerrit-PatchSet: 1
Gerrit-Owner: Wail Alkowaileet <wa...@gmail.com>
Gerrit-MessageType: newchange