You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@asterixdb.apache.org by wy...@apache.org on 2022/09/20 04:31:06 UTC

[asterixdb] branch master updated: [ASTERIXDB-2129][RT] Fix normalizing non-ascii strings

This is an automated email from the ASF dual-hosted git repository.

wyk pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/asterixdb.git


The following commit(s) were added to refs/heads/master by this push:
     new c60b7f4f01 [ASTERIXDB-2129][RT] Fix normalizing non-ascii strings
c60b7f4f01 is described below

commit c60b7f4f011548189dc78372ac109e228d849685
Author: Wail Alkowaileet <wa...@gmail.com>
AuthorDate: Mon Sep 19 16:13:51 2022 -0700

    [ASTERIXDB-2129][RT] Fix normalizing non-ascii strings
    
    - user model changes: no
    - storage format changes: no
    - interface changes: no
    
    Details:
    For example, single char strings with a 3-byte char can go out of the
    string's buffer boundry
    
    Change-Id: Ic169d5ff20f9bf5ce2ca36bab4ebd241bbc50dca
    Reviewed-on: https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/17230
    Tested-by: Jenkins <je...@fulliautomatix.ics.uci.edu>
    Reviewed-by: Ali Alsuliman <al...@gmail.com>
---
 .../apache/hyracks/util/string/UTF8StringUtil.java | 31 +++++++++-------------
 .../hyracks/util/string/UTF8StringSample.java      |  5 +++-
 .../hyracks/util/string/UTF8StringUtilTest.java    |  9 ++++---
 3 files changed, 22 insertions(+), 23 deletions(-)

diff --git a/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java b/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java
index 3eb868712b..c0475b1afc 100644
--- a/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java
+++ b/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java
@@ -244,12 +244,13 @@ public class UTF8StringUtil {
      * consistent with the comparison result.
      */
     public static int normalize(byte[] bytes, int start) {
-        int len = getUTFLength(bytes, start);
         long nk = 0;
+        int len = getUTFLength(bytes, start);
         int offset = start + getNumBytesToStoreLength(len);
+        int end = offset + len;
         for (int i = 0; i < 2; ++i) {
             nk <<= 16;
-            if (i < len) {
+            if (offset < end) {
                 nk += (charAt(bytes, offset)) & 0xffff;
                 offset += charSize(bytes, offset);
             }
@@ -498,19 +499,15 @@ public class UTF8StringUtil {
      * are exactly the same as for the <code>readUTF</code>
      * method of <code>DataInput</code>.
      *
-     * @param in
-     *            a data input stream.
+     * @param in a data input stream.
      * @return a Unicode string.
-     * @throws EOFException
-     *             if the input stream reaches the end
-     *             before all the bytes.
-     * @throws IOException
-     *             the stream has been closed and the contained
-     *             input stream does not support reading after close, or
-     *             another I/O error occurs.
-     * @throws UTFDataFormatException
-     *             if the bytes do not represent a
-     *             valid modified UTF-8 encoding of a Unicode string.
+     * @throws EOFException           if the input stream reaches the end
+     *                                before all the bytes.
+     * @throws IOException            the stream has been closed and the contained
+     *                                input stream does not support reading after close, or
+     *                                another I/O error occurs.
+     * @throws UTFDataFormatException if the bytes do not represent a
+     *                                valid modified UTF-8 encoding of a Unicode string.
      * @see java.io.DataInputStream#readUnsignedShort()
      */
     public static String readUTF8(DataInput in) throws IOException {
@@ -602,10 +599,8 @@ public class UTF8StringUtil {
     /**
      * Write a UTF8 String <code>str</code> into the DataOutput <code>out</code>
      *
-     * @param str,
-     *            a Unicode string;
-     * @param out,
-     *            a Data output stream.
+     * @param str, a Unicode string;
+     * @param out, a Data output stream.
      * @throws IOException
      */
     public static void writeUTF8(CharSequence str, DataOutput out) throws IOException {
diff --git a/hyracks-fullstack/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/string/UTF8StringSample.java b/hyracks-fullstack/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/string/UTF8StringSample.java
index b114351a60..eb3a5b6a39 100644
--- a/hyracks-fullstack/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/string/UTF8StringSample.java
+++ b/hyracks-fullstack/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/string/UTF8StringSample.java
@@ -35,7 +35,8 @@ public class UTF8StringSample {
 
     public static final String STRING_LEN_3 = "xyz";
     public static final String STRING_UTF8_3 = "锟斤拷";
-    public static final String STRING_UTF8_MIX = "\uD841\uDF0E\uD841\uDF31锟X斤Y拷Zà"; // one, two, three, and four bytes
+    // one, two, three, and four bytes
+    public static final String STRING_UTF8_MIX = "\uD841\uDF0E\uD841\uDF31锟X斤Y拷Zà";
     public static final String STRING_UTF8_MIX_LOWERCASE = "\uD841\uDF0E\uD841\uDF31锟x斤y拷zà";
     public static final String STRING_NEEDS_2_JAVA_CHARS_1 = "\uD83D\uDE22\uD83D\uDE22\uD83D\uDC89\uD83D\uDC89";
     public static final String STRING_NEEDS_2_JAVA_CHARS_2 = "😢😢💉💉";
@@ -44,6 +45,8 @@ public class UTF8StringSample {
     public static final String STRING_EMOJI_FAMILY_OF_2 = "\uD83D\uDC68\u200D\uD83D\uDC66";
     public static final String EMOJI_BASKETBALL = "\uD83C\uDFC0";
 
+    public static final String THREE_BYTES_UTF8_CHAR = "ह";
+
     public static final String STRING_LEN_127 = generateStringRepeatBy(ONE_ASCII_CHAR, 127);
     public static final String STRING_LEN_128 = generateStringRepeatBy(ONE_ASCII_CHAR, 128);
 
diff --git a/hyracks-fullstack/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/string/UTF8StringUtilTest.java b/hyracks-fullstack/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/string/UTF8StringUtilTest.java
index c7468d2c44..4eb1fc3f16 100644
--- a/hyracks-fullstack/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/string/UTF8StringUtilTest.java
+++ b/hyracks-fullstack/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/string/UTF8StringUtilTest.java
@@ -25,6 +25,7 @@ import static org.apache.hyracks.util.string.UTF8StringSample.STRING_LEN_MEDIUM;
 import static org.apache.hyracks.util.string.UTF8StringSample.STRING_UTF8_3;
 import static org.apache.hyracks.util.string.UTF8StringSample.STRING_UTF8_MIX;
 import static org.apache.hyracks.util.string.UTF8StringSample.STRING_UTF8_MIX_LOWERCASE;
+import static org.apache.hyracks.util.string.UTF8StringSample.THREE_BYTES_UTF8_CHAR;
 import static org.apache.hyracks.util.string.UTF8StringUtil.charAt;
 import static org.apache.hyracks.util.string.UTF8StringUtil.charSize;
 import static org.apache.hyracks.util.string.UTF8StringUtil.compareTo;
@@ -77,13 +78,14 @@ public class UTF8StringUtilTest {
     }
 
     @Test
-    public void testCompareToAndNormolize() throws Exception {
+    public void testCompareToAndNormalize() throws Exception {
         testCompare(STRING_UTF8_MIX, STRING_UTF8_MIX, OPTION.STANDARD);
         testCompare(STRING_UTF8_3, STRING_UTF8_MIX, OPTION.STANDARD);
         testCompare(STRING_LEN_MEDIUM, STRING_UTF8_MIX, OPTION.STANDARD);
+        testCompare(THREE_BYTES_UTF8_CHAR, THREE_BYTES_UTF8_CHAR, OPTION.STANDARD);
     }
 
-    public boolean isSameSign(int r1, int r2) {
+    private static boolean isSameSign(int r1, int r2) {
         if (r1 > 0) {
             return r2 > 0;
         }
@@ -99,7 +101,7 @@ public class UTF8StringUtilTest {
         LOWERCASE
     }
 
-    public void testCompare(String str1, String str2, OPTION option) throws IOException {
+    private static void testCompare(String str1, String str2, OPTION option) {
         byte[] buffer1 = writeStringToBytes(str1);
         byte[] buffer2 = writeStringToBytes(str2);
 
@@ -117,7 +119,6 @@ public class UTF8StringUtilTest {
                 assertEquals(str1.compareToIgnoreCase(str2), lowerCaseCompareTo(buffer1, 0, buffer2, 0));
                 break;
         }
-
     }
 
     @Test