You are viewing a plain text version of this content. The canonical link for it is here.
Posted to notifications@asterixdb.apache.org by AsterixDB Code Review <do...@asterix-gerrit.ics.uci.edu> on 2022/09/16 20:01:05 UTC
Change in asterixdb[master]: [ASTERIXDB-2129][RT] Fix normalizing 3-byte chars
From Wail Alkowaileet <wa...@gmail.com>:
Wail Alkowaileet has uploaded this change for review. ( https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/17230 )
Change subject: [ASTERIXDB-2129][RT] Fix normalizing 3-byte chars
......................................................................
[ASTERIXDB-2129][RT] Fix normalizing 3-byte chars
- user model changes: no
- storage format changes: no
- interface changes: no
Details:
Normalizing a single-character string whose character is a 3-byte
char can read past the string's buffer boundary
Change-Id: Ic169d5ff20f9bf5ce2ca36bab4ebd241bbc50dca
---
M hyracks-fullstack/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/string/UTF8StringSample.java
M hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java
M hyracks-fullstack/hyracks/hyracks-dataflow-common/src/main/java/org/apache/hyracks/dataflow/common/data/normalizers/UTF8StringNormalizedKeyComputerFactory.java
M hyracks-fullstack/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/string/UTF8StringUtilTest.java
4 files changed, 39 insertions(+), 23 deletions(-)
git pull ssh://asterix-gerrit.ics.uci.edu:29418/asterixdb refs/changes/30/17230/1
diff --git a/hyracks-fullstack/hyracks/hyracks-dataflow-common/src/main/java/org/apache/hyracks/dataflow/common/data/normalizers/UTF8StringNormalizedKeyComputerFactory.java b/hyracks-fullstack/hyracks/hyracks-dataflow-common/src/main/java/org/apache/hyracks/dataflow/common/data/normalizers/UTF8StringNormalizedKeyComputerFactory.java
index fefcebd..c48ad7f 100644
--- a/hyracks-fullstack/hyracks/hyracks-dataflow-common/src/main/java/org/apache/hyracks/dataflow/common/data/normalizers/UTF8StringNormalizedKeyComputerFactory.java
+++ b/hyracks-fullstack/hyracks/hyracks-dataflow-common/src/main/java/org/apache/hyracks/dataflow/common/data/normalizers/UTF8StringNormalizedKeyComputerFactory.java
@@ -46,7 +46,7 @@
return new INormalizedKeyComputer() {
@Override
public void normalize(byte[] bytes, int start, int length, int[] normalizedKeys, int keyStart) {
- normalizedKeys[keyStart] = UTF8StringUtil.normalize(bytes, start);
+ normalizedKeys[keyStart] = UTF8StringUtil.normalize(bytes, start, length);
}
@Override
diff --git a/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java b/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java
index 3eb8687..b461fc4 100644
--- a/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java
+++ b/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java
@@ -243,13 +243,13 @@
* the actual bytes only if the normalized key is equal. Thus this normalized key must be
* consistent with the comparison result.
*/
- public static int normalize(byte[] bytes, int start) {
+ public static int normalize(byte[] bytes, int start, int length) {
int len = getUTFLength(bytes, start);
long nk = 0;
int offset = start + getNumBytesToStoreLength(len);
for (int i = 0; i < 2; ++i) {
nk <<= 16;
- if (i < len) {
+ if (offset < length) {
nk += (charAt(bytes, offset)) & 0xffff;
offset += charSize(bytes, offset);
}
@@ -498,19 +498,15 @@
* are exactly the same as for the <code>readUTF</code>
* method of <code>DataInput</code>.
*
- * @param in
- * a data input stream.
+ * @param in a data input stream.
* @return a Unicode string.
- * @throws EOFException
- * if the input stream reaches the end
- * before all the bytes.
- * @throws IOException
- * the stream has been closed and the contained
- * input stream does not support reading after close, or
- * another I/O error occurs.
- * @throws UTFDataFormatException
- * if the bytes do not represent a
- * valid modified UTF-8 encoding of a Unicode string.
+ * @throws EOFException if the input stream reaches the end
+ * before all the bytes.
+ * @throws IOException the stream has been closed and the contained
+ * input stream does not support reading after close, or
+ * another I/O error occurs.
+ * @throws UTFDataFormatException if the bytes do not represent a
+ * valid modified UTF-8 encoding of a Unicode string.
* @see java.io.DataInputStream#readUnsignedShort()
*/
public static String readUTF8(DataInput in) throws IOException {
@@ -602,10 +598,8 @@
/**
* Write a UTF8 String <code>str</code> into the DataOutput <code>out</code>
*
- * @param str,
- * a Unicode string;
- * @param out,
- * a Data output stream.
+ * @param str, a Unicode string;
+ * @param out, a Data output stream.
* @throws IOException
*/
public static void writeUTF8(CharSequence str, DataOutput out) throws IOException {
diff --git a/hyracks-fullstack/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/string/UTF8StringSample.java b/hyracks-fullstack/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/string/UTF8StringSample.java
index b114351..123d497 100644
--- a/hyracks-fullstack/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/string/UTF8StringSample.java
+++ b/hyracks-fullstack/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/string/UTF8StringSample.java
@@ -35,7 +35,8 @@
public static final String STRING_LEN_3 = "xyz";
public static final String STRING_UTF8_3 = "锟斤拷";
- public static final String STRING_UTF8_MIX = "\uD841\uDF0E\uD841\uDF31锟X斤Y拷Zà"; // one, two, three, and four bytes
+ public static final String STRING_UTF8_MIX = "\uD841\uDF0E\uD841\uDF31锟X斤Y拷Zà";
+ // one, two, three, and four bytes
public static final String STRING_UTF8_MIX_LOWERCASE = "\uD841\uDF0E\uD841\uDF31锟x斤y拷zà";
public static final String STRING_NEEDS_2_JAVA_CHARS_1 = "\uD83D\uDE22\uD83D\uDE22\uD83D\uDC89\uD83D\uDC89";
public static final String STRING_NEEDS_2_JAVA_CHARS_2 = "😢😢💉💉";
@@ -44,6 +45,8 @@
public static final String STRING_EMOJI_FAMILY_OF_2 = "\uD83D\uDC68\u200D\uD83D\uDC66";
public static final String EMOJI_BASKETBALL = "\uD83C\uDFC0";
+ public static final String THREE_BYTES_UTF8_CHAR = "ह";
+
public static final String STRING_LEN_127 = generateStringRepeatBy(ONE_ASCII_CHAR, 127);
public static final String STRING_LEN_128 = generateStringRepeatBy(ONE_ASCII_CHAR, 128);
diff --git a/hyracks-fullstack/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/string/UTF8StringUtilTest.java b/hyracks-fullstack/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/string/UTF8StringUtilTest.java
index c7468d2..bc6bb3f 100644
--- a/hyracks-fullstack/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/string/UTF8StringUtilTest.java
+++ b/hyracks-fullstack/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/string/UTF8StringUtilTest.java
@@ -25,6 +25,7 @@
import static org.apache.hyracks.util.string.UTF8StringSample.STRING_UTF8_3;
import static org.apache.hyracks.util.string.UTF8StringSample.STRING_UTF8_MIX;
import static org.apache.hyracks.util.string.UTF8StringSample.STRING_UTF8_MIX_LOWERCASE;
+import static org.apache.hyracks.util.string.UTF8StringSample.THREE_BYTES_UTF8_CHAR;
import static org.apache.hyracks.util.string.UTF8StringUtil.charAt;
import static org.apache.hyracks.util.string.UTF8StringUtil.charSize;
import static org.apache.hyracks.util.string.UTF8StringUtil.compareTo;
@@ -77,10 +78,11 @@
}
@Test
- public void testCompareToAndNormolize() throws Exception {
+ public void testCompareToAndNormalize() throws Exception {
testCompare(STRING_UTF8_MIX, STRING_UTF8_MIX, OPTION.STANDARD);
testCompare(STRING_UTF8_3, STRING_UTF8_MIX, OPTION.STANDARD);
testCompare(STRING_LEN_MEDIUM, STRING_UTF8_MIX, OPTION.STANDARD);
+ testCompare(THREE_BYTES_UTF8_CHAR, THREE_BYTES_UTF8_CHAR, OPTION.STANDARD);
}
public boolean isSameSign(int r1, int r2) {
@@ -106,8 +108,8 @@
switch (option) {
case STANDARD:
assertEquals(str1.compareTo(str2), compareTo(buffer1, 0, buffer2, 0));
- int n1 = normalize(buffer1, 0);
- int n2 = normalize(buffer2, 0);
+ int n1 = normalize(buffer1, 0, buffer1.length);
+ int n2 = normalize(buffer2, 0, buffer2.length);
assertTrue(isSameSign(str1.compareTo(str2), n1 - n2));
break;
case RAW_BYTE:
--
To view, visit https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/17230
To unsubscribe, or for help writing mail filters, visit https://asterix-gerrit.ics.uci.edu/settings
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Change-Id: Ic169d5ff20f9bf5ce2ca36bab4ebd241bbc50dca
Gerrit-Change-Number: 17230
Gerrit-PatchSet: 1
Gerrit-Owner: Wail Alkowaileet <wa...@gmail.com>
Gerrit-MessageType: newchange