You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@asterixdb.apache.org by dl...@apache.org on 2020/07/22 00:05:22 UTC

[asterixdb] branch master updated: [ASTERIXDB-2762] Use code point as the unit in substr()

This is an automated email from the ASF dual-hosted git repository.

dlych pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/asterixdb.git


The following commit(s) were added to refs/heads/master by this push:
     new 0ea86a7  [ASTERIXDB-2762] Use code point as the unit in substr()
0ea86a7 is described below

commit 0ea86a76b4499b27e0bd32f93a3df38f9ade3328
Author: Rui Guo <ru...@uci.edu>
AuthorDate: Tue Jul 21 13:32:29 2020 -0700

    [ASTERIXDB-2762] Use code point as the unit in substr()
    
    This commit aims to use code point as the unit in substr().
    
    Currently, Java char (2 bytes) is used as the unit in substr(),
    however, for non-English characters such as Emoji and Korean,
    one character may have multiple bytes and thus can be splitted into a
    few illegal parts if we use Java char as the unit.
    Instead, code point is a more natural unit to split characters.
    
    Change-Id: I5c38cfd7abcf6f1c1f23a9f74dfd3181531d8c0f
    Reviewed-on: https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/7263
    Integration-Tests: Jenkins <je...@fulliautomatix.ics.uci.edu>
    Tested-by: Jenkins <je...@fulliautomatix.ics.uci.edu>
    Reviewed-by: Dmitry Lychagin <dm...@couchbase.com>
---
 .../substring_multi_codepoint_01.1.query.sqlpp     | 24 ++++++++++++
 .../substring_multi_codepoint_01.1.adm             |  1 +
 .../test/resources/runtimets/testsuite_sqlpp.xml   |  5 +++
 .../src/main/markdown/builtins/2_string_common.md  |  2 +
 .../data/std/primitive/UTF8StringPointable.java    | 44 +++++++++++++---------
 .../hyracks/data/std/util/UTF8StringBuilder.java   |  9 +++++
 .../std/primitive/UTF8StringPointableTest.java     | 22 +++++++++++
 .../apache/hyracks/util/string/UTF8StringUtil.java | 44 ++++++++++++++++++++++
 .../hyracks/util/string/UTF8StringSample.java      |  3 ++
 9 files changed, 137 insertions(+), 17 deletions(-)

diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/string/substring_multi_codepoint_01/substring_multi_codepoint_01.1.query.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/string/substring_multi_codepoint_01/substring_multi_codepoint_01.1.query.sqlpp
new file mode 100644
index 0000000..16e75a4
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/string/substring_multi_codepoint_01/substring_multi_codepoint_01.1.query.sqlpp
@@ -0,0 +1,24 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+{
+  '2 people in a 4-people family': SUBSTR("๐Ÿ‘จโ€๐Ÿ‘จโ€๐Ÿ‘ฆโ€๐Ÿ‘ฆ", 2, 3),
+  'Korean test 0': SUBSTR('แ„’แ…กแ†ซแ„€แ…ณแ†ฏ', 3,3),
+  'Korean test 1': SUBSTR('แ„’แ…กแ†ซแ„€แ…ณแ†ฏ', 2,2)
+};
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/results/string/substring_multi_codepoint_01/substring_multi_codepoint_01.1.adm b/asterixdb/asterix-app/src/test/resources/runtimets/results/string/substring_multi_codepoint_01/substring_multi_codepoint_01.1.adm
new file mode 100644
index 0000000..5157e8a
--- /dev/null
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/results/string/substring_multi_codepoint_01/substring_multi_codepoint_01.1.adm
@@ -0,0 +1 @@
+{ "2 people in a 4-people family": "๐Ÿ‘จโ€๐Ÿ‘ฆ", "Korean test 0": "แ„€แ…ณแ†ฏ", "Korean test 1": "แ†ซแ„€" }
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_sqlpp.xml b/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_sqlpp.xml
index 1e11bb8..8f9e1e8 100644
--- a/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_sqlpp.xml
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_sqlpp.xml
@@ -9908,6 +9908,11 @@
       </compilation-unit>
     </test-case>
     <test-case FilePath="string">
+      <compilation-unit name="substring_multi_codepoint_01">
+        <output-dir compare="Text">substring_multi_codepoint_01</output-dir>
+      </compilation-unit>
+    </test-case>
+    <test-case FilePath="string">
       <compilation-unit name="substring-after-1">
         <output-dir compare="Text">substring-after-1</output-dir>
       </compilation-unit>
diff --git a/asterixdb/asterix-doc/src/main/markdown/builtins/2_string_common.md b/asterixdb/asterix-doc/src/main/markdown/builtins/2_string_common.md
index 58d0e10..528ff5d 100644
--- a/asterixdb/asterix-doc/src/main/markdown/builtins/2_string_common.md
+++ b/asterixdb/asterix-doc/src/main/markdown/builtins/2_string_common.md
@@ -554,6 +554,8 @@
         substr(string, offset[, length])
 
  * Returns the substring from the given string `string` based on the given start offset `offset` with the optional `length`. 
+ Note that both of the `offset` and `length` are in the unit of code point
+ (e.g. the emoji family ๐Ÿ‘จโ€๐Ÿ‘ฉโ€๐Ÿ‘งโ€๐Ÿ‘ฆ has 7 code points).
  The function uses the 0-based position. Another version of the function uses the 1-based position. Below are the
  aliases for each version:
 
diff --git a/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/primitive/UTF8StringPointable.java b/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/primitive/UTF8StringPointable.java
index 9ff2f41..944b317 100644
--- a/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/primitive/UTF8StringPointable.java
+++ b/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/primitive/UTF8StringPointable.java
@@ -114,6 +114,14 @@ public final class UTF8StringPointable extends AbstractPointable implements IHas
         return UTF8StringUtil.charSize(bytes, start + offset);
     }
 
+    public int codePointAt(int offset) {
+        return UTF8StringUtil.codePointAt(bytes, start + offset);
+    }
+
+    public int codePointSize(int offset) {
+        return UTF8StringUtil.codePointSize(bytes, start + offset);
+    }
+
     /**
      * Gets the length of the string in characters.
      * The first time call will need to go through the entire string, the following call will just return the pre-caculated result
@@ -337,43 +345,45 @@ public final class UTF8StringPointable extends AbstractPointable implements IHas
     }
 
     /**
+     * Return the substring. Note that the offset and length are in the unit of code point.
      * @return {@code true} if substring was successfully written into given {@code out}, or
-     *         {@code false} if substring could not be obtained ({@code charOffset} or {@code charLength}
+     *         {@code false} if substring could not be obtained ({@code codePointOffset} or {@code codePointLength}
      *         are less than 0 or starting position is greater than the input length)
      */
-    public boolean substr(int charOffset, int charLength, UTF8StringBuilder builder, GrowableArray out)
+    public boolean substr(int codePointOffset, int codePointLength, UTF8StringBuilder builder, GrowableArray out)
             throws IOException {
-        return substr(this, charOffset, charLength, builder, out);
+        return substr(this, codePointOffset, codePointLength, builder, out);
     }
 
     /**
+     * Return the substring. Note that the offset and length are in the unit of code point.
      * @return {@code true} if substring was successfully written into given {@code out}, or
-     *         {@code false} if substring could not be obtained ({@code charOffset} or {@code charLength}
+     *         {@code false} if substring could not be obtained ({@code codePointOffset} or {@code codePointLength}
      *         are less than 0 or starting position is greater than the input length)
      */
-    public static boolean substr(UTF8StringPointable src, int charOffset, int charLength, UTF8StringBuilder builder,
-            GrowableArray out) throws IOException {
-        if (charOffset < 0 || charLength < 0) {
+    public static boolean substr(UTF8StringPointable src, int codePointOffset, int codePointLength,
+            UTF8StringBuilder builder, GrowableArray out) throws IOException {
+        if (codePointOffset < 0 || codePointLength < 0) {
             return false;
         }
 
         int utfLen = src.getUTF8Length();
-        int chIdx = 0;
+        int codePointIdx = 0;
         int byteIdx = 0;
-        while (byteIdx < utfLen && chIdx < charOffset) {
-            byteIdx += src.charSize(src.getMetaDataLength() + byteIdx);
-            chIdx++;
+        while (byteIdx < utfLen && codePointIdx < codePointOffset) {
+            byteIdx += src.codePointSize(src.getMetaDataLength() + byteIdx);
+            codePointIdx++;
         }
         if (byteIdx >= utfLen) {
             return false;
         }
 
-        builder.reset(out, Math.min(utfLen - byteIdx, (int) (charLength * 1.0 * byteIdx / chIdx)));
-        chIdx = 0;
-        while (byteIdx < utfLen && chIdx < charLength) {
-            builder.appendChar(src.charAt(src.getMetaDataLength() + byteIdx));
-            chIdx++;
-            byteIdx += src.charSize(src.getMetaDataLength() + byteIdx);
+        builder.reset(out, Math.min(utfLen - byteIdx, (int) (codePointLength * 1.0 * byteIdx / codePointIdx)));
+        codePointIdx = 0;
+        while (byteIdx < utfLen && codePointIdx < codePointLength) {
+            builder.appendCodePoint(src.codePointAt(src.getMetaDataLength() + byteIdx));
+            codePointIdx++;
+            byteIdx += src.codePointSize(src.getMetaDataLength() + byteIdx);
         }
         builder.finish();
         return true;
diff --git a/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/util/UTF8StringBuilder.java b/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/util/UTF8StringBuilder.java
index 2300c06..147ac86 100644
--- a/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/util/UTF8StringBuilder.java
+++ b/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/util/UTF8StringBuilder.java
@@ -23,10 +23,19 @@ import org.apache.hyracks.util.string.UTF8StringUtil;
 
 public class UTF8StringBuilder extends AbstractVarLenObjectBuilder {
 
+    private final char[] tmpCharsForCodePointConversion = new char[2];
+
     public void appendChar(char ch) throws IOException {
         UTF8StringUtil.writeCharAsModifiedUTF8(ch, out);
     }
 
+    public void appendCodePoint(int codePoint) throws IOException {
+        int numChar = Character.toChars(codePoint, tmpCharsForCodePointConversion, 0);
+        for (int i = 0; i < numChar; i++) {
+            appendChar(tmpCharsForCodePointConversion[i]);
+        }
+    }
+
     public void appendString(String string) throws IOException {
         for (int i = 0; i < string.length(); i++) {
             appendChar(string.charAt(i));
diff --git a/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/test/java/org/apache/hyracks/data/std/primitive/UTF8StringPointableTest.java b/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/test/java/org/apache/hyracks/data/std/primitive/UTF8StringPointableTest.java
index 302e7a0..fa93003 100644
--- a/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/test/java/org/apache/hyracks/data/std/primitive/UTF8StringPointableTest.java
+++ b/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/test/java/org/apache/hyracks/data/std/primitive/UTF8StringPointableTest.java
@@ -20,10 +20,14 @@
 package org.apache.hyracks.data.std.primitive;
 
 import static org.apache.hyracks.data.std.primitive.UTF8StringPointable.generateUTF8Pointable;
+import static org.apache.hyracks.util.string.UTF8StringSample.STRING_EMOJI_FAMILY_OF_2;
+import static org.apache.hyracks.util.string.UTF8StringSample.STRING_EMOJI_FAMILY_OF_4;
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertFalse;
 import static org.junit.Assert.assertTrue;
 
+import java.io.IOException;
+
 import org.apache.commons.lang3.CharSet;
 import org.apache.hyracks.data.std.util.GrowableArray;
 import org.apache.hyracks.data.std.util.UTF8StringBuilder;
@@ -40,6 +44,11 @@ public class UTF8StringPointableTest {
     public static UTF8StringPointable STRING_LEN_127 = generateUTF8Pointable(UTF8StringSample.STRING_LEN_127);
     public static UTF8StringPointable STRING_LEN_128 = generateUTF8Pointable(UTF8StringSample.STRING_LEN_128);
 
+    public static UTF8StringPointable STRING_POINTABLE_EMOJI_FAMILY_OF_4 =
+            generateUTF8Pointable(STRING_EMOJI_FAMILY_OF_4);
+    public static UTF8StringPointable STRING_POINTABLE_EMOJI_FAMILY_OF_2 =
+            generateUTF8Pointable(STRING_EMOJI_FAMILY_OF_2);
+
     @Test
     public void testGetStringLength() throws Exception {
         UTF8StringPointable utf8Ptr = generateUTF8Pointable(UTF8StringSample.STRING_LEN_127);
@@ -119,6 +128,19 @@ public class UTF8StringPointableTest {
     }
 
     @Test
+    public void testSubstrWithMultiCodePointCharacter() throws IOException {
+        GrowableArray storage = new GrowableArray();
+        UTF8StringBuilder builder = new UTF8StringBuilder();
+
+        // The middle 2 people of the 4-people-family is a 2-people-family
+        STRING_POINTABLE_EMOJI_FAMILY_OF_4.substr(2, 3, builder, storage);
+        UTF8StringPointable result = new UTF8StringPointable();
+        result.set(storage.getByteArray(), 0, storage.getLength());
+
+        assertEquals(0, STRING_POINTABLE_EMOJI_FAMILY_OF_2.compareTo(result));
+    }
+
+    @Test
     public void testSubstrBefore() throws Exception {
         UTF8StringBuilder builder = new UTF8StringBuilder();
         GrowableArray storage = new GrowableArray();
diff --git a/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java b/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java
index 2b0e49e..c3ee97c 100644
--- a/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java
+++ b/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java
@@ -90,6 +90,50 @@ public class UTF8StringUtil {
         }
     }
 
+    public static int codePointAt(byte[] b, int s) {
+        char c1 = charAt(b, s);
+
+        if (Character.isLowSurrogate(c1)) {
+            // In this case, the index s doesn't point to a correct position
+            throw new IllegalArgumentException("decoding error: got a low surrogate without a high surrogate");
+        }
+
+        if (Character.isHighSurrogate(c1)) {
+            // If c1 is the a high surrogate and also the last char in the byte array (that means the byte array is somehow illegal),
+            // then an exception will be thrown because there is no low surrogate (c2) available in the byte array
+            s += charSize(b, s);
+            char c2 = charAt(b, s);
+            if (Character.isLowSurrogate(c2)) {
+                return Character.toCodePoint(c1, c2);
+            } else {
+                throw new IllegalArgumentException(
+                        "decoding error: the high surrogate is not followed by a low surrogate");
+            }
+        }
+
+        return c1;
+    }
+
+    public static int codePointSize(byte[] b, int s) {
+        char c1 = charAt(b, s);
+        int size1 = charSize(b, s);
+
+        if (Character.isLowSurrogate(c1)) {
+            throw new IllegalArgumentException("decoding error: got a low surrogate without a high surrogate");
+        }
+
+        if (Character.isHighSurrogate(c1)) {
+            // Similar to the above codePointAt(),
+            // if c1 is the a high surrogate and also the last char in the byte array (that means the byte array is somehow illegal),
+            // then an exception will be thrown because there is no low surrogate available in the byte array
+            s += charSize(b, s);
+            int size2 = charSize(b, s);
+            return size1 + size2;
+        }
+
+        return size1;
+    }
+
     public static boolean isCharStart(byte[] b, int s) {
         int c = b[s] & 0xff;
         return (c >> 6) != 2;
diff --git a/hyracks-fullstack/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/string/UTF8StringSample.java b/hyracks-fullstack/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/string/UTF8StringSample.java
index 6984d75..1502f25 100644
--- a/hyracks-fullstack/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/string/UTF8StringSample.java
+++ b/hyracks-fullstack/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/string/UTF8StringSample.java
@@ -39,6 +39,9 @@ public class UTF8StringSample {
     public static final String STRING_UTF8_MIX_LOWERCASE = "\uD841\uDF0E\uD841\uDF31้”Ÿxๆ–คyๆ‹ทzร ";
     public static final String STRING_NEEDS_2_JAVA_CHARS_1 = "\uD83D\uDE22\uD83D\uDE22\uD83D\uDC89\uD83D\uDC89";
     public static final String STRING_NEEDS_2_JAVA_CHARS_2 = "๐Ÿ˜ข๐Ÿ˜ข๐Ÿ’‰๐Ÿ’‰";
+    public static final String STRING_EMOJI_FAMILY_OF_4 =
+            "\uD83D\uDC68\u200D\uD83D\uDC68\u200D\uD83D\uDC66\u200D\uD83D\uDC66";
+    public static final String STRING_EMOJI_FAMILY_OF_2 = "\uD83D\uDC68\u200D\uD83D\uDC66";
 
     public static final String STRING_LEN_127 = generateStringRepeatBy(ONE_ASCII_CHAR, 127);
     public static final String STRING_LEN_128 = generateStringRepeatBy(ONE_ASCII_CHAR, 128);