You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@asterixdb.apache.org by dl...@apache.org on 2020/07/24 00:12:29 UTC

[asterixdb] branch master updated: [ASTERIXDB-2762] reverse() per code point

This is an automated email from the ASF dual-hosted git repository.

dlych pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/asterixdb.git


The following commit(s) were added to refs/heads/master by this push:
     new 75a2cca  [ASTERIXDB-2762] reverse() per code point
75a2cca is described below

commit 75a2cca2aabed2cb02586c9c8202de5fbfacd36f
Author: Rui Guo <ru...@uci.edu>
AuthorDate: Thu Jul 23 11:44:20 2020 -0700

    [ASTERIXDB-2762] reverse() per code point
    
    This commit aims to reverse a string per code point instead of per Java
    char in the reverse() function.
    
    Change-Id: I437903b8bc668c836e781f4a965e6039305b8654
    Reviewed-on: https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/7303
    Integration-Tests: Jenkins <je...@fulliautomatix.ics.uci.edu>
    Tested-by: Jenkins <je...@fulliautomatix.ics.uci.edu>
    Reviewed-by: Dmitry Lychagin <dm...@couchbase.com>
---
 .../string/reverse/reverse.1.query.sqlpp           |  5 +++--
 .../runtimets/results/string/reverse/reverse.1.adm |  2 +-
 .../src/main/markdown/builtins/2_string_common.md  | 12 +++++++++-
 .../data/std/primitive/UTF8StringPointable.java    | 20 +++++++++++++++++
 .../std/primitive/UTF8StringPointableTest.java     | 26 ++++++++++++++++++++++
 5 files changed, 61 insertions(+), 4 deletions(-)

diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/string/reverse/reverse.1.query.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/string/reverse/reverse.1.query.sqlpp
index e127372..f450868 100644
--- a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/string/reverse/reverse.1.query.sqlpp
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/string/reverse/reverse.1.query.sqlpp
@@ -22,5 +22,6 @@
    "t2": reverse(""),
    "t3": reverse("abcd"),
    "t4": string_to_codepoint(reverse("a\u00D7\u2103\u00F7\u2109b")),
-   "t5": ( from ["ab", "abc", "abcd"] t select value reverse(t) order by t )
- }
\ No newline at end of file
+   "t5": ( from ["ab", "abc", "abcd"] t select value reverse(t) order by t ),
+   "t6": reverse("🇨🇳")
+ };
\ No newline at end of file
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/results/string/reverse/reverse.1.adm b/asterixdb/asterix-app/src/test/resources/runtimets/results/string/reverse/reverse.1.adm
index a2b8b2c..d0669c4 100644
--- a/asterixdb/asterix-app/src/test/resources/runtimets/results/string/reverse/reverse.1.adm
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/results/string/reverse/reverse.1.adm
@@ -1 +1 @@
-{ "t1": [ true, true ], "t2": "", "t3": "dcba", "t4": [ 98, 8457, 247, 8451, 215, 97 ], "t5": [ "ba", "cba", "dcba" ] }
\ No newline at end of file
+{ "t1": [ true, true ], "t2": "", "t3": "dcba", "t4": [ 98, 8457, 247, 8451, 215, 97 ], "t5": [ "ba", "cba", "dcba" ], "t6": "🇳🇨" }
diff --git a/asterixdb/asterix-doc/src/main/markdown/builtins/2_string_common.md b/asterixdb/asterix-doc/src/main/markdown/builtins/2_string_common.md
index 1c713b0..b7a1aca 100644
--- a/asterixdb/asterix-doc/src/main/markdown/builtins/2_string_common.md
+++ b/asterixdb/asterix-doc/src/main/markdown/builtins/2_string_common.md
@@ -460,6 +460,8 @@
         reverse(string)
 
  * Returns a string formed by reversing characters in the input `string`.
+ For characters made up of multiple code points, the code point is the minimal unit that is reversed.
+ See the following examples for more details.
  * Arguments:
     * `string` : a `string` to be reversed
  * Return Value:
@@ -473,11 +475,19 @@
 
         reverse("hello");
 
-
  * The expected result is:
 
         "olleh"
 
+* Example of multi-code-point character (Korean):
+
+        reverse("한글");
+
+* The expected result is
+ (the Korean characters are split into code points and then the code points are reversed):
+
+        "ᆯᅳᄀᆫᅡᄒ"
+
 
 ### rtrim ###
  * Syntax:
diff --git a/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/primitive/UTF8StringPointable.java b/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/primitive/UTF8StringPointable.java
index 3b1f18b..9a38a4e 100644
--- a/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/primitive/UTF8StringPointable.java
+++ b/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/primitive/UTF8StringPointable.java
@@ -656,7 +656,27 @@ public final class UTF8StringPointable extends AbstractPointable implements IHas
         int srcEnd = srcPtr.getStartOffset() + srcPtr.getLength() - 1;
         for (int cursorIndex = srcEnd; cursorIndex >= srcStart; cursorIndex--) {
             if (UTF8StringUtil.isCharStart(srcPtr.bytes, cursorIndex)) {
+                char ch = UTF8StringUtil.charAt(srcPtr.bytes, cursorIndex);
                 int charSize = UTF8StringUtil.charSize(srcPtr.bytes, cursorIndex);
+
+                if (Character.isLowSurrogate(ch)) {
+                    while (cursorIndex >= srcStart) {
+                        cursorIndex--;
+                        if (UTF8StringUtil.isCharStart(srcPtr.bytes, cursorIndex)) {
+                            ch = UTF8StringUtil.charAt(srcPtr.bytes, cursorIndex);
+                            if (Character.isHighSurrogate(ch) == false) {
+                                throw new IllegalArgumentException(
+                                        "Decoding Error: no corresponding high surrogate found for the following low surrogate");
+                            }
+
+                            charSize += UTF8StringUtil.charSize(srcPtr.bytes, cursorIndex);
+                            break;
+                        }
+                    }
+                } else if (Character.isHighSurrogate(ch)) {
+                    throw new IllegalArgumentException("Decoding Error: get a high surrogate without low surrogate");
+                }
+
                 builder.appendUtf8StringPointable(srcPtr, cursorIndex, charSize);
             }
         }
diff --git a/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/test/java/org/apache/hyracks/data/std/primitive/UTF8StringPointableTest.java b/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/test/java/org/apache/hyracks/data/std/primitive/UTF8StringPointableTest.java
index 22be7ca..387bc03 100644
--- a/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/test/java/org/apache/hyracks/data/std/primitive/UTF8StringPointableTest.java
+++ b/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/test/java/org/apache/hyracks/data/std/primitive/UTF8StringPointableTest.java
@@ -325,4 +325,30 @@ public class UTF8StringPointableTest {
         assertEquals(0, expected.compareTo(result));
     }
 
+    @Test
+    public void testReverse() throws Exception {
+        UTF8StringBuilder builder = new UTF8StringBuilder();
+        GrowableArray storage = new GrowableArray();
+        UTF8StringPointable result = new UTF8StringPointable();
+        UTF8StringPointable input = generateUTF8Pointable(" I'd like to reverse ");
+        UTF8StringPointable expected = generateUTF8Pointable(" esrever ot ekil d'I ");
+
+        UTF8StringPointable.reverse(input, builder, storage);
+        result.set(storage.getByteArray(), 0, storage.getLength());
+        assertEquals(0, expected.compareTo(result));
+    }
+
+    @Test
+    public void testReverseWithEmoji() throws IOException {
+        UTF8StringBuilder builder = new UTF8StringBuilder();
+        GrowableArray storage = new GrowableArray();
+        UTF8StringPointable result = new UTF8StringPointable();
+        UTF8StringPointable input = generateUTF8Pointable("\uD83C\uDDE8\uD83C\uDDF3"); // CN flag
+        UTF8StringPointable expected = generateUTF8Pointable("\uD83C\uDDF3\uD83C\uDDE8"); // NC flag
+
+        UTF8StringPointable.reverse(input, builder, storage);
+        result.set(storage.getByteArray(), 0, storage.getLength());
+        assertEquals(0, expected.compareTo(result));
+    }
+
 }