You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@asterixdb.apache.org by dl...@apache.org on 2020/07/24 00:12:29 UTC
[asterixdb] branch master updated: [ASTERIXDB-2762] reverse() per
code point
This is an automated email from the ASF dual-hosted git repository.
dlych pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/asterixdb.git
The following commit(s) were added to refs/heads/master by this push:
new 75a2cca [ASTERIXDB-2762] reverse() per code point
75a2cca is described below
commit 75a2cca2aabed2cb02586c9c8202de5fbfacd36f
Author: Rui Guo <ru...@uci.edu>
AuthorDate: Thu Jul 23 11:44:20 2020 -0700
[ASTERIXDB-2762] reverse() per code point
This commit aims to reverse a string per code point instead of per Java
char in the reverse() function.
Change-Id: I437903b8bc668c836e781f4a965e6039305b8654
Reviewed-on: https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/7303
Integration-Tests: Jenkins <je...@fulliautomatix.ics.uci.edu>
Tested-by: Jenkins <je...@fulliautomatix.ics.uci.edu>
Reviewed-by: Dmitry Lychagin <dm...@couchbase.com>
---
.../string/reverse/reverse.1.query.sqlpp | 5 +++--
.../runtimets/results/string/reverse/reverse.1.adm | 2 +-
.../src/main/markdown/builtins/2_string_common.md | 12 +++++++++-
.../data/std/primitive/UTF8StringPointable.java | 20 +++++++++++++++++
.../std/primitive/UTF8StringPointableTest.java | 26 ++++++++++++++++++++++
5 files changed, 61 insertions(+), 4 deletions(-)
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/string/reverse/reverse.1.query.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/string/reverse/reverse.1.query.sqlpp
index e127372..f450868 100644
--- a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/string/reverse/reverse.1.query.sqlpp
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/string/reverse/reverse.1.query.sqlpp
@@ -22,5 +22,6 @@
"t2": reverse(""),
"t3": reverse("abcd"),
"t4": string_to_codepoint(reverse("a\u00D7\u2103\u00F7\u2109b")),
- "t5": ( from ["ab", "abc", "abcd"] t select value reverse(t) order by t )
- }
\ No newline at end of file
+ "t5": ( from ["ab", "abc", "abcd"] t select value reverse(t) order by t ),
+ "t6": reverse("🇨🇳")
+ };
\ No newline at end of file
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/results/string/reverse/reverse.1.adm b/asterixdb/asterix-app/src/test/resources/runtimets/results/string/reverse/reverse.1.adm
index a2b8b2c..d0669c4 100644
--- a/asterixdb/asterix-app/src/test/resources/runtimets/results/string/reverse/reverse.1.adm
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/results/string/reverse/reverse.1.adm
@@ -1 +1 @@
-{ "t1": [ true, true ], "t2": "", "t3": "dcba", "t4": [ 98, 8457, 247, 8451, 215, 97 ], "t5": [ "ba", "cba", "dcba" ] }
\ No newline at end of file
+{ "t1": [ true, true ], "t2": "", "t3": "dcba", "t4": [ 98, 8457, 247, 8451, 215, 97 ], "t5": [ "ba", "cba", "dcba" ], "t6": "🇳🇨" }
diff --git a/asterixdb/asterix-doc/src/main/markdown/builtins/2_string_common.md b/asterixdb/asterix-doc/src/main/markdown/builtins/2_string_common.md
index 1c713b0..b7a1aca 100644
--- a/asterixdb/asterix-doc/src/main/markdown/builtins/2_string_common.md
+++ b/asterixdb/asterix-doc/src/main/markdown/builtins/2_string_common.md
@@ -460,6 +460,8 @@
reverse(string)
* Returns a string formed by reversing characters in the input `string`.
+ For characters composed of multiple code points, the code point is the minimal unit that is reversed.
+ See the following examples for more details.
* Arguments:
* `string` : a `string` to be reversed
* Return Value:
@@ -473,11 +475,19 @@
reverse("hello");
-
* The expected result is:
"olleh"
+* Example of multi-code-point character (Korean):
+
+ reverse("한글");
+
+* The expected result is
+ (the Korean characters are split into code points and then the code points are reversed):
+
+ "ᆯᅳᄀᆫᅡᄒ"
+
### rtrim ###
* Syntax:
diff --git a/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/primitive/UTF8StringPointable.java b/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/primitive/UTF8StringPointable.java
index 3b1f18b..9a38a4e 100644
--- a/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/primitive/UTF8StringPointable.java
+++ b/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/primitive/UTF8StringPointable.java
@@ -656,7 +656,27 @@ public final class UTF8StringPointable extends AbstractPointable implements IHas
int srcEnd = srcPtr.getStartOffset() + srcPtr.getLength() - 1;
for (int cursorIndex = srcEnd; cursorIndex >= srcStart; cursorIndex--) {
if (UTF8StringUtil.isCharStart(srcPtr.bytes, cursorIndex)) {
+ char ch = UTF8StringUtil.charAt(srcPtr.bytes, cursorIndex);
int charSize = UTF8StringUtil.charSize(srcPtr.bytes, cursorIndex);
+
+ if (Character.isLowSurrogate(ch)) {
+ while (cursorIndex >= srcStart) {
+ cursorIndex--;
+ if (UTF8StringUtil.isCharStart(srcPtr.bytes, cursorIndex)) {
+ ch = UTF8StringUtil.charAt(srcPtr.bytes, cursorIndex);
+ if (Character.isHighSurrogate(ch) == false) {
+ throw new IllegalArgumentException(
+ "Decoding Error: no corresponding high surrogate found for the following low surrogate");
+ }
+
+ charSize += UTF8StringUtil.charSize(srcPtr.bytes, cursorIndex);
+ break;
+ }
+ }
+ } else if (Character.isHighSurrogate(ch)) {
+ throw new IllegalArgumentException("Decoding Error: get a high surrogate without low surrogate");
+ }
+
builder.appendUtf8StringPointable(srcPtr, cursorIndex, charSize);
}
}
diff --git a/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/test/java/org/apache/hyracks/data/std/primitive/UTF8StringPointableTest.java b/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/test/java/org/apache/hyracks/data/std/primitive/UTF8StringPointableTest.java
index 22be7ca..387bc03 100644
--- a/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/test/java/org/apache/hyracks/data/std/primitive/UTF8StringPointableTest.java
+++ b/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/test/java/org/apache/hyracks/data/std/primitive/UTF8StringPointableTest.java
@@ -325,4 +325,30 @@ public class UTF8StringPointableTest {
assertEquals(0, expected.compareTo(result));
}
+ @Test
+ public void testReverse() throws Exception {
+ UTF8StringBuilder builder = new UTF8StringBuilder();
+ GrowableArray storage = new GrowableArray();
+ UTF8StringPointable result = new UTF8StringPointable();
+ UTF8StringPointable input = generateUTF8Pointable(" I'd like to reverse ");
+ UTF8StringPointable expected = generateUTF8Pointable(" esrever ot ekil d'I ");
+
+ UTF8StringPointable.reverse(input, builder, storage);
+ result.set(storage.getByteArray(), 0, storage.getLength());
+ assertEquals(0, expected.compareTo(result));
+ }
+
+ @Test
+ public void testReverseWithEmoji() throws IOException {
+ UTF8StringBuilder builder = new UTF8StringBuilder();
+ GrowableArray storage = new GrowableArray();
+ UTF8StringPointable result = new UTF8StringPointable();
+ UTF8StringPointable input = generateUTF8Pointable("\uD83C\uDDE8\uD83C\uDDF3"); // CN flag
+ UTF8StringPointable expected = generateUTF8Pointable("\uD83C\uDDF3\uD83C\uDDE8"); // NC flag
+
+ UTF8StringPointable.reverse(input, builder, storage);
+ result.set(storage.getByteArray(), 0, storage.getLength());
+ assertEquals(0, expected.compareTo(result));
+ }
+
}