You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@asterixdb.apache.org by im...@apache.org on 2020/04/09 19:02:57 UTC
[asterixdb] branch master updated: [NO ISSUE] Add
UTF8StringUtil.getUTF8StringInArray() for tokenizer scenario
This is an automated email from the ASF dual-hosted git repository.
imaxon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/asterixdb.git
The following commit(s) were added to refs/heads/master by this push:
new 38f652a [NO ISSUE] Add UTF8StringUtil.getUTF8StringInArray() for tokenizer scenario
38f652a is described below
commit 38f652a0dc144d0064f54c0505968eddc362e45b
Author: Rui Guo <ru...@uci.edu>
AuthorDate: Fri Apr 3 22:04:01 2020 -0700
[NO ISSUE] Add UTF8StringUtil.getUTF8StringInArray() for tokenizer scenario
Change-Id: I273a776f14a2846e5380f2bdc4a3168a1dac052c
Reviewed-on: https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/5565
Contrib: Jenkins <je...@fulliautomatix.ics.uci.edu>
Tested-by: Jenkins <je...@fulliautomatix.ics.uci.edu>
Reviewed-by: Ian Maxon <im...@uci.edu>
Integration-Tests: Jenkins <je...@fulliautomatix.ics.uci.edu>
---
.../apache/hyracks/util/string/UTF8StringUtil.java | 14 +++++++++++++
.../hyracks/util/string/UTF8StringUtilTest.java | 23 ++++++++++++++++++++++
2 files changed, 37 insertions(+)
diff --git a/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java b/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java
index f50fa90..2b0e49e 100644
--- a/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java
+++ b/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java
@@ -324,6 +324,20 @@ public class UTF8StringUtil {
return builder;
}
+ // Unlike the toString() methods above, this assumes b[start, start + len) holds only character bytes, with no NumBytesToStoreLength prefix; start and len are byte offsets/counts.
+ // Intended for string tokenizers, e.g. extracting "hello" and "world" from the bytes of "hello world".
+ public static String getUTF8StringInArray(byte[] b, int start, int len) {
+ StringBuilder builder = new StringBuilder();
+
+ for (int i = start; i < start + len;) { // i is a byte offset into b, not a character index
+ char c = UTF8StringUtil.charAt(b, i);
+ builder.append(c);
+ i += UTF8StringUtil.charSize(b, i); // step past the char just read (presumably its encoded byte width; confirm in charSize)
+ }
+
+ return builder.toString();
+ }
+
public static void printUTF8StringWithQuotes(byte[] b, int s, int l, OutputStream os) throws IOException {
printUTF8String(b, s, l, os, true); // delegates with the last flag set to true (presumably "emit surrounding quotes"; confirm in printUTF8String)
}
diff --git a/hyracks-fullstack/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/string/UTF8StringUtilTest.java b/hyracks-fullstack/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/string/UTF8StringUtilTest.java
index b75d68c..2c99104 100644
--- a/hyracks-fullstack/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/string/UTF8StringUtilTest.java
+++ b/hyracks-fullstack/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/string/UTF8StringUtilTest.java
@@ -31,6 +31,7 @@ import static org.apache.hyracks.util.string.UTF8StringUtil.compareTo;
import static org.apache.hyracks.util.string.UTF8StringUtil.getModifiedUTF8Len;
import static org.apache.hyracks.util.string.UTF8StringUtil.getNumBytesToStoreLength;
import static org.apache.hyracks.util.string.UTF8StringUtil.getStringLength;
+import static org.apache.hyracks.util.string.UTF8StringUtil.getUTF8StringInArray;
import static org.apache.hyracks.util.string.UTF8StringUtil.getUTFLength;
import static org.apache.hyracks.util.string.UTF8StringUtil.hash;
import static org.apache.hyracks.util.string.UTF8StringUtil.lowerCaseCompareTo;
@@ -42,6 +43,7 @@ import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import java.io.IOException;
+import java.util.List;
import org.junit.Test;
@@ -153,4 +155,25 @@ public class UTF8StringUtilTest {
assertTrue(familyOne != familyTwo);
}
+ @Test
+ public void testGetUTF8StringInArray() {
+ String str = null;
+ byte[] bytes = null;
+ List<String> answer = null; // NOTE(review): never read in this test; candidate for removal
+
+ str = "database group at university of California, Irvine 23333";
+ bytes = writeStringToBytes(str);
+ // The first byte of bytes stores the byte length of the whole string, so every start offset
+ // passed to getUTF8StringInArray below is the character's position in str plus 1
+ assertEquals("database", getUTF8StringInArray(bytes, 1, 8));
+ assertEquals("at", getUTF8StringInArray(bytes, 16, 2));
+ // test a token containing an upper-case letter
+ assertEquals("California", getUTF8StringInArray(bytes, 33, 10));
+ // test a punctuation (non-letter) char
+ assertEquals(",", getUTF8StringInArray(bytes, 43, 1));
+ assertEquals("Irvine", getUTF8StringInArray(bytes, 45, 6));
+ // test digits
+ assertEquals("23333", getUTF8StringInArray(bytes, 52, 5));
+ }
+
}