You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@asterixdb.apache.org by im...@apache.org on 2020/04/09 19:02:57 UTC

[asterixdb] branch master updated: [NO ISSUE] Add UTF8StringUtil.getUTF8StringInArray() for tokenizer scenario

This is an automated email from the ASF dual-hosted git repository.

imaxon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/asterixdb.git


The following commit(s) were added to refs/heads/master by this push:
     new 38f652a  [NO ISSUE] Add UTF8StringUtil.getUTF8StringInArray() for tokenizer scenario
38f652a is described below

commit 38f652a0dc144d0064f54c0505968eddc362e45b
Author: Rui Guo <ru...@uci.edu>
AuthorDate: Fri Apr 3 22:04:01 2020 -0700

    [NO ISSUE] Add UTF8StringUtil.getUTF8StringInArray() for tokenizer scenario
    
    Change-Id: I273a776f14a2846e5380f2bdc4a3168a1dac052c
    Reviewed-on: https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/5565
    Contrib: Jenkins <je...@fulliautomatix.ics.uci.edu>
    Tested-by: Jenkins <je...@fulliautomatix.ics.uci.edu>
    Reviewed-by: Ian Maxon <im...@uci.edu>
    Integration-Tests: Jenkins <je...@fulliautomatix.ics.uci.edu>
---
 .../apache/hyracks/util/string/UTF8StringUtil.java | 14 +++++++++++++
 .../hyracks/util/string/UTF8StringUtilTest.java    | 23 ++++++++++++++++++++++
 2 files changed, 37 insertions(+)

diff --git a/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java b/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java
index f50fa90..2b0e49e 100644
--- a/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java
+++ b/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java
@@ -324,6 +324,20 @@ public class UTF8StringUtil {
         return builder;
     }
 
+    // Different from the above toString() methods, here we assume the byte[] doesn't contain NumBytesToStoreLength
+    // In fact, this is used for string tokenizer: get "hello" and "world" from the bytes of "hello world"
+    public static String getUTF8StringInArray(byte[] b, int start, int len) {
+        StringBuilder builder = new StringBuilder();
+
+        for (int i = start; i < start + len;) {
+            char c = UTF8StringUtil.charAt(b, i);
+            builder.append(c);
+            i += UTF8StringUtil.charSize(b, i);
+        }
+
+        return builder.toString();
+    }
+
     public static void printUTF8StringWithQuotes(byte[] b, int s, int l, OutputStream os) throws IOException {
         printUTF8String(b, s, l, os, true);
     }
diff --git a/hyracks-fullstack/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/string/UTF8StringUtilTest.java b/hyracks-fullstack/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/string/UTF8StringUtilTest.java
index b75d68c..2c99104 100644
--- a/hyracks-fullstack/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/string/UTF8StringUtilTest.java
+++ b/hyracks-fullstack/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/string/UTF8StringUtilTest.java
@@ -31,6 +31,7 @@ import static org.apache.hyracks.util.string.UTF8StringUtil.compareTo;
 import static org.apache.hyracks.util.string.UTF8StringUtil.getModifiedUTF8Len;
 import static org.apache.hyracks.util.string.UTF8StringUtil.getNumBytesToStoreLength;
 import static org.apache.hyracks.util.string.UTF8StringUtil.getStringLength;
+import static org.apache.hyracks.util.string.UTF8StringUtil.getUTF8StringInArray;
 import static org.apache.hyracks.util.string.UTF8StringUtil.getUTFLength;
 import static org.apache.hyracks.util.string.UTF8StringUtil.hash;
 import static org.apache.hyracks.util.string.UTF8StringUtil.lowerCaseCompareTo;
@@ -42,6 +43,7 @@ import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertTrue;
 
 import java.io.IOException;
+import java.util.List;
 
 import org.junit.Test;
 
@@ -153,4 +155,25 @@ public class UTF8StringUtilTest {
         assertTrue(familyOne != familyTwo);
     }
 
+    @Test
+    public void testGetUTF8StringInArray() {
+        String str = null;
+        byte[] bytes = null;
+        List<String> answer = null;
+
+        str = "database group at university of California, Irvine 23333";
+        bytes = writeStringToBytes(str);
+        // First byte in bytes is for the number of bytes of the entire string,
+        // and it should be skipped in getUTF8StringInArray
+        assertEquals("database", getUTF8StringInArray(bytes, 1, 8));
+        assertEquals("at", getUTF8StringInArray(bytes, 16, 2));
+        // test upper case
+        assertEquals("California", getUTF8StringInArray(bytes, 33, 10));
+        // test non-english char
+        assertEquals(",", getUTF8StringInArray(bytes, 43, 1));
+        assertEquals("Irvine", getUTF8StringInArray(bytes, 45, 6));
+        // test number
+        assertEquals("23333", getUTF8StringInArray(bytes, 52, 5));
+    }
+
 }