You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@asterixdb.apache.org by dl...@apache.org on 2020/07/23 17:35:03 UTC

[asterixdb] branch master updated: [ASTERIXDB-2762] Use code point as the unit in trim()

This is an automated email from the ASF dual-hosted git repository.

dlych pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/asterixdb.git


The following commit(s) were added to refs/heads/master by this push:
     new 4ce394b  [ASTERIXDB-2762] Use code point as the unit in trim()
4ce394b is described below

commit 4ce394b6a1ccce77d3d76052812a90d09e192e11
Author: Rui Guo <ru...@uci.edu>
AuthorDate: Thu Jul 23 07:35:59 2020 -0700

    [ASTERIXDB-2762] Use code point as the unit in trim()
    
    This commit aims to use code point as the unit in trim().
    
    Currently, Java char (2 bytes) is used as the unit in trim(),
    however, for non-English characters such as Emoji and Korean,
    one character may have multiple bytes and thus can be trimmed
    in an illegal way if we use Java char as the unit.
    Instead, code point is a more natural unit to do so.
    
    Change-Id: If14092be9c2a654dba392bb2b773db81c9e47ae6
    Reviewed-on: https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/7283
    Integration-Tests: Jenkins <je...@fulliautomatix.ics.uci.edu>
    Tested-by: Jenkins <je...@fulliautomatix.ics.uci.edu>
    Reviewed-by: Dmitry Lychagin <dm...@couchbase.com>
---
 .../queries_sqlpp/string/trim/trim.1.query.sqlpp   |  6 +-
 .../runtimets/results/string/trim/trim.1.adm       |  2 +-
 .../test/resources/runtimets/testsuite_sqlpp.xml   | 20 +++---
 .../src/main/markdown/builtins/2_string_common.md  | 49 +++++++++++++--
 .../functions/StringLTrimDescriptor.java           |  3 +-
 .../functions/StringRTrimDescriptor.java           |  3 +-
 .../evaluators/functions/StringTrimDescriptor.java |  3 +-
 .../evaluators/functions/utils/StringTrimmer.java  | 19 +++---
 .../hyracks/hyracks-data/hyracks-data-std/pom.xml  |  8 +--
 .../data/std/primitive/UTF8StringPointable.java    | 72 +++++++++++++++-------
 .../std/primitive/UTF8StringPointableTest.java     | 60 +++++++++++++++---
 hyracks-fullstack/pom.xml                          |  5 ++
 12 files changed, 188 insertions(+), 62 deletions(-)

diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/string/trim/trim.1.query.sqlpp b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/string/trim/trim.1.query.sqlpp
index 61dc619..82326c4 100644
--- a/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/string/trim/trim.1.query.sqlpp
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/queries_sqlpp/string/trim/trim.1.query.sqlpp
@@ -33,5 +33,9 @@
    trim(null, null),
    trim("abc", missing),
    trim(missing, "abc"),
-   trim(missing, missing)
+   trim(missing, missing),
+   trim("🇺🇸", "🇺"),
+   trim("👨‍👩‍👧‍👦", "👨‍👦"),
+   ltrim("👨‍👩‍👧‍👦", "👨‍👦"),
+   rtrim("👨‍👩‍👧‍👦", "👨‍👦")
 ];
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/results/string/trim/trim.1.adm b/asterixdb/asterix-app/src/test/resources/runtimets/results/string/trim/trim.1.adm
index 410cb00..50ab5d0 100644
--- a/asterixdb/asterix-app/src/test/resources/runtimets/results/string/trim/trim.1.adm
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/results/string/trim/trim.1.adm
@@ -1 +1 @@
-[ "", "", "abc", "abcd", null, null, "", "", "bc", "bc", null, null, null, null, null, null ]
+[ "", "", "abc", "abcd", null, null, "", "", "bc", "bc", null, null, null, null, null, null, "🇸", "👩‍👧", "👩‍👧‍👦", "👨‍👩‍👧" ]
diff --git a/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_sqlpp.xml b/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_sqlpp.xml
index 8f9e1e8..94d0335 100644
--- a/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_sqlpp.xml
+++ b/asterixdb/asterix-app/src/test/resources/runtimets/testsuite_sqlpp.xml
@@ -9408,11 +9408,6 @@
       </compilation-unit>
     </test-case>
     <test-case FilePath="string">
-      <compilation-unit name="ltrim">
-        <output-dir compare="Text">ltrim</output-dir>
-      </compilation-unit>
-    </test-case>
-    <test-case FilePath="string">
       <compilation-unit name="matches02">
         <output-dir compare="Text">matches02</output-dir>
       </compilation-unit>
@@ -9768,11 +9763,6 @@
       </compilation-unit>
     </test-case>
     <test-case FilePath="string">
-      <compilation-unit name="rtrim">
-        <output-dir compare="Text">rtrim</output-dir>
-      </compilation-unit>
-    </test-case>
-    <test-case FilePath="string">
       <compilation-unit name="split">
         <output-dir compare="Text">split</output-dir>
       </compilation-unit>
@@ -10023,6 +10013,16 @@
       </compilation-unit>
     </test-case>
     <test-case FilePath="string">
+      <compilation-unit name="ltrim">
+        <output-dir compare="Text">ltrim</output-dir>
+      </compilation-unit>
+    </test-case>
+    <test-case FilePath="string">
+      <compilation-unit name="rtrim">
+        <output-dir compare="Text">rtrim</output-dir>
+      </compilation-unit>
+    </test-case>
+    <test-case FilePath="string">
       <compilation-unit name="uppercase">
         <output-dir compare="Text">uppercase</output-dir>
       </compilation-unit>
diff --git a/asterixdb/asterix-doc/src/main/markdown/builtins/2_string_common.md b/asterixdb/asterix-doc/src/main/markdown/builtins/2_string_common.md
index 8a98690..1c713b0 100644
--- a/asterixdb/asterix-doc/src/main/markdown/builtins/2_string_common.md
+++ b/asterixdb/asterix-doc/src/main/markdown/builtins/2_string_common.md
@@ -181,6 +181,10 @@
 
  * Returns a new string with all leading characters that appear in `chars` removed.
    By default, white space is the character to trim.
+   Note that here one character means one code point.
+   For example, the emoji 4-people-family notation "👩‍👩‍👧‍👦" contains 7 code points,
+   and it is possible to trim a few code points (such as a 2-people-family "👨‍👦") from it.
+   See the following example for more details.
  * Arguments:
     * `string` : a `string` to be trimmed,
     * `chars` : a `string` that contains characters that are used to trim.
@@ -189,17 +193,24 @@
     * `missing` if any argument is a `missing` value,
     * `null` if any argument is a `null` value but no argument is a `missing` value,
     * any other non-string input value will cause a type error.
-
+ * Related functions: see `trim()`, `rtrim()`
 
  * Example:
 
         ltrim("me like x-phone", "eml");
 
-
  * The expected result is:
 
         " like x-phone"
 
+ * Example with multi-codepoint notation (trim the man and boy from the family of man, woman, girl and boy):
+
+        ltrim("👨‍👩‍👧‍👦", "👨‍👦")
+
+ * The expected result is (only woman, girl and boy are left in the family):
+
+        "👩‍👧‍👦"
+
 
 ### position ###
  * Syntax:
@@ -467,6 +478,7 @@
 
         "olleh"
 
+
 ### rtrim ###
  * Syntax:
 
@@ -474,6 +486,10 @@
 
  * Returns a new string with all trailing characters that appear in `chars` removed.
    By default, white space is the character to trim.
+   Note that here one character means one code point.
+   For example, the emoji 4-people-family notation "👩‍👩‍👧‍👦" contains 7 code points,
+   and it is possible to trim a few code points (such as a 2-people-family "👨‍👦") from it.
+   See the following example for more details.
  * Arguments:
     * `string` : a `string` to be trimmed,
     * `chars` : a `string` that contains characters that are used to trim.
@@ -482,7 +498,7 @@
     * `missing` if any argument is a `missing` value,
     * `null` if any argument is a `null` value but no argument is a `missing` value,
     * any other non-string input value will cause a type error.
-
+ * Related functions: see `trim()`, `ltrim()`
 
  * Example:
 
@@ -493,7 +509,16 @@
 
  * The expected result is:
 
-        { "v1": "i like ", "v2": "i like " }
+        { "v1": "i like ", "v2": "i like x-" }
+
+ * Example with multi-codepoint notation (trim the man and boy from the family of man, woman, girl and boy):
+
+        rtrim("👨‍👩‍👧‍👦", "👨‍👦")
+
+ * The expected result is (only man, woman and girl are left in the family):
+
+        "👨‍👩‍👧"
+
 
 ### split ###
  * Syntax:
@@ -605,8 +630,12 @@ The function has an alias `substring`.
 
         trim(string[, chars]);
 
- * Returns a new string with all leading characters that appear in `chars` removed.
+ * Returns a new string with all leading and trailing characters that appear in `chars` removed.
    By default, white space is the character to trim.
+   Note that here one character means one code point.
+   For example, the emoji 4-people-family notation "👩‍👩‍👧‍👦" contains 7 code points,
+   and it is possible to trim a few code points (such as a 2-people-family "👨‍👦") from it.
+   See the following example for more details.
  * Arguments:
     * `string` : a `string` to be trimmed,
     * `chars` : a `string` that contains characters that are used to trim.
@@ -615,17 +644,25 @@ The function has an alias `substring`.
     * `missing` if any argument is a `missing` value,
     * `null` if any argument is a `null` value but no argument is a `missing` value,
     * any other non-string input value will cause a type error.
+ * Related functions: see `ltrim()`, `rtrim()`
 
 
  * Example:
 
         trim("i like x-phone", "xphoen");
 
-
  * The expected result is:
 
         " like "
 
+ * Example with multi-codepoint notation (trim the man and boy from the family of man, woman, girl and boy):
+
+       trim("👨‍👩‍👧‍👦", "👨‍👦")
+
+ * The expected result is (only woman and girl are left in the family):
+
+         "👩‍👧"
+
 
 ### upper ###
  * Syntax:
diff --git a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/StringLTrimDescriptor.java b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/StringLTrimDescriptor.java
index 6e4f476..4d4fd93 100644
--- a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/StringLTrimDescriptor.java
+++ b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/StringLTrimDescriptor.java
@@ -60,7 +60,8 @@ public class StringLTrimDescriptor extends AbstractScalarFunctionDynamicDescript
             public IScalarEvaluator createScalarEvaluator(IEvaluatorContext ctx) throws HyracksDataException {
                 return new AbstractUnaryStringStringEval(ctx, args[0], StringLTrimDescriptor.this.getIdentifier(),
                         sourceLoc) {
-                    private StringTrimmer stringTrimmer = new StringTrimmer(resultBuilder, resultArray, " ");
+                    private StringTrimmer stringTrimmer =
+                            new StringTrimmer(resultBuilder, resultArray, UTF8StringPointable.SPACE_STRING_POINTABLE);
 
                     @Override
                     protected void process(UTF8StringPointable srcPtr, IPointable resultStrPtr) throws IOException {
diff --git a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/StringRTrimDescriptor.java b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/StringRTrimDescriptor.java
index f74b641..e99c401 100644
--- a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/StringRTrimDescriptor.java
+++ b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/StringRTrimDescriptor.java
@@ -60,7 +60,8 @@ public class StringRTrimDescriptor extends AbstractScalarFunctionDynamicDescript
             public IScalarEvaluator createScalarEvaluator(IEvaluatorContext ctx) throws HyracksDataException {
                 return new AbstractUnaryStringStringEval(ctx, args[0], StringRTrimDescriptor.this.getIdentifier(),
                         sourceLoc) {
-                    private StringTrimmer stringTrimmer = new StringTrimmer(resultBuilder, resultArray, " ");
+                    private StringTrimmer stringTrimmer =
+                            new StringTrimmer(resultBuilder, resultArray, UTF8StringPointable.SPACE_STRING_POINTABLE);
 
                     @Override
                     protected void process(UTF8StringPointable srcPtr, IPointable resultStrPtr) throws IOException {
diff --git a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/StringTrimDescriptor.java b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/StringTrimDescriptor.java
index 704bf7d..3f0ec52 100644
--- a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/StringTrimDescriptor.java
+++ b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/StringTrimDescriptor.java
@@ -60,7 +60,8 @@ public class StringTrimDescriptor extends AbstractScalarFunctionDynamicDescripto
             public IScalarEvaluator createScalarEvaluator(IEvaluatorContext ctx) throws HyracksDataException {
                 return new AbstractUnaryStringStringEval(ctx, args[0], StringTrimDescriptor.this.getIdentifier(),
                         sourceLoc) {
-                    private StringTrimmer stringTrimmer = new StringTrimmer(resultBuilder, resultArray, " ");
+                    private StringTrimmer stringTrimmer =
+                            new StringTrimmer(resultBuilder, resultArray, UTF8StringPointable.SPACE_STRING_POINTABLE);
 
                     @Override
                     protected void process(UTF8StringPointable srcPtr, IPointable resultStrPtr) throws IOException {
diff --git a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/utils/StringTrimmer.java b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/utils/StringTrimmer.java
index 3e41b1b..8dc41f5 100644
--- a/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/utils/StringTrimmer.java
+++ b/asterixdb/asterix-runtime/src/main/java/org/apache/asterix/runtime/evaluators/functions/utils/StringTrimmer.java
@@ -22,13 +22,15 @@ package org.apache.asterix.runtime.evaluators.functions.utils;
 import java.io.IOException;
 
 import org.apache.asterix.runtime.evaluators.functions.StringEvaluatorUtils;
-import org.apache.commons.lang3.CharSet;
 import org.apache.hyracks.data.std.api.IPointable;
 import org.apache.hyracks.data.std.primitive.UTF8StringPointable;
 import org.apache.hyracks.data.std.util.ByteArrayAccessibleOutputStream;
 import org.apache.hyracks.data.std.util.GrowableArray;
 import org.apache.hyracks.data.std.util.UTF8StringBuilder;
 
+import it.unimi.dsi.fastutil.ints.IntArraySet;
+import it.unimi.dsi.fastutil.ints.IntSet;
+
 /**
  * A wrapper for string trim methods.
  */
@@ -37,7 +39,7 @@ public class StringTrimmer {
     // For the char set to trim.
     private final ByteArrayAccessibleOutputStream lastPatternStorage = new ByteArrayAccessibleOutputStream();
     private final UTF8StringPointable lastPatternPtr = new UTF8StringPointable();
-    private CharSet charSet;
+    private IntSet codePointSet = new IntArraySet();
 
     // For outputting the result.
     private final UTF8StringBuilder resultBuilder;
@@ -61,11 +63,12 @@ public class StringTrimmer {
      * @param pattern
      *            , the string that is used to construct the charset for trimming.
      */
-    public StringTrimmer(UTF8StringBuilder resultBuilder, GrowableArray resultArray, String pattern) {
+    public StringTrimmer(UTF8StringBuilder resultBuilder, GrowableArray resultArray, UTF8StringPointable pattern) {
         this.resultBuilder = resultBuilder;
         this.resultArray = resultArray;
         if (pattern != null) {
-            charSet = CharSet.getInstance(pattern);
+            codePointSet.clear();
+            pattern.getCodePoints(codePointSet);
         }
     }
 
@@ -76,10 +79,11 @@ public class StringTrimmer {
      *            , a pattern string.
      */
     public void build(UTF8StringPointable patternPtr) {
-        final boolean newPattern = charSet == null || lastPatternPtr.compareTo(patternPtr) != 0;
+        final boolean newPattern = (codePointSet.size() == 0) || lastPatternPtr.compareTo(patternPtr) != 0;
         if (newPattern) {
             StringEvaluatorUtils.copyResetUTF8Pointable(patternPtr, lastPatternStorage, lastPatternPtr);
-            charSet = CharSet.getInstance(patternPtr.toString());
+            codePointSet.clear();
+            patternPtr.getCodePoints(codePointSet);
         }
     }
 
@@ -98,7 +102,8 @@ public class StringTrimmer {
      */
     public void trim(UTF8StringPointable srcPtr, IPointable resultStrPtr, boolean left, boolean right)
             throws IOException {
-        UTF8StringPointable.trim(srcPtr, resultBuilder, resultArray, left, right, charSet);
+        srcPtr.trim(resultBuilder, resultArray, left, right, codePointSet);
         resultStrPtr.set(resultArray.getByteArray(), 0, resultArray.getLength());
     }
+
 }
diff --git a/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/pom.xml b/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/pom.xml
index c27d884..df1f94e 100644
--- a/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/pom.xml
+++ b/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/pom.xml
@@ -32,10 +32,6 @@
   </properties>
   <dependencies>
     <dependency>
-      <groupId>org.apache.commons</groupId>
-      <artifactId>commons-lang3</artifactId>
-    </dependency>
-    <dependency>
       <groupId>org.apache.hyracks</groupId>
       <artifactId>hyracks-util</artifactId>
       <version>${project.version}</version>
@@ -56,5 +52,9 @@
       <groupId>com.fasterxml.jackson.core</groupId>
       <artifactId>jackson-databind</artifactId>
     </dependency>
+    <dependency>
+      <groupId>it.unimi.dsi</groupId>
+      <artifactId>fastutil</artifactId>
+    </dependency>
   </dependencies>
 </project>
diff --git a/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/primitive/UTF8StringPointable.java b/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/primitive/UTF8StringPointable.java
index 944b317..3b1f18b 100644
--- a/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/primitive/UTF8StringPointable.java
+++ b/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/primitive/UTF8StringPointable.java
@@ -19,10 +19,8 @@
 package org.apache.hyracks.data.std.primitive;
 
 import java.io.IOException;
-import java.io.UnsupportedEncodingException;
-import java.nio.charset.StandardCharsets;
+import java.nio.charset.Charset;
 
-import org.apache.commons.lang3.CharSet;
 import org.apache.hyracks.api.dataflow.value.ITypeTraits;
 import org.apache.hyracks.api.exceptions.HyracksDataException;
 import org.apache.hyracks.api.io.IJsonSerializable;
@@ -38,6 +36,8 @@ import org.apache.hyracks.util.string.UTF8StringUtil;
 
 import com.fasterxml.jackson.databind.JsonNode;
 
+import it.unimi.dsi.fastutil.ints.IntCollection;
+
 public final class UTF8StringPointable extends AbstractPointable implements IHashable, IComparable {
 
     public static final UTF8StringPointableFactory FACTORY = new UTF8StringPointableFactory();
@@ -50,6 +50,9 @@ public final class UTF8StringPointable extends AbstractPointable implements IHas
     private int hashValue;
     private int stringLength;
 
+    public static final UTF8StringPointable SPACE_STRING_POINTABLE = generateUTF8Pointable(" ");
+    public static final Charset CESU8_CHARSET = Charset.forName("CESU8");
+
     /**
      * reset those meta length.
      * Since the {@code utf8Length} and the {@code metaLength} are often used, we compute those two values in advance.
@@ -122,6 +125,18 @@ public final class UTF8StringPointable extends AbstractPointable implements IHas
         return UTF8StringUtil.codePointSize(bytes, start + offset);
     }
 
+    public void getCodePoints(IntCollection codePointSet) {
+        int byteIdx = 0;
+        while (byteIdx < utf8Length) {
+            codePointSet.add(codePointAt(metaLength + byteIdx));
+            byteIdx += codePointSize(metaLength + byteIdx);
+        }
+
+        if (byteIdx != utf8Length) {
+            throw new IllegalArgumentException("Decoding error: malformed bytes");
+        }
+    }
+
     /**
      * Gets the length of the string in characters.
      * The first time call will need to go through the entire string, the following call will just return the pre-caculated result
@@ -176,11 +191,7 @@ public final class UTF8StringPointable extends AbstractPointable implements IHas
 
     @Override
     public String toString() {
-        try {
-            return new String(bytes, getCharStartOffset(), getUTF8Length(), StandardCharsets.UTF_8.name());
-        } catch (UnsupportedEncodingException e) {
-            throw new IllegalStateException(e);
-        }
+        return new String(bytes, getCharStartOffset(), getUTF8Length(), CESU8_CHARSET);
     }
 
     public int ignoreCaseCompareTo(UTF8StringPointable other) {
@@ -553,16 +564,11 @@ public final class UTF8StringPointable extends AbstractPointable implements IHas
         builder.finish();
     }
 
-    public void trim(UTF8StringBuilder builder, GrowableArray out, boolean left, boolean right, CharSet charSet)
-            throws IOException {
-        trim(this, builder, out, left, right, charSet);
-    }
-
     /**
      * Generates a trimmed string of an input source string.
      *
      * @param srcPtr
-     *            , the input source string.
+     *            , the input source string
      * @param builder
      *            , the result string builder.
      * @param out
@@ -571,23 +577,23 @@ public final class UTF8StringPointable extends AbstractPointable implements IHas
      *            , whether to trim the left side.
      * @param right
      *            , whether to trim the right side.
-     * @param charSet
-     *            , the chars that should be trimmed.
+     * @param codePointSet
+     *            , the set of code points that should be trimmed.
      * @throws IOException
      */
     public static void trim(UTF8StringPointable srcPtr, UTF8StringBuilder builder, GrowableArray out, boolean left,
-            boolean right, CharSet charSet) throws IOException {
+            boolean right, IntCollection codePointSet) throws IOException {
         final int srcUtfLen = srcPtr.getUTF8Length();
         final int srcStart = srcPtr.getMetaDataLength();
         // Finds the start Index (inclusive).
         int startIndex = 0;
         if (left) {
             while (startIndex < srcUtfLen) {
-                char ch = srcPtr.charAt(srcStart + startIndex);
-                if (!charSet.contains(ch)) {
+                int codepoint = srcPtr.codePointAt(srcStart + startIndex);
+                if (!codePointSet.contains(codepoint)) {
                     break;
                 }
-                startIndex += srcPtr.charSize(srcStart + startIndex);
+                startIndex += srcPtr.codePointSize(srcStart + startIndex);
             }
         }
 
@@ -597,9 +603,9 @@ public final class UTF8StringPointable extends AbstractPointable implements IHas
             endIndex = startIndex;
             int cursorIndex = startIndex;
             while (cursorIndex < srcUtfLen) {
-                char ch = srcPtr.charAt(srcStart + cursorIndex);
-                cursorIndex += srcPtr.charSize(srcStart + cursorIndex);
-                if (!charSet.contains(ch)) {
+                int codePioint = srcPtr.codePointAt(srcStart + cursorIndex);
+                cursorIndex += srcPtr.codePointSize(srcStart + cursorIndex);
+                if (!codePointSet.contains(codePioint)) {
                     endIndex = cursorIndex;
                 }
             }
@@ -613,6 +619,26 @@ public final class UTF8StringPointable extends AbstractPointable implements IHas
     }
 
     /**
+     * Generates a trimmed string from the original string.
+     *
+     * @param builder
+     *            , the result string builder.
+     * @param out
+     *            , the storage for the output string.
+     * @param left
+     *            , whether to trim the left side.
+     * @param right
+     *            , whether to trim the right side.
+     * @param codePointSet
+     *            , the set of code points that should be trimmed.
+     * @throws IOException
+     */
+    public void trim(UTF8StringBuilder builder, GrowableArray out, boolean left, boolean right,
+            IntCollection codePointSet) throws IOException {
+        trim(this, builder, out, left, right, codePointSet);
+    }
+
+    /**
      * Generates a reversed string from an input source string
      *
      * @param srcPtr
diff --git a/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/test/java/org/apache/hyracks/data/std/primitive/UTF8StringPointableTest.java b/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/test/java/org/apache/hyracks/data/std/primitive/UTF8StringPointableTest.java
index fa93003..22be7ca 100644
--- a/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/test/java/org/apache/hyracks/data/std/primitive/UTF8StringPointableTest.java
+++ b/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/test/java/org/apache/hyracks/data/std/primitive/UTF8StringPointableTest.java
@@ -27,14 +27,17 @@ import static org.junit.Assert.assertFalse;
 import static org.junit.Assert.assertTrue;
 
 import java.io.IOException;
+import java.util.Arrays;
 
-import org.apache.commons.lang3.CharSet;
 import org.apache.hyracks.data.std.util.GrowableArray;
 import org.apache.hyracks.data.std.util.UTF8StringBuilder;
 import org.apache.hyracks.util.string.UTF8StringSample;
 import org.apache.hyracks.util.string.UTF8StringUtil;
 import org.junit.Test;
 
+import it.unimi.dsi.fastutil.ints.IntArraySet;
+import it.unimi.dsi.fastutil.ints.IntCollection;
+
 public class UTF8StringPointableTest {
     public static UTF8StringPointable STRING_EMPTY = generateUTF8Pointable(UTF8StringSample.EMPTY_STRING);
     public static UTF8StringPointable STRING_UTF8_MIX = generateUTF8Pointable(UTF8StringSample.STRING_UTF8_MIX);
@@ -229,26 +232,35 @@ public class UTF8StringPointableTest {
         GrowableArray storage = new GrowableArray();
         UTF8StringPointable result = new UTF8StringPointable();
         UTF8StringPointable input = generateUTF8Pointable("  this is it.i am;here.  ");
+        IntCollection spaceCodePointSet = new IntArraySet(Arrays.asList((int) ' '));
 
         // Trims both sides.
-        input.trim(builder, storage, true, true, CharSet.getInstance(" "));
+        input.trim(builder, storage, true, true, spaceCodePointSet);
         result.set(storage.getByteArray(), 0, storage.getLength());
         UTF8StringPointable expected = generateUTF8Pointable("this is it.i am;here.");
         assertEquals(0, expected.compareTo(result));
 
         // Only trims the right side.
         storage.reset();
-        input.trim(builder, storage, false, true, CharSet.getInstance(" "));
+        input.trim(builder, storage, false, true, spaceCodePointSet);
         result.set(storage.getByteArray(), 0, storage.getLength());
         expected = generateUTF8Pointable("  this is it.i am;here.");
         assertEquals(0, expected.compareTo(result));
 
         // Only trims the left side.
         storage.reset();
-        input.trim(builder, storage, true, false, CharSet.getInstance(" "));
+        input.trim(builder, storage, true, false, spaceCodePointSet);
         result.set(storage.getByteArray(), 0, storage.getLength());
         expected = generateUTF8Pointable("this is it.i am;here.  ");
         assertEquals(0, expected.compareTo(result));
+
+        // Only trims the left side in case of emoji
+        input = STRING_POINTABLE_EMOJI_FAMILY_OF_4;
+        storage.reset();
+        input.trim(builder, storage, true, false, spaceCodePointSet);
+        result.set(storage.getByteArray(), 0, storage.getLength());
+        expected = STRING_POINTABLE_EMOJI_FAMILY_OF_4;
+        assertEquals(0, expected.compareTo(result));
     }
 
     @Test
@@ -258,25 +270,59 @@ public class UTF8StringPointableTest {
         UTF8StringPointable result = new UTF8StringPointable();
         UTF8StringPointable input = generateUTF8Pointable("  this is it.i am;here.  ");
 
+        String pattern = " hert.";
+        UTF8StringPointable patternPointable = generateUTF8Pointable(pattern);
+        IntCollection codePointSet = new IntArraySet();
+        codePointSet.clear();
+        patternPointable.getCodePoints(codePointSet);
+
         // Trims both sides.
-        input.trim(builder, storage, true, true, CharSet.getInstance(" hert."));
+        input.trim(builder, storage, true, true, codePointSet);
         result.set(storage.getByteArray(), 0, storage.getLength());
         UTF8StringPointable expected = generateUTF8Pointable("is is it.i am;");
         assertEquals(0, expected.compareTo(result));
 
         // Only trims the right side.
         storage.reset();
-        input.trim(builder, storage, false, true, CharSet.getInstance(" hert."));
+        input.trim(builder, storage, false, true, codePointSet);
         result.set(storage.getByteArray(), 0, storage.getLength());
         expected = generateUTF8Pointable("  this is it.i am;");
         assertEquals(0, expected.compareTo(result));
 
         // Only trims the left side.
         storage.reset();
-        input.trim(builder, storage, true, false, CharSet.getInstance(" hert."));
+        input.trim(builder, storage, true, false, codePointSet);
         result.set(storage.getByteArray(), 0, storage.getLength());
         expected = generateUTF8Pointable("is is it.i am;here.  ");
         assertEquals(0, expected.compareTo(result));
+
+        // Test Emoji trim
+        input = STRING_POINTABLE_EMOJI_FAMILY_OF_4;
+        pattern = "👨👦";
+        patternPointable = generateUTF8Pointable(pattern);
+        codePointSet.clear();
+        patternPointable.getCodePoints(codePointSet);
+
+        // Trim left
+        storage.reset();
+        input.trim(builder, storage, true, false, codePointSet);
+        result.set(storage.getByteArray(), 0, storage.getLength());
+        expected = generateUTF8Pointable("\u200D" + "👨‍👦‍👦");
+        assertEquals(0, expected.compareTo(result));
+
+        // Trim right
+        storage.reset();
+        input.trim(builder, storage, false, true, codePointSet);
+        result.set(storage.getByteArray(), 0, storage.getLength());
+        expected = generateUTF8Pointable("👨‍👨‍👦" + "\u200D");
+        assertEquals(0, expected.compareTo(result));
+
+        // Trim left and right
+        storage.reset();
+        input.trim(builder, storage, true, true, codePointSet);
+        result.set(storage.getByteArray(), 0, storage.getLength());
+        expected = generateUTF8Pointable("\u200D" + "👨‍👦" + "\u200D");
+        assertEquals(0, expected.compareTo(result));
     }
 
 }
diff --git a/hyracks-fullstack/pom.xml b/hyracks-fullstack/pom.xml
index fb801b9..d7b6829 100644
--- a/hyracks-fullstack/pom.xml
+++ b/hyracks-fullstack/pom.xml
@@ -283,6 +283,11 @@
         <artifactId>maven-plugin-api</artifactId>
         <version>3.6.3</version>
       </dependency>
+      <dependency>
+        <groupId>it.unimi.dsi</groupId>
+        <artifactId>fastutil</artifactId>
+        <version>8.3.0</version>
+      </dependency>
     </dependencies>
   </dependencyManagement>
   <build>