You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@asterixdb.apache.org by ji...@apache.org on 2015/10/29 01:25:09 UTC

[7/7] incubator-asterixdb-hyracks git commit: ASTERIXDB-1102: VarSize Encoding to store length of String and ByteArray

ASTERIXDB-1102: VarSize Encoding to store length of String and ByteArray

This patch changes the encoding format that stores the length value of
variable-length types (e.g. String, ByteArray) from a fixed-size encoding
(2 bytes) to a variable-size encoding (1 to 5 bytes).

It resolves issue 1102 by enabling us to store a String that is longer
than 64K. Also, for the common case of storing a short string (<= 127),
it saves one byte per string.

Some important changes include:
1. Add a hyracks-util package to consolidate all the Hyracks-independent
utility functions. This reduces the chance of having duplicate utils in
different packages.
2. Move parts of the Asterix string functions down into the Hyracks
UTF8StringPointable object, which will benefit other dependencies,
such as VXQuery.

Change-Id: I7e95df0f06984b784ebac2c84b97e56a50207d27
Reviewed-on: https://asterix-gerrit.ics.uci.edu/449
Tested-by: Jenkins <je...@fulliautomatix.ics.uci.edu>
Reviewed-by: Taewoo Kim <wa...@gmail.com>
Reviewed-by: Jianfeng Jia <ji...@gmail.com>


Project: http://git-wip-us.apache.org/repos/asf/incubator-asterixdb-hyracks/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-asterixdb-hyracks/commit/26c3b536
Tree: http://git-wip-us.apache.org/repos/asf/incubator-asterixdb-hyracks/tree/26c3b536
Diff: http://git-wip-us.apache.org/repos/asf/incubator-asterixdb-hyracks/diff/26c3b536

Branch: refs/heads/master
Commit: 26c3b5361db6b2a3816ec6818e1be4d0b1ce1b3d
Parents: 492b6fe
Author: JavierJia <ji...@gmail.com>
Authored: Fri Oct 23 13:49:32 2015 -0700
Committer: Jianfeng Jia <ji...@gmail.com>
Committed: Wed Oct 28 17:20:56 2015 -0700

----------------------------------------------------------------------
 .../data/impl/UTF8StringPrinterFactory.java     |  22 +-
 .../algebricks/data/utils/WriteValueTools.java  |  53 +--
 .../algebricks-examples/piglet-example/pom.xml  |   5 +
 .../compiler/PigletPrinterFactoryProvider.java  |   4 +-
 .../piglet/metadata/PigletMetadataProvider.java |   2 +-
 .../piglet/runtime/PigletExpressionJobGen.java  |   4 +-
 algebricks/algebricks-examples/pom.xml          |   9 +-
 .../tests/pushruntime/PushRuntimeTest.java      |  46 +-
 .../algebricks/tests/tools/WriteValueTest.java  |   2 +-
 hyracks/hyracks-data/hyracks-data-std/pom.xml   |  46 +-
 .../UTF8StringBinaryHashFunctionFamily.java     |  14 +-
 .../hyracks/data/std/api/AbstractPointable.java |   8 +
 .../data/std/primitive/ByteArrayPointable.java  | 135 ++++--
 .../std/primitive/RawUTF8StringPointable.java   |  37 +-
 .../primitive/UTF8StringLowercasePointable.java |  73 +++
 .../data/std/primitive/UTF8StringPointable.java | 449 ++++++++++++++-----
 .../data/std/primitive/UTF8StringWriter.java    |  79 ----
 .../std/util/AbstractVarLenObjectBuilder.java   |  96 ++++
 .../util/ByteArrayAccessibleOutputStream.java   |  25 +-
 .../hyracks/data/std/util/ByteArrayBuilder.java |  33 ++
 .../hyracks/data/std/util/GrowableArray.java    |  21 +-
 .../hyracks/data/std/util/ICharIterator.java    |  27 ++
 .../std/util/RewindableDataOutputStream.java    |  56 +++
 .../hyracks/data/std/util/UTF8CharSequence.java |  71 +++
 .../data/std/util/UTF8StringBuilder.java        |  43 ++
 .../std/util/UTF8StringCharacterIterator.java   |  50 +++
 .../std/primitive/ByteArrayPointableTest.java   |  48 +-
 .../std/primitive/UTF8StringPointableTest.java  | 193 ++++++++
 .../data/std/util/UTF8StringBuilderTest.java    |  81 ++++
 .../util/UTF8StringCharacterIteratorTest.java   |  58 +++
 hyracks/hyracks-dataflow-common/pom.xml         |  95 ++--
 .../ByteArraySerializerDeserializer.java        |  56 ++-
 .../UTF8StringSerializerDeserializer.java       |  12 +-
 .../ByteArrayNormalizedKeyComputerFactory.java  |  21 +-
 .../UTF8StringNormalizedKeyComputerFactory.java |  14 +-
 .../parsers/ByteArrayBase64ParserFactory.java   | 217 +--------
 .../data/parsers/ByteArrayHexParserFactory.java | 102 +----
 .../data/parsers/UTF8StringParserFactory.java   |  46 +-
 .../dataflow/common/data/util/StringUtils.java  |  47 --
 .../FrameFixedFieldTupleAppenderTest.java       |   4 +-
 .../ByteArraySerializerDeserializerTest.java    |  56 +--
 ...teArrayNormalizedKeyComputerFactoryTest.java |  42 +-
 .../ByteArrayBase64ParserFactoryTest.java       |  58 ++-
 .../parsers/ByteArrayHexParserFactoryTest.java  |  60 ++-
 .../MinMaxStringFieldAggregatorFactory.java     |   7 +-
 .../VariableTupleMemoryManagerTest.java         |   2 +-
 .../util/DeletableFrameTupleAppenderTest.java   |  13 +-
 .../btree/client/InsertPipelineExample.java     |   6 +-
 .../client/PrimaryIndexBulkLoadExample.java     |   6 +-
 .../btree/client/PrimaryIndexSearchExample.java |   8 +-
 .../client/SecondaryIndexBulkLoadExample.java   |   4 +-
 .../client/SecondaryIndexSearchExample.java     |  14 +-
 .../am/btree/AbstractBTreeOperatorTest.java     |  34 +-
 .../BTreePrimaryIndexScanOperatorTest.java      |   6 +-
 .../BTreePrimaryIndexSearchOperatorTest.java    |   8 +-
 .../BTreeSecondaryIndexInsertOperatorTest.java  |   8 +-
 .../BTreeSecondaryIndexSearchOperatorTest.java  |   8 +-
 .../BTreeSecondaryIndexUpsertOperatorTest.java  |   8 +-
 .../am/rtree/AbstractRTreeOperatorTest.java     |  34 +-
 .../comm/SerializationDeserializationTest.java  |   2 +-
 .../tests/integration/AggregationTest.java      |  58 +--
 .../tests/integration/CountOfCountsTest.java    |  12 +-
 .../integration/LocalityAwareConnectorTest.java |  14 +-
 .../integration/OptimizedSortMergeTest.java     |  20 +-
 .../tests/integration/ScanPrintTest.java        |  20 +-
 .../tests/integration/SortMergeTest.java        |  20 +-
 .../tests/integration/SplitOperatorTest.java    |   2 +-
 ...TPCHCustomerOptimizedHybridHashJoinTest.java | 108 ++---
 .../TPCHCustomerOrderHashJoinTest.java          | 396 ++++++++--------
 .../TPCHCustomerOrderNestedLoopJoinTest.java    | 144 +++---
 .../hyracks/tests/integration/UnionTest.java    |   2 +-
 .../integration/VSizeFrameSortMergeTest.java    |  10 +-
 .../tests/unit/AbstractRunGeneratorTest.java    |   8 +-
 .../tests/unit/RunMergingFrameReaderTest.java   |   6 +-
 .../text/client/ExternalGroupClient.java        |   8 +-
 .../examples/text/client/WordCountMain.java     |   4 +-
 .../hyracks/examples/tpch/client/Common.java    |  36 +-
 .../hyracks/examples/tpch/client/Join.java      |   2 +-
 .../hyracks/hdfs/dataflow/DataflowTest.java     |   2 +-
 .../hyracks/hdfs2/dataflow/DataflowTest.java    |   2 +-
 hyracks/hyracks-storage-am-common/pom.xml       |   5 +
 .../TreeIndexStatsOperatorDescriptor.java       |   2 +-
 .../TreeIndexStatsOperatorNodePushable.java     |   3 +-
 .../common/tuples/TypeAwareTupleReference.java  |   5 +-
 .../am/common/tuples/TypeAwareTupleWriter.java  |  15 +-
 .../common/tuples/VarLenIntEncoderDecoder.java  |  92 ----
 .../pom.xml                                     |  73 +--
 .../search/AbstractTOccurrenceSearcher.java     |   2 +-
 .../AbstractUTF8StringBinaryTokenizer.java      |  45 +-
 .../tokenizers/AbstractUTF8Token.java           |  78 +++-
 .../DelimitedUTF8StringBinaryTokenizer.java     |  61 ++-
 .../tokenizers/HashedUTF8NGramToken.java        |   8 +-
 .../tokenizers/HashedUTF8WordToken.java         |  18 +-
 .../am/lsm/invertedindex/tokenizers/IToken.java |  24 +-
 .../NGramUTF8StringBinaryTokenizer.java         |  30 +-
 .../tokenizers/UTF8NGramToken.java              |  38 +-
 .../invertedindex/tokenizers/UTF8WordToken.java |  22 +-
 .../rtree/tuples/RTreeTypeAwareTupleWriter.java |   5 +-
 .../am/btree/OrderedIndexExamplesTest.java      |  16 +-
 .../am/btree/OrderedIndexMultiThreadTest.java   |   2 +-
 .../am/btree/OrderedIndexTestDriver.java        |  14 +-
 .../am/rtree/AbstractRTreeExamplesTest.java     |   4 +-
 .../storage/am/bloomfilter/BloomFilterTest.java |   6 +-
 .../MurmurHashForITupleReferenceTest.java       |   6 +-
 .../am/lsm/btree/tuples/LSMBTreeTuplesTest.java |  10 +-
 .../pom.xml                                     |  75 ++--
 .../tokenizers/NGramTokenizerTest.java          |  17 +-
 .../tokenizers/WordTokenizerTest.java           |  27 +-
 .../util/LSMInvertedIndexTestUtils.java         |   8 +-
 hyracks/hyracks-util/pom.xml                    |  58 +++
 .../apache/hyracks/util/bytes/Base64Parser.java | 250 +++++++++++
 .../hyracks/util/bytes/Base64Printer.java       | 125 ++++++
 .../apache/hyracks/util/bytes/HexParser.java    |  97 ++++
 .../apache/hyracks/util/bytes/HexPrinter.java   |  48 ++
 .../util/encoding/VarLenIntEncoderDecoder.java  | 145 ++++++
 .../hyracks/util/string/UTF8StringReader.java   | 131 ++++++
 .../hyracks/util/string/UTF8StringUtil.java     | 422 +++++++++++++++++
 .../hyracks/util/string/UTF8StringWriter.java   | 113 +++++
 .../encoding/VarLenIntEncoderDecoderTest.java   |  87 ++++
 .../util/string/UTF8StringReaderWriterTest.java |  90 ++++
 .../hyracks/util/string/UTF8StringSample.java   |  56 +++
 .../hyracks/util/string/UTF8StringUtilTest.java | 144 ++++++
 hyracks/pom.xml                                 |   1 +
 pom.xml                                         | 158 +++----
 124 files changed, 4310 insertions(+), 1988 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-asterixdb-hyracks/blob/26c3b536/algebricks/algebricks-data/src/main/java/org/apache/hyracks/algebricks/data/impl/UTF8StringPrinterFactory.java
----------------------------------------------------------------------
diff --git a/algebricks/algebricks-data/src/main/java/org/apache/hyracks/algebricks/data/impl/UTF8StringPrinterFactory.java b/algebricks/algebricks-data/src/main/java/org/apache/hyracks/algebricks/data/impl/UTF8StringPrinterFactory.java
index 8aa646e..1aa3370 100644
--- a/algebricks/algebricks-data/src/main/java/org/apache/hyracks/algebricks/data/impl/UTF8StringPrinterFactory.java
+++ b/algebricks/algebricks-data/src/main/java/org/apache/hyracks/algebricks/data/impl/UTF8StringPrinterFactory.java
@@ -18,12 +18,13 @@
  */
 package org.apache.hyracks.algebricks.data.impl;
 
+import java.io.IOException;
 import java.io.PrintStream;
 
 import org.apache.hyracks.algebricks.common.exceptions.AlgebricksException;
 import org.apache.hyracks.algebricks.data.IPrinter;
 import org.apache.hyracks.algebricks.data.IPrinterFactory;
-import org.apache.hyracks.data.std.primitive.UTF8StringPointable;
+import org.apache.hyracks.util.string.UTF8StringUtil;
 
 public class UTF8StringPrinterFactory implements IPrinterFactory {
 
@@ -40,22 +41,11 @@ public class UTF8StringPrinterFactory implements IPrinterFactory {
 
             @Override
             public void print(byte[] b, int s, int l, PrintStream ps) throws AlgebricksException {
-                int strlen = UTF8StringPointable.getUTFLength(b, s);
-                int pos = s + 2;
-                int maxPos = pos + strlen;
-                ps.print("\"");
-                while (pos < maxPos) {
-                    char c = UTF8StringPointable.charAt(b, pos);
-                    switch (c) {
-                        case '\\':
-                        case '"':
-                            ps.print('\\');
-                            break;
-                    }
-                    ps.print(c);
-                    pos += UTF8StringPointable.charSize(b, pos);
+                try {
+                    UTF8StringUtil.printUTF8StringWithQuotes(b, s, l, ps);
+                } catch (IOException e) {
+                    throw new AlgebricksException(e);
                 }
-                ps.print("\"");
             }
 
             @Override

http://git-wip-us.apache.org/repos/asf/incubator-asterixdb-hyracks/blob/26c3b536/algebricks/algebricks-data/src/main/java/org/apache/hyracks/algebricks/data/utils/WriteValueTools.java
----------------------------------------------------------------------
diff --git a/algebricks/algebricks-data/src/main/java/org/apache/hyracks/algebricks/data/utils/WriteValueTools.java b/algebricks/algebricks-data/src/main/java/org/apache/hyracks/algebricks/data/utils/WriteValueTools.java
index 8a96ea6..97e7d95 100644
--- a/algebricks/algebricks-data/src/main/java/org/apache/hyracks/algebricks/data/utils/WriteValueTools.java
+++ b/algebricks/algebricks-data/src/main/java/org/apache/hyracks/algebricks/data/utils/WriteValueTools.java
@@ -20,14 +20,16 @@ package org.apache.hyracks.algebricks.data.utils;
 
 import java.io.IOException;
 import java.io.OutputStream;
+import java.io.PrintStream;
 
-import org.apache.hyracks.data.std.primitive.UTF8StringPointable;
+import org.apache.hyracks.util.string.UTF8StringUtil;
 
 public final class WriteValueTools {
 
     private final static int[] INT_INTERVALS = { 9, 99, 999, 9999, 99999, 999999, 9999999, 99999999, 999999999,
             Integer.MAX_VALUE };
-    private final static int[] INT_DIVIDERS = { 10, 100, 1000, 10000, 100000, 1000000, 10000000, 100000000, 1000000000 };
+    private final static int[] INT_DIVIDERS = { 10, 100, 1000, 10000, 100000, 1000000, 10000000, 100000000,
+            1000000000 };
     private final static int[] DIGITS = { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9' };
 
     public static void writeInt(int i, OutputStream os) throws IOException {
@@ -75,50 +77,11 @@ public final class WriteValueTools {
         os.write(DIGITS[(int) (d % 10)]);
     }
 
-    public static void writeUTF8String(byte[] b, int s, int l, OutputStream os) throws IOException {
-        int stringLength = UTF8StringPointable.getUTFLength(b, s);
-        int position = s + 2;
-        int maxPosition = position + stringLength;
-        os.write('\"');
-        while (position < maxPosition) {
-            char c = UTF8StringPointable.charAt(b, position);
-            switch (c) {
-            // escape
-                case '\\':
-                case '"':
-                    os.write('\\');
-                    break;
-            }
-            int sz = UTF8StringPointable.charSize(b, position);
-            while (sz > 0) {
-                os.write(b[position]);
-                position++;
-                sz--;
-            }
-        }
-        os.write('\"');
+    public static void writeUTF8StringWithQuotes(String string, OutputStream ps) throws IOException {
+        UTF8StringUtil.printUTF8StringWithQuotes(string, ps);
     }
 
-    public static void writeUTF8StringNoQuotes(byte[] b, int s, int l, OutputStream os) throws IOException {
-        int stringLength = UTF8StringPointable.getUTFLength(b, s);
-        int position = s + 2;
-        int maxPosition = position + stringLength;
-        while (position < maxPosition) {
-            char c = UTF8StringPointable.charAt(b, position);
-            switch (c) {
-            // escape
-                case '\\':
-                case '"':
-                    os.write('\\');
-                    break;
-            }
-            int sz = UTF8StringPointable.charSize(b, position);
-            while (sz > 0) {
-                os.write(b[position]);
-                position++;
-                sz--;
-            }
-        }
+    public static void writeUTF8StringNoQuotes(String string, OutputStream ps) throws IOException {
+        UTF8StringUtil.printUTF8StringNoQuotes(string, ps);
     }
-
 }

http://git-wip-us.apache.org/repos/asf/incubator-asterixdb-hyracks/blob/26c3b536/algebricks/algebricks-examples/piglet-example/pom.xml
----------------------------------------------------------------------
diff --git a/algebricks/algebricks-examples/piglet-example/pom.xml b/algebricks/algebricks-examples/piglet-example/pom.xml
index a037db5..ae2ec51 100644
--- a/algebricks/algebricks-examples/piglet-example/pom.xml
+++ b/algebricks/algebricks-examples/piglet-example/pom.xml
@@ -111,5 +111,10 @@
       <artifactId>algebricks-compiler</artifactId>
       <version>0.2.17-SNAPSHOT</version>
     </dependency>
+      <dependency>
+          <groupId>org.apache.hyracks</groupId>
+          <artifactId>hyracks-util</artifactId>
+          <version>0.2.17-SNAPSHOT</version>
+      </dependency>
   </dependencies>
 </project>

http://git-wip-us.apache.org/repos/asf/incubator-asterixdb-hyracks/blob/26c3b536/algebricks/algebricks-examples/piglet-example/src/main/java/org/apache/hyracks/algebricks/examples/piglet/compiler/PigletPrinterFactoryProvider.java
----------------------------------------------------------------------
diff --git a/algebricks/algebricks-examples/piglet-example/src/main/java/org/apache/hyracks/algebricks/examples/piglet/compiler/PigletPrinterFactoryProvider.java b/algebricks/algebricks-examples/piglet-example/src/main/java/org/apache/hyracks/algebricks/examples/piglet/compiler/PigletPrinterFactoryProvider.java
index 6d64741..8049594 100644
--- a/algebricks/algebricks-examples/piglet-example/src/main/java/org/apache/hyracks/algebricks/examples/piglet/compiler/PigletPrinterFactoryProvider.java
+++ b/algebricks/algebricks-examples/piglet-example/src/main/java/org/apache/hyracks/algebricks/examples/piglet/compiler/PigletPrinterFactoryProvider.java
@@ -29,7 +29,9 @@ import org.apache.hyracks.algebricks.data.impl.IntegerPrinterFactory;
 import org.apache.hyracks.algebricks.data.utils.WriteValueTools;
 import org.apache.hyracks.algebricks.examples.piglet.types.Type;
 import org.apache.hyracks.data.std.primitive.FloatPointable;
+import org.apache.hyracks.data.std.primitive.UTF8StringPointable;
 import org.apache.hyracks.dataflow.common.data.marshalling.FloatSerializerDeserializer;
+import org.apache.hyracks.util.string.UTF8StringUtil;
 
 public class PigletPrinterFactoryProvider implements IPrinterFactoryProvider {
 
@@ -73,7 +75,7 @@ public class PigletPrinterFactoryProvider implements IPrinterFactoryProvider {
                 @Override
                 public void print(byte[] b, int s, int l, PrintStream ps) throws AlgebricksException {
                     try {
-                        WriteValueTools.writeUTF8String(b, s, l, ps);
+                        UTF8StringUtil.printUTF8StringWithQuotes(b, s, l, ps);
                     } catch (IOException e) {
                         throw new AlgebricksException(e);
                     }

http://git-wip-us.apache.org/repos/asf/incubator-asterixdb-hyracks/blob/26c3b536/algebricks/algebricks-examples/piglet-example/src/main/java/org/apache/hyracks/algebricks/examples/piglet/metadata/PigletMetadataProvider.java
----------------------------------------------------------------------
diff --git a/algebricks/algebricks-examples/piglet-example/src/main/java/org/apache/hyracks/algebricks/examples/piglet/metadata/PigletMetadataProvider.java b/algebricks/algebricks-examples/piglet-example/src/main/java/org/apache/hyracks/algebricks/examples/piglet/metadata/PigletMetadataProvider.java
index 7d9b3db..8f9ab9f 100644
--- a/algebricks/algebricks-examples/piglet-example/src/main/java/org/apache/hyracks/algebricks/examples/piglet/metadata/PigletMetadataProvider.java
+++ b/algebricks/algebricks-examples/piglet-example/src/main/java/org/apache/hyracks/algebricks/examples/piglet/metadata/PigletMetadataProvider.java
@@ -110,7 +110,7 @@ public class PigletMetadataProvider implements IMetadataProvider<String, String>
 
                 case CHAR_ARRAY:
                     vpf = UTF8StringParserFactory.INSTANCE;
-                    serDeser = UTF8StringSerializerDeserializer.INSTANCE;
+                    serDeser = new UTF8StringSerializerDeserializer();
                     break;
 
                 case FLOAT:

http://git-wip-us.apache.org/repos/asf/incubator-asterixdb-hyracks/blob/26c3b536/algebricks/algebricks-examples/piglet-example/src/main/java/org/apache/hyracks/algebricks/examples/piglet/runtime/PigletExpressionJobGen.java
----------------------------------------------------------------------
diff --git a/algebricks/algebricks-examples/piglet-example/src/main/java/org/apache/hyracks/algebricks/examples/piglet/runtime/PigletExpressionJobGen.java b/algebricks/algebricks-examples/piglet-example/src/main/java/org/apache/hyracks/algebricks/examples/piglet/runtime/PigletExpressionJobGen.java
index 6c173b2..1c3f9b8 100644
--- a/algebricks/algebricks-examples/piglet-example/src/main/java/org/apache/hyracks/algebricks/examples/piglet/runtime/PigletExpressionJobGen.java
+++ b/algebricks/algebricks-examples/piglet-example/src/main/java/org/apache/hyracks/algebricks/examples/piglet/runtime/PigletExpressionJobGen.java
@@ -53,6 +53,8 @@ import org.apache.hyracks.dataflow.common.data.marshalling.IntegerSerializerDese
 import org.apache.hyracks.dataflow.common.data.marshalling.UTF8StringSerializerDeserializer;
 
 public class PigletExpressionJobGen implements ILogicalExpressionJobGen {
+    private final UTF8StringSerializerDeserializer utf8SerDer = new UTF8StringSerializerDeserializer();
+
     @Override
     public ICopyEvaluatorFactory createEvaluatorFactory(ILogicalExpression expr, IVariableTypeEnvironment env,
             IOperatorSchema[] inputSchemas, JobGenContext context) throws AlgebricksException {
@@ -74,7 +76,7 @@ public class PigletExpressionJobGen implements ILogicalExpressionJobGen {
 
                     case CHAR_ARRAY:
                         try {
-                            UTF8StringSerializerDeserializer.INSTANCE.serialize(image, dos);
+                            utf8SerDer.serialize(image, dos);
                         } catch (Exception e) {
                             throw new AlgebricksException(e);
                         }

http://git-wip-us.apache.org/repos/asf/incubator-asterixdb-hyracks/blob/26c3b536/algebricks/algebricks-examples/pom.xml
----------------------------------------------------------------------
diff --git a/algebricks/algebricks-examples/pom.xml b/algebricks/algebricks-examples/pom.xml
index 7ba1b5b..968db33 100644
--- a/algebricks/algebricks-examples/pom.xml
+++ b/algebricks/algebricks-examples/pom.xml
@@ -22,8 +22,15 @@
   <artifactId>algebricks-examples</artifactId>
   <packaging>pom</packaging>
   <name>algebricks-examples</name>
+    <dependencies>
+        <dependency>
+            <groupId>org.apache.hyracks</groupId>
+            <artifactId>algebricks-core</artifactId>
+            <version>0.2.17-SNAPSHOT</version>
+        </dependency>
+    </dependencies>
 
-  <parent>
+    <parent>
     <groupId>org.apache.hyracks</groupId>
     <artifactId>algebricks</artifactId>
     <version>0.2.17-SNAPSHOT</version>

http://git-wip-us.apache.org/repos/asf/incubator-asterixdb-hyracks/blob/26c3b536/algebricks/algebricks-tests/src/test/java/org/apache/hyracks/algebricks/tests/pushruntime/PushRuntimeTest.java
----------------------------------------------------------------------
diff --git a/algebricks/algebricks-tests/src/test/java/org/apache/hyracks/algebricks/tests/pushruntime/PushRuntimeTest.java b/algebricks/algebricks-tests/src/test/java/org/apache/hyracks/algebricks/tests/pushruntime/PushRuntimeTest.java
index 3c97878..7fcab17 100644
--- a/algebricks/algebricks-tests/src/test/java/org/apache/hyracks/algebricks/tests/pushruntime/PushRuntimeTest.java
+++ b/algebricks/algebricks-tests/src/test/java/org/apache/hyracks/algebricks/tests/pushruntime/PushRuntimeTest.java
@@ -275,10 +275,10 @@ public class PushRuntimeTest {
         IFileSplitProvider splitProvider = new ConstantFileSplitProvider(fileSplits);
 
         RecordDescriptor scannerDesc = new RecordDescriptor(new ISerializerDeserializer[] {
-                IntegerSerializerDeserializer.INSTANCE, UTF8StringSerializerDeserializer.INSTANCE,
-                UTF8StringSerializerDeserializer.INSTANCE, IntegerSerializerDeserializer.INSTANCE,
-                UTF8StringSerializerDeserializer.INSTANCE, FloatSerializerDeserializer.INSTANCE,
-                UTF8StringSerializerDeserializer.INSTANCE, UTF8StringSerializerDeserializer.INSTANCE });
+                IntegerSerializerDeserializer.INSTANCE, new UTF8StringSerializerDeserializer(),
+                new UTF8StringSerializerDeserializer(), IntegerSerializerDeserializer.INSTANCE,
+                new UTF8StringSerializerDeserializer(), FloatSerializerDeserializer.INSTANCE,
+                new UTF8StringSerializerDeserializer(), new UTF8StringSerializerDeserializer() });
         IValueParserFactory[] valueParsers = new IValueParserFactory[] { IntegerParserFactory.INSTANCE,
                 UTF8StringParserFactory.INSTANCE, UTF8StringParserFactory.INSTANCE, IntegerParserFactory.INSTANCE,
                 UTF8StringParserFactory.INSTANCE, FloatParserFactory.INSTANCE, UTF8StringParserFactory.INSTANCE,
@@ -355,10 +355,10 @@ public class PushRuntimeTest {
                 "data/tpch0.001/customer-part1.tbl")));
         IFileSplitProvider splitProvider = new ConstantFileSplitProvider(fileSplits);
         RecordDescriptor scannerDesc = new RecordDescriptor(new ISerializerDeserializer[] {
-                IntegerSerializerDeserializer.INSTANCE, UTF8StringSerializerDeserializer.INSTANCE,
-                UTF8StringSerializerDeserializer.INSTANCE, IntegerSerializerDeserializer.INSTANCE,
-                UTF8StringSerializerDeserializer.INSTANCE, FloatSerializerDeserializer.INSTANCE,
-                UTF8StringSerializerDeserializer.INSTANCE, UTF8StringSerializerDeserializer.INSTANCE });
+                IntegerSerializerDeserializer.INSTANCE, new UTF8StringSerializerDeserializer(),
+                new UTF8StringSerializerDeserializer(), IntegerSerializerDeserializer.INSTANCE,
+                new UTF8StringSerializerDeserializer(), FloatSerializerDeserializer.INSTANCE,
+                new UTF8StringSerializerDeserializer(), new UTF8StringSerializerDeserializer() });
         IValueParserFactory[] valueParsers = new IValueParserFactory[] { IntegerParserFactory.INSTANCE,
                 UTF8StringParserFactory.INSTANCE, UTF8StringParserFactory.INSTANCE, IntegerParserFactory.INSTANCE,
                 UTF8StringParserFactory.INSTANCE, FloatParserFactory.INSTANCE, UTF8StringParserFactory.INSTANCE,
@@ -407,10 +407,10 @@ public class PushRuntimeTest {
                 "data/tpch0.001/customer.tbl")));
         IFileSplitProvider splitProvider = new ConstantFileSplitProvider(fileSplits);
         RecordDescriptor scannerDesc = new RecordDescriptor(new ISerializerDeserializer[] {
-                IntegerSerializerDeserializer.INSTANCE, UTF8StringSerializerDeserializer.INSTANCE,
-                UTF8StringSerializerDeserializer.INSTANCE, IntegerSerializerDeserializer.INSTANCE,
-                UTF8StringSerializerDeserializer.INSTANCE, FloatSerializerDeserializer.INSTANCE,
-                UTF8StringSerializerDeserializer.INSTANCE, UTF8StringSerializerDeserializer.INSTANCE });
+                IntegerSerializerDeserializer.INSTANCE, new UTF8StringSerializerDeserializer(),
+                new UTF8StringSerializerDeserializer(), IntegerSerializerDeserializer.INSTANCE,
+                new UTF8StringSerializerDeserializer(), FloatSerializerDeserializer.INSTANCE,
+                new UTF8StringSerializerDeserializer(), new UTF8StringSerializerDeserializer() });
         IValueParserFactory[] valueParsers = new IValueParserFactory[] { IntegerParserFactory.INSTANCE,
                 UTF8StringParserFactory.INSTANCE, UTF8StringParserFactory.INSTANCE, IntegerParserFactory.INSTANCE,
                 UTF8StringParserFactory.INSTANCE, FloatParserFactory.INSTANCE, UTF8StringParserFactory.INSTANCE,
@@ -492,10 +492,10 @@ public class PushRuntimeTest {
                 "data/tpch0.001/customer.tbl")));
         IFileSplitProvider splitProvider = new ConstantFileSplitProvider(fileSplits);
         RecordDescriptor scannerDesc = new RecordDescriptor(new ISerializerDeserializer[] {
-                IntegerSerializerDeserializer.INSTANCE, UTF8StringSerializerDeserializer.INSTANCE,
-                UTF8StringSerializerDeserializer.INSTANCE, IntegerSerializerDeserializer.INSTANCE,
-                UTF8StringSerializerDeserializer.INSTANCE, FloatSerializerDeserializer.INSTANCE,
-                UTF8StringSerializerDeserializer.INSTANCE, UTF8StringSerializerDeserializer.INSTANCE });
+                IntegerSerializerDeserializer.INSTANCE, new UTF8StringSerializerDeserializer(),
+                new UTF8StringSerializerDeserializer(), IntegerSerializerDeserializer.INSTANCE,
+                new UTF8StringSerializerDeserializer(), FloatSerializerDeserializer.INSTANCE,
+                new UTF8StringSerializerDeserializer(), new UTF8StringSerializerDeserializer() });
         IValueParserFactory[] valueParsers = new IValueParserFactory[] { IntegerParserFactory.INSTANCE,
                 UTF8StringParserFactory.INSTANCE, UTF8StringParserFactory.INSTANCE, IntegerParserFactory.INSTANCE,
                 UTF8StringParserFactory.INSTANCE, FloatParserFactory.INSTANCE, UTF8StringParserFactory.INSTANCE,
@@ -663,7 +663,7 @@ public class PushRuntimeTest {
         DelimitedDataTupleParserFactory stringParser = new DelimitedDataTupleParserFactory(
                 new IValueParserFactory[] { UTF8StringParserFactory.INSTANCE }, '\u0000');
         RecordDescriptor stringRec = new RecordDescriptor(
-                new ISerializerDeserializer[] { UTF8StringSerializerDeserializer.INSTANCE, });
+                new ISerializerDeserializer[] { new UTF8StringSerializerDeserializer(), });
 
         FileScanOperatorDescriptor scanOp = new FileScanOperatorDescriptor(spec, new ConstantFileSplitProvider(
                 inputSplits), stringParser, stringRec);
@@ -709,8 +709,8 @@ public class PushRuntimeTest {
                 "data/tpch0.001/nation.tbl")));
         IFileSplitProvider splitProvider = new ConstantFileSplitProvider(fileSplits);
         RecordDescriptor scannerDesc = new RecordDescriptor(new ISerializerDeserializer[] {
-                IntegerSerializerDeserializer.INSTANCE, UTF8StringSerializerDeserializer.INSTANCE,
-                IntegerSerializerDeserializer.INSTANCE, UTF8StringSerializerDeserializer.INSTANCE });
+                IntegerSerializerDeserializer.INSTANCE, new UTF8StringSerializerDeserializer(),
+                IntegerSerializerDeserializer.INSTANCE, new UTF8StringSerializerDeserializer() });
         IValueParserFactory[] valueParsers = new IValueParserFactory[] { IntegerParserFactory.INSTANCE,
                 UTF8StringParserFactory.INSTANCE, IntegerParserFactory.INSTANCE, UTF8StringParserFactory.INSTANCE };
         FileScanOperatorDescriptor scanner = new FileScanOperatorDescriptor(spec, splitProvider,
@@ -817,10 +817,10 @@ public class PushRuntimeTest {
                 "data/tpch0.001/customer.tbl")));
         IFileSplitProvider splitProvider = new ConstantFileSplitProvider(fileSplits);
         RecordDescriptor scannerDesc = new RecordDescriptor(new ISerializerDeserializer[] {
-                IntegerSerializerDeserializer.INSTANCE, UTF8StringSerializerDeserializer.INSTANCE,
-                UTF8StringSerializerDeserializer.INSTANCE, IntegerSerializerDeserializer.INSTANCE,
-                UTF8StringSerializerDeserializer.INSTANCE, FloatSerializerDeserializer.INSTANCE,
-                UTF8StringSerializerDeserializer.INSTANCE, UTF8StringSerializerDeserializer.INSTANCE });
+                IntegerSerializerDeserializer.INSTANCE, new UTF8StringSerializerDeserializer(),
+                new UTF8StringSerializerDeserializer(), IntegerSerializerDeserializer.INSTANCE,
+                new UTF8StringSerializerDeserializer(), FloatSerializerDeserializer.INSTANCE,
+                new UTF8StringSerializerDeserializer(), new UTF8StringSerializerDeserializer() });
         IValueParserFactory[] valueParsers = new IValueParserFactory[] { IntegerParserFactory.INSTANCE,
                 UTF8StringParserFactory.INSTANCE, UTF8StringParserFactory.INSTANCE, IntegerParserFactory.INSTANCE,
                 UTF8StringParserFactory.INSTANCE, FloatParserFactory.INSTANCE, UTF8StringParserFactory.INSTANCE,

http://git-wip-us.apache.org/repos/asf/incubator-asterixdb-hyracks/blob/26c3b536/algebricks/algebricks-tests/src/test/java/org/apache/hyracks/algebricks/tests/tools/WriteValueTest.java
----------------------------------------------------------------------
diff --git a/algebricks/algebricks-tests/src/test/java/org/apache/hyracks/algebricks/tests/tools/WriteValueTest.java b/algebricks/algebricks-tests/src/test/java/org/apache/hyracks/algebricks/tests/tools/WriteValueTest.java
index 0968478..6770494 100644
--- a/algebricks/algebricks-tests/src/test/java/org/apache/hyracks/algebricks/tests/tools/WriteValueTest.java
+++ b/algebricks/algebricks-tests/src/test/java/org/apache/hyracks/algebricks/tests/tools/WriteValueTest.java
@@ -97,7 +97,7 @@ public class WriteValueTest {
         interm.reset();
         dout.writeUTF(str);
         baaos.reset();
-        WriteValueTools.writeUTF8String(interm.getByteArray(), 0, interm.size(), baaos);
+        WriteValueTools.writeUTF8StringWithQuotes(str, baaos);
         byte[] b = str.getBytes("UTF-8");
         if (baaos.size() != b.length + 2) {
             throw new Exception("Expecting to write " + b + " in " + b.length + " bytes, but found " + baaos.size()

http://git-wip-us.apache.org/repos/asf/incubator-asterixdb-hyracks/blob/26c3b536/hyracks/hyracks-data/hyracks-data-std/pom.xml
----------------------------------------------------------------------
diff --git a/hyracks/hyracks-data/hyracks-data-std/pom.xml b/hyracks/hyracks-data/hyracks-data-std/pom.xml
index 8546bdb..20c30ef 100644
--- a/hyracks/hyracks-data/hyracks-data-std/pom.xml
+++ b/hyracks/hyracks-data/hyracks-data-std/pom.xml
@@ -17,23 +17,35 @@
  ! under the License.
  !-->
 
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
-  <modelVersion>4.0.0</modelVersion>
-  <artifactId>hyracks-data-std</artifactId>
-  <name>hyracks-data-std</name>
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+    <artifactId>hyracks-data-std</artifactId>
+    <name>hyracks-data-std</name>
 
-  <parent>
-    <groupId>org.apache.hyracks</groupId>
-    <artifactId>hyracks-data</artifactId>
-    <version>0.2.17-SNAPSHOT</version>
-  </parent>
+    <parent>
+        <groupId>org.apache.hyracks</groupId>
+        <artifactId>hyracks-data</artifactId>
+        <version>0.2.17-SNAPSHOT</version>
+    </parent>
 
-
-  <dependencies>
-  <dependency>
-  	<groupId>org.apache.hyracks</groupId>
-  	<artifactId>hyracks-api</artifactId>
-  	<version>0.2.17-SNAPSHOT</version>
-  </dependency>
-  </dependencies>
+    <dependencies>
+        <dependency>
+            <groupId>org.apache.hyracks</groupId>
+            <artifactId>hyracks-util</artifactId>
+            <version>0.2.17-SNAPSHOT</version>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.hyracks</groupId>
+            <artifactId>hyracks-api</artifactId>
+            <version>0.2.17-SNAPSHOT</version>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.hyracks</groupId>
+            <artifactId>hyracks-util</artifactId>
+            <version>0.2.17-SNAPSHOT</version>
+            <type>test-jar</type>
+            <scope>test</scope>
+        </dependency>
+    </dependencies>
 </project>

http://git-wip-us.apache.org/repos/asf/incubator-asterixdb-hyracks/blob/26c3b536/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/accessors/UTF8StringBinaryHashFunctionFamily.java
----------------------------------------------------------------------
diff --git a/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/accessors/UTF8StringBinaryHashFunctionFamily.java b/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/accessors/UTF8StringBinaryHashFunctionFamily.java
index affafea..ea661e3 100644
--- a/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/accessors/UTF8StringBinaryHashFunctionFamily.java
+++ b/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/accessors/UTF8StringBinaryHashFunctionFamily.java
@@ -20,7 +20,7 @@ package org.apache.hyracks.data.std.accessors;
 
 import org.apache.hyracks.api.dataflow.value.IBinaryHashFunction;
 import org.apache.hyracks.api.dataflow.value.IBinaryHashFunctionFamily;
-import org.apache.hyracks.data.std.primitive.UTF8StringPointable;
+import org.apache.hyracks.util.string.UTF8StringUtil;
 
 public class UTF8StringBinaryHashFunctionFamily implements IBinaryHashFunctionFamily {
     public static final IBinaryHashFunctionFamily INSTANCE = new UTF8StringBinaryHashFunctionFamily();
@@ -40,17 +40,7 @@ public class UTF8StringBinaryHashFunctionFamily implements IBinaryHashFunctionFa
         return new IBinaryHashFunction() {
             @Override
             public int hash(byte[] bytes, int offset, int length) {
-                int h = 0;
-                int utflen = UTF8StringPointable.getUTFLength(bytes, offset);
-                int sStart = offset + 2;
-                int c = 0;
-
-                while (c < utflen) {
-                    char ch = UTF8StringPointable.charAt(bytes, sStart + c);
-                    h = (coefficient * h + ch) % r;
-                    c += UTF8StringPointable.charSize(bytes, sStart + c);
-                }
-                return h;
+                return UTF8StringUtil.hash(bytes, offset, coefficient, r);
             }
         };
     }

http://git-wip-us.apache.org/repos/asf/incubator-asterixdb-hyracks/blob/26c3b536/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/api/AbstractPointable.java
----------------------------------------------------------------------
diff --git a/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/api/AbstractPointable.java b/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/api/AbstractPointable.java
index a10b0da..549a136 100644
--- a/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/api/AbstractPointable.java
+++ b/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/api/AbstractPointable.java
@@ -30,6 +30,7 @@ public abstract class AbstractPointable implements IPointable {
         this.bytes = bytes;
         this.start = start;
         this.length = length;
+        afterReset();
     }
 
     @Override
@@ -37,6 +38,13 @@ public abstract class AbstractPointable implements IPointable {
         set(pointer.getByteArray(), pointer.getStartOffset(), pointer.getLength());
     }
 
+    /**
+     * This method is called after the new byte values have been set.
+     * Subclasses can override it to reset any state derived from those bytes.
+     */
+    protected void afterReset() {
+    }
+
     @Override
     public byte[] getByteArray() {
         return bytes;

http://git-wip-us.apache.org/repos/asf/incubator-asterixdb-hyracks/blob/26c3b536/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/primitive/ByteArrayPointable.java
----------------------------------------------------------------------
diff --git a/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/primitive/ByteArrayPointable.java b/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/primitive/ByteArrayPointable.java
index 0a2a723..af54c7e 100644
--- a/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/primitive/ByteArrayPointable.java
+++ b/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/primitive/ByteArrayPointable.java
@@ -19,10 +19,33 @@
 
 package org.apache.hyracks.data.std.primitive;
 
+import java.io.Serializable;
+import java.util.Arrays;
+
 import org.apache.hyracks.api.dataflow.value.ITypeTraits;
-import org.apache.hyracks.data.std.api.*;
+import org.apache.hyracks.data.std.api.AbstractPointable;
+import org.apache.hyracks.data.std.api.IComparable;
+import org.apache.hyracks.data.std.api.IHashable;
+import org.apache.hyracks.data.std.api.IPointable;
+import org.apache.hyracks.data.std.api.IPointableFactory;
+import org.apache.hyracks.data.std.api.IValueReference;
+import org.apache.hyracks.util.encoding.VarLenIntEncoderDecoder;
+
+public class ByteArrayPointable extends AbstractPointable implements IHashable, IComparable, Serializable {
+
+    // These three values are cached to speed up the length data access.
+    // Since we are using the variable-length encoding, caching saves the repeated decoding effort.
+    // WARNING: they are recomputed in afterReset(), which runs after each set().
+    private int contentLength = -1;
+    private int metaLength = -1;
+    private int hash = 0;
 
-public class ByteArrayPointable extends AbstractPointable implements IHashable, IComparable {
+    @Override
+    protected void afterReset() {
+        contentLength = getContentLength(getByteArray(), getStartOffset());
+        metaLength = getNumberBytesToStoreMeta(contentLength);
+        hash = 0;
+    }
 
     public static final ITypeTraits TYPE_TRAITS = new ITypeTraits() {
         private static final long serialVersionUID = 1L;
@@ -58,48 +81,106 @@ public class ByteArrayPointable extends AbstractPointable implements IHashable,
     }
 
     @Override
-    public int compareTo(byte[] bytes, int start, int length) {
-        int thislen = getLength(this.bytes, this.start);
-        int thatlen = getLength(bytes, start);
-
-        for (int thisIndex = 0, thatIndex = 0; thisIndex < thislen && thatIndex < thatlen; ++thisIndex, ++thatIndex) {
-            if (this.bytes[this.start + SIZE_OF_LENGTH + thisIndex] != bytes[start + SIZE_OF_LENGTH + thatIndex]) {
-                return (0xff & this.bytes[this.start + SIZE_OF_LENGTH + thisIndex]) - (0xff & bytes[start + SIZE_OF_LENGTH
-                        + thatIndex]);
+    public int compareTo(byte[] thatBytes, int thatStart, int thatLength) {
+        int thisArrayLen = getContentLength(this.bytes, this.start);
+        int thatArrayLen = getContentLength(thatBytes, thatStart);
+
+        int thisArrayStart = this.getContentStartOffset();
+        int thatArrayStart = thatStart + getNumberBytesToStoreMeta(thatArrayLen);
+
+        for (int thisIndex = 0, thatIndex = 0;
+             thisIndex < thisArrayLen && thatIndex < thatArrayLen; ++thisIndex, ++thatIndex) {
+            if (this.bytes[thisArrayStart + thisIndex] != thatBytes[thatArrayStart + thatIndex]) {
+                return (0xff & this.bytes[thisArrayStart + thisIndex]) - (0xff & thatBytes[thatArrayStart + thatIndex]);
             }
         }
-        return thislen - thatlen;
+        return thisArrayLen - thatArrayLen;
+    }
+
+    public int getContentLength() {
+        return contentLength;
+    }
+
+    public int getMetaLength() {
+        return metaLength;
     }
 
     @Override
     public int hash() {
-        int h = 0;
-        int realLength = getLength(bytes, start);
-        for (int i = 0; i < realLength; ++i) {
-            h = 31 * h + bytes[start + SIZE_OF_LENGTH + i];
+        if (hash == 0) {
+            int h = 0;
+            int realLength = getContentLength();
+            int startOffset = getContentStartOffset();
+            for (int i = 0; i < realLength; ++i) {
+                h = 31 * h + bytes[startOffset + i];
+            }
+            hash = h;
         }
-        return h;
+        return hash;
     }
 
     @Override
-    public int getLength(){
-        return getFullLength(getByteArray(), getStartOffset());
+    public int getLength() {
+        return getContentLength() + getMetaLength();
     }
 
-    public static final int SIZE_OF_LENGTH = 2;
-    public static final int MAX_LENGTH = 65535;
+    public int getContentStartOffset() {
+        return getStartOffset() + getMetaLength();
+    }
+
+    ///////////////// helper functions ////////////////////////////////
+    public static byte[] copyContent(ByteArrayPointable bytePtr) {
+        return Arrays.copyOfRange(bytePtr.getByteArray(), bytePtr.getContentStartOffset(),
+                bytePtr.getContentStartOffset() + bytePtr.getContentLength());
+    }
+
+    public static ByteArrayPointable generatePointableFromPureBytes(byte[] bytes) {
+        return generatePointableFromPureBytes(bytes, 0, bytes.length);
+    }
 
-    public static int getLength(byte[] bytes, int offset) {
-        return ((0xFF & bytes[offset]) << 8) + (0xFF & bytes[offset + 1]);
+    public static ByteArrayPointable generatePointableFromPureBytes(byte[] bytes, int start, int length) {
+        int metaLen = getNumberBytesToStoreMeta(length);
+        byte[] ret = new byte[length + metaLen];
+        VarLenIntEncoderDecoder.encode(length, ret, 0);
+        for (int i = 0; i < length; ++i) {
+            ret[i + metaLen] = bytes[start + i];
+        }
+        ByteArrayPointable ptr = new ByteArrayPointable();
+        ptr.set(ret, 0, ret.length);
+        return ptr;
+    }
+
+    public static int getContentLength(byte[] bytes, int offset) {
+        return VarLenIntEncoderDecoder.decode(bytes, offset);
     }
 
-    public static int getFullLength(byte[] bytes, int offset){
-        return getLength(bytes, offset) + SIZE_OF_LENGTH;
+    public static int getNumberBytesToStoreMeta(int length) {
+        return VarLenIntEncoderDecoder.getBytesRequired(length);
     }
 
-    public static void putLength(int length, byte[] bytes, int offset) {
-        bytes[offset] = (byte) ((length >>> 8) & 0xFF);
-        bytes[offset + 1] = (byte) ((length >>> 0) & 0xFF);
+    /**
+     * Compute the normalized key of the byte array.
+     * The normalized key in Hyracks is mainly used to speedup the comparison between pointable data.
+     * In the ByteArray case, we compute the integer value from the first 4 bytes.
+     * The comparator first uses this integer to get the result (<, >, or =); it checks
+     * the actual bytes only if the normalized keys are equal. Thus this normalized key must be
+     * consistent with the comparison result.
+     *
+     * @param bytesPtr
+     * @param start
+     * @return
+     */
+    public static int normalize(byte[] bytesPtr, int start) {
+        int len = getContentLength(bytesPtr, start);
+        long nk = 0;
+        start = start + getNumberBytesToStoreMeta(len);
+        for (int i = 0; i < 4; ++i) {
+            nk <<= 8;
+            if (i < len) {
+                nk |= bytesPtr[start + i] & 0xff;
+            }
+        }
+        return (int) (nk >> 1); // make it always positive.
     }
 
 }

http://git-wip-us.apache.org/repos/asf/incubator-asterixdb-hyracks/blob/26c3b536/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/primitive/RawUTF8StringPointable.java
----------------------------------------------------------------------
diff --git a/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/primitive/RawUTF8StringPointable.java b/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/primitive/RawUTF8StringPointable.java
index 2b1f557..70bac4d 100644
--- a/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/primitive/RawUTF8StringPointable.java
+++ b/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/primitive/RawUTF8StringPointable.java
@@ -24,6 +24,7 @@ import org.apache.hyracks.data.std.api.IComparable;
 import org.apache.hyracks.data.std.api.IHashable;
 import org.apache.hyracks.data.std.api.IPointable;
 import org.apache.hyracks.data.std.api.IPointableFactory;
+import org.apache.hyracks.util.string.UTF8StringUtil;
 
 /**
  * This class provides the raw bytes-based comparison and hash function for UTF8 strings.
@@ -66,44 +67,16 @@ public final class RawUTF8StringPointable extends AbstractPointable implements I
 
     @Override
     public int compareTo(byte[] bytes, int start, int length) {
-        int utflen1 = UTF8StringPointable.getUTFLength(this.bytes, this.start);
-        int utflen2 = UTF8StringPointable.getUTFLength(bytes, start);
-
-        int c1 = 0;
-        int c2 = 0;
-
-        int s1Start = this.start + 2;
-        int s2Start = start + 2;
-
-        while (c1 < utflen1 && c2 < utflen2) {
-            char ch1 = (char) this.bytes[s1Start + c1];
-            char ch2 = (char) bytes[s2Start + c2];
-
-            if (ch1 != ch2) {
-                return ch1 - ch2;
-            }
-            c1++;
-            c2++;
-        }
-        return utflen1 - utflen2;
+        return UTF8StringUtil.rawByteCompareTo(this.bytes, this.start, bytes, start);
     }
 
     @Override
     public int hash() {
-        int h = 0;
-        int utflen = UTF8StringPointable.getUTFLength(bytes, start);
-        int sStart = start + 2;
-        int c = 0;
-
-        while (c < utflen) {
-            char ch = (char) bytes[sStart + c];
-            h = 31 * h + ch;
-            c++;
-        }
-        return h;
+        return UTF8StringUtil.rawBytehash(this.bytes, this.start);
     }
 
     public void toString(StringBuilder buffer) {
-        UTF8StringPointable.toString(buffer, bytes, start);
+        UTF8StringUtil.toString(buffer, bytes, start);
     }
+
 }

http://git-wip-us.apache.org/repos/asf/incubator-asterixdb-hyracks/blob/26c3b536/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/primitive/UTF8StringLowercasePointable.java
----------------------------------------------------------------------
diff --git a/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/primitive/UTF8StringLowercasePointable.java b/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/primitive/UTF8StringLowercasePointable.java
new file mode 100644
index 0000000..6e4810c
--- /dev/null
+++ b/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/primitive/UTF8StringLowercasePointable.java
@@ -0,0 +1,73 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.hyracks.data.std.primitive;
+
+import org.apache.hyracks.api.dataflow.value.ITypeTraits;
+import org.apache.hyracks.data.std.api.AbstractPointable;
+import org.apache.hyracks.data.std.api.IComparable;
+import org.apache.hyracks.data.std.api.IHashable;
+import org.apache.hyracks.data.std.api.IPointable;
+import org.apache.hyracks.data.std.api.IPointableFactory;
+import org.apache.hyracks.util.string.UTF8StringUtil;
+
+public final class UTF8StringLowercasePointable extends AbstractPointable implements IHashable, IComparable {
+    public static final ITypeTraits TYPE_TRAITS = new ITypeTraits() {
+        private static final long serialVersionUID = 1L;
+
+        @Override
+        public boolean isFixedLength() {
+            return false;
+        }
+
+        @Override
+        public int getFixedLength() {
+            return 0;
+        }
+    };
+
+    public static final IPointableFactory FACTORY = new IPointableFactory() {
+        private static final long serialVersionUID = 1L;
+
+        @Override
+        public IPointable createPointable() {
+            return new UTF8StringLowercasePointable();
+        }
+
+        @Override
+        public ITypeTraits getTypeTraits() {
+            return TYPE_TRAITS;
+        }
+    };
+
+    @Override
+    public int compareTo(IPointable pointer) {
+        return compareTo(pointer.getByteArray(), pointer.getStartOffset(), pointer.getLength());
+    }
+
+    @Override
+    public int compareTo(byte[] bytes, int start, int length) {
+        return UTF8StringUtil.lowerCaseCompareTo(this.bytes, this.start, bytes, start);
+    }
+
+    @Override
+    public int hash() {
+        return UTF8StringUtil.lowerCaseHash(bytes, start);
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/incubator-asterixdb-hyracks/blob/26c3b536/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/primitive/UTF8StringPointable.java
----------------------------------------------------------------------
diff --git a/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/primitive/UTF8StringPointable.java b/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/primitive/UTF8StringPointable.java
index 8b41206..e311fa6 100644
--- a/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/primitive/UTF8StringPointable.java
+++ b/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/primitive/UTF8StringPointable.java
@@ -18,14 +18,42 @@
  */
 package org.apache.hyracks.data.std.primitive;
 
+import java.io.IOException;
+import java.nio.charset.Charset;
+
 import org.apache.hyracks.api.dataflow.value.ITypeTraits;
 import org.apache.hyracks.data.std.api.AbstractPointable;
 import org.apache.hyracks.data.std.api.IComparable;
 import org.apache.hyracks.data.std.api.IHashable;
 import org.apache.hyracks.data.std.api.IPointable;
 import org.apache.hyracks.data.std.api.IPointableFactory;
+import org.apache.hyracks.data.std.util.GrowableArray;
+import org.apache.hyracks.data.std.util.UTF8StringBuilder;
+import org.apache.hyracks.util.string.UTF8StringUtil;
 
 public final class UTF8StringPointable extends AbstractPointable implements IHashable, IComparable {
+
+    // These values are cached to speed up the length data access.
+    // Since we are using the variable-length encoding, caching saves the repeated decoding effort.
+    // WARNING: they are reset in afterReset(), which runs after each set().
+    private int utf8Length;
+    private int metaLength;
+    private int hashValue;
+    private int stringLength;
+
+    /**
+     * Resets the cached length metadata.
+     * Since the {@code utf8Length} and the {@code metaLength} are often used, we compute those two values in advance.
+     * As for the {@code stringLength} and the {@code hashValue}, they will be lazily initialized after the first call.
+     */
+    @Override
+    protected void afterReset() {
+        utf8Length = UTF8StringUtil.getUTFLength(bytes, start);
+        metaLength = UTF8StringUtil.getNumBytesToStoreLength(getUTF8Length());
+        hashValue = 0;
+        stringLength = -1;
+    }
+
     public static final ITypeTraits TYPE_TRAITS = new ITypeTraits() {
         private static final long serialVersionUID = 1L;
 
@@ -54,170 +82,367 @@ public final class UTF8StringPointable extends AbstractPointable implements IHas
         }
     };
 
+    public static UTF8StringPointable generateUTF8Pointable(String string) {
+        byte[] bytes;
+        bytes = UTF8StringUtil.writeStringToBytes(string);
+        UTF8StringPointable ptr = new UTF8StringPointable();
+        ptr.set(bytes, 0, bytes.length);
+        return ptr;
+    }
+
     /**
      * Returns the character at the given byte offset. The caller is responsible for making sure that
      * the provided offset is within bounds and points to the beginning of a valid UTF8 character.
-     * 
-     * @param offset
-     *            - Byte offset
+     *
+     * @param offset - Byte offset
      * @return Character at the given offset.
      */
     public char charAt(int offset) {
-        return charAt(bytes, start + offset);
+        return UTF8StringUtil.charAt(bytes, start + offset);
     }
 
-    public static char charAt(byte[] b, int s) {
-        int c = b[s] & 0xff;
-        switch (c >> 4) {
-            case 0:
-            case 1:
-            case 2:
-            case 3:
-            case 4:
-            case 5:
-            case 6:
-            case 7:
-                return (char) c;
+    public int charSize(int offset) {
+        return UTF8StringUtil.charSize(bytes, start + offset);
+    }
 
-            case 12:
-            case 13:
-                return (char) (((c & 0x1F) << 6) | ((b[s + 1]) & 0x3F));
+    /**
+     * Gets the length of the string in characters.
+     * The first call has to go through the entire string; subsequent calls just return the pre-calculated result.
+     *
+     * @return length of string in characters
+     */
+    public int getStringLength() {
+        if (stringLength < 0) {
+            stringLength = UTF8StringUtil.getStringLength(bytes, start);
+        }
+        return stringLength;
+    }
 
-            case 14:
-                return (char) (((c & 0x0F) << 12) | (((b[s + 1]) & 0x3F) << 6) | (((b[s + 2]) & 0x3F) << 0));
+    /**
+     * Gets the length of the UTF-8 encoded string in bytes.
+     *
+     * @return length of UTF-8 encoded string in bytes
+     */
+    public int getUTF8Length() {
+        return utf8Length;
+    }
 
-            default:
-                throw new IllegalArgumentException();
-        }
+    public int getMetaDataLength() {
+        return metaLength;
     }
 
-    public int charSize(int offset) {
-        return charSize(bytes, start + offset);
+    public int getCharStartOffset() {
+        return getStartOffset() + getMetaDataLength();
     }
 
-    public static int charSize(byte[] b, int s) {
-        int c = b[s] & 0xff;
-        switch (c >> 4) {
-            case 0:
-            case 1:
-            case 2:
-            case 3:
-            case 4:
-            case 5:
-            case 6:
-            case 7:
-                return 1;
+    @Override
+    public int compareTo(IPointable pointer) {
+        return compareTo(pointer.getByteArray(), pointer.getStartOffset(), pointer.getLength());
+    }
 
-            case 12:
-            case 13:
-                return 2;
+    @Override
+    public int compareTo(byte[] bytes, int start, int length) {
+        return UTF8StringUtil.compareTo(this.bytes, this.start, bytes, start);
+    }
 
-            case 14:
-                return 3;
+    @Override
+    public int hash() {
+        if (hashValue == 0) {
+            hashValue = UTF8StringUtil.hash(this.bytes, this.start);
         }
-        throw new IllegalStateException();
+        return hashValue;
     }
 
-    public static int getModifiedUTF8Len(char c) {
-        if (c >= 0x0000 && c <= 0x007F) {
-            return 1;
-        } else if (c <= 0x07FF) {
-            return 2;
-        } else {
-            return 3;
-        }
+    public void toString(StringBuilder buffer) {
+        UTF8StringUtil.toString(buffer, bytes, start);
     }
 
-    /**
-     * Gets the length of the string in characters.
-     * 
-     * @return length of string in characters
+    public String toString() {
+        return new String(this.bytes, this.getCharStartOffset(), this.getUTF8Length(), Charset.forName("UTF-8"));
+    }
+
+    /****
+     * String functions
      */
-    public int getStringLength() {
-        return getStringLength(bytes, start);
+
+    public int ignoreCaseCompareTo(UTF8StringPointable other) {
+        return UTF8StringUtil.lowerCaseCompareTo(this.getByteArray(), this.getStartOffset(),
+                other.getByteArray(), other.getStartOffset());
     }
 
-    public static int getStringLength(byte[] b, int s) {
-        int pos = s + 2;
-        int end = pos + getUTFLength(b, s);
-        int charCount = 0;
-        while (pos < end) {
-            charCount++;
-            pos += charSize(b, pos);
-        }
-        return charCount;
+    public int find(UTF8StringPointable pattern, boolean ignoreCase) {
+        return find(this, pattern, ignoreCase);
     }
 
     /**
-     * Gets the length of the UTF-8 encoded string in bytes.
-     * 
-     * @return length of UTF-8 encoded string in bytes
+     * Returns the byte offset of the first character of the matching string (not including the MetaLength), or -1 if there is no match.
+     *
+     * @param src
+     * @param pattern
+     * @param ignoreCase
+     * @return
      */
-    public int getUTFLength() {
-        return getUTFLength(bytes, start);
+    public static int find(UTF8StringPointable src, UTF8StringPointable pattern, boolean ignoreCase) {
+        final int srcUtfLen = src.getUTF8Length();
+        final int pttnUtfLen = pattern.getUTF8Length();
+        final int srcStart = src.getMetaDataLength();
+        final int pttnStart = pattern.getMetaDataLength();
+
+        int startMatch = 0;
+        int maxStart = srcUtfLen - pttnUtfLen;
+        while (startMatch <= maxStart) {
+            int c1 = startMatch;
+            int c2 = 0;
+            while (c1 < srcUtfLen && c2 < pttnUtfLen) {
+                char ch1 = src.charAt(srcStart + c1);
+                char ch2 = pattern.charAt(pttnStart + c2);
+
+                if (ch1 != ch2) {
+                    if (!ignoreCase || ignoreCase && Character.toLowerCase(ch1) != Character.toLowerCase(ch2)) {
+                        break;
+                    }
+                }
+                c1 += src.charSize(srcStart + c1);
+                c2 += pattern.charSize(pttnStart + c2);
+            }
+            if (c2 == pttnUtfLen) {
+                return startMatch;
+            }
+            startMatch += src.charSize(srcStart + startMatch);
+        }
+        return -1;
     }
 
-    public static int getUTFLength(byte[] b, int s) {
-        return ((b[s] & 0xff) << 8) + ((b[s + 1] & 0xff) << 0);
+    public boolean contains(UTF8StringPointable pattern, boolean ignoreCase) {
+        return contains(this, pattern, ignoreCase);
     }
 
-    @Override
-    public int compareTo(IPointable pointer) {
-        return compareTo(pointer.getByteArray(), pointer.getStartOffset(), pointer.getLength());
+    public static boolean contains(UTF8StringPointable src, UTF8StringPointable pattern, boolean ignoreCase) {
+        return find(src, pattern, ignoreCase) >= 0;
     }
 
-    @Override
-    public int compareTo(byte[] bytes, int start, int length) {
-        int utflen1 = getUTFLength(this.bytes, this.start);
-        int utflen2 = getUTFLength(bytes, start);
+    public boolean startsWith(UTF8StringPointable pattern, boolean ignoreCase) {
+        return startsWith(this, pattern, ignoreCase);
+    }
+
+    public static boolean startsWith(UTF8StringPointable src, UTF8StringPointable pattern, boolean ignoreCase) {
+        int utflen1 = src.getUTF8Length();
+        int utflen2 = pattern.getUTF8Length();
+        if (utflen2 > utflen1)
+            return false;
+
+        int s1Start = src.getMetaDataLength();
+        int s2Start = pattern.getMetaDataLength();
 
         int c1 = 0;
         int c2 = 0;
+        while (c1 < utflen1 && c2 < utflen2) {
+            char ch1 = src.charAt(s1Start + c1);
+            char ch2 = pattern.charAt(s2Start + c2);
+            if (ch1 != ch2) {
+                if (!ignoreCase || ignoreCase && Character.toLowerCase(ch1) != Character.toLowerCase(ch2)) {
+                    break;
+                }
+            }
+            c1 += src.charSize(s1Start + c1);
+            c2 += pattern.charSize(s2Start + c2);
+        }
+        return (c2 == utflen2);
+    }
 
-        int s1Start = this.start + 2;
-        int s2Start = start + 2;
+    public boolean endsWith(UTF8StringPointable pattern, boolean ignoreCase) {
+        return endsWith(this, pattern, ignoreCase);
+    }
 
-        while (c1 < utflen1 && c2 < utflen2) {
-            char ch1 = charAt(this.bytes, s1Start + c1);
-            char ch2 = charAt(bytes, s2Start + c2);
+    /**
+     * Tests whether {@code src} ends with {@code pattern}.
+     * The comparison starts {@code pattern}'s length before the end of {@code src} and walks both strings
+     * forward character by character, stopping at the first pair that differs (after lower-casing both when
+     * {@code ignoreCase} is set). The result is {@code true} only if the whole pattern was consumed.
+     *
+     * @param src        the string to test
+     * @param pattern    the candidate suffix
+     * @param ignoreCase if {@code true}, characters that differ are compared again via
+     *                   {@link Character#toLowerCase(char)}
+     * @return {@code true} if every character of {@code pattern} matched the end of {@code src}
+     */
+    public static boolean endsWith(UTF8StringPointable src, UTF8StringPointable pattern, boolean ignoreCase) {
+        int len1 = src.getUTF8Length();
+        int len2 = pattern.getUTF8Length();
+        // a pattern longer than src cannot match
+        if (len2 > len1)
+            return false;
+
+        int s1Start = src.getMetaDataLength();
+        int s2Start = pattern.getMetaDataLength();
+
+        // NOTE(review): the start position is derived from the two lengths alone; presumably these are byte
+        // lengths, in which case it could fall inside a multi-byte character of src — confirm charAt copes.
+        int c1 = len1 - len2;
+        int c2 = 0;
+        while (c1 < len1 && c2 < len2) {
+            char ch1 = src.charAt(s1Start + c1);
+            char ch2 = pattern.charAt(s2Start + c2);
 
             if (ch1 != ch2) {
-                return ch1 - ch2;
+                // a mismatch is final unless ignoreCase is set and the lower-cased characters agree
+                if (!ignoreCase || ignoreCase && Character.toLowerCase(ch1) != Character.toLowerCase(ch2)) {
+                    break;
+                }
             }
-            c1 += charSize(this.bytes, s1Start + c1);
-            c2 += charSize(bytes, s2Start + c2);
+            c1 += src.charSize(s1Start + c1);
+            c2 += pattern.charSize(s2Start + c2);
         }
-        return utflen1 - utflen2;
+        // the suffix matched only if the loop consumed the whole pattern
+        return (c2 == len2);
     }
 
-    @Override
-    public int hash() {
-        int h = 0;
-        int utflen = getUTFLength(bytes, start);
-        int sStart = start + 2;
-        int c = 0;
+    /**
+     * Writes this string followed by {@code next}; shorthand for
+     * {@link #concat(UTF8StringPointable, UTF8StringPointable, UTF8StringBuilder, GrowableArray)} with this
+     * instance as the first string.
+     *
+     * @param next    the string appended after this one
+     * @param builder the builder used to write the result
+     * @param out     the storage the result is written to
+     * @throws IOException propagated from the static overload
+     */
+    public void concat(UTF8StringPointable next, UTF8StringBuilder builder, GrowableArray out) throws IOException {
+        concat(this, next, builder, out);
+    }
+
+    /**
+     * Writes {@code first} immediately followed by {@code next} as a single string.
+     * The builder is reset with the sum of both lengths as its size estimate, receives the two strings in
+     * order, and is then finished.
+     *
+     * @param first   the leading string
+     * @param next    the trailing string
+     * @param builder the builder used to write the result
+     * @param out     the storage the result is written to
+     * @throws IOException propagated from the builder calls
+     */
+    public static void concat(UTF8StringPointable first, UTF8StringPointable next, UTF8StringBuilder builder,
+            GrowableArray out) throws IOException {
+        final int combinedUtfLen = first.getUTF8Length() + next.getUTF8Length();
+
+        builder.reset(out, combinedUtfLen);
+        builder.appendUtf8StringPointable(first);
+        builder.appendUtf8StringPointable(next);
+        builder.finish();
+    }
+
+    /**
+     * Writes a substring of this string; shorthand for
+     * {@link #substr(UTF8StringPointable, int, int, UTF8StringBuilder, GrowableArray)} with this instance as
+     * the source.
+     *
+     * @param charOffset passed through to the static overload
+     * @param charLength passed through to the static overload
+     * @param builder    the builder used to write the result
+     * @param out        the storage the result is written to
+     * @throws IOException propagated from the static overload
+     */
+    public void substr(int charOffset, int charLength, UTF8StringBuilder builder, GrowableArray out)
+            throws IOException {
+        substr(this, charOffset, charLength, builder, out);
+    }
+
+    /**
+     * Writes the substring of {@code src} that starts at character index {@code charOffset} and spans at most
+     * {@code charLength} characters. Negative arguments are clamped to 0, and an offset at or beyond the end
+     * of {@code src} produces an empty string rather than an exception.
+     *
+     * @param src        the source string
+     * @param charOffset index, counted in characters, of the first character to copy
+     * @param charLength maximum number of characters to copy
+     * @param builder    the builder used to write the result
+     * @param out        the storage the result is written to
+     * @throws IOException propagated from the builder calls
+     */
+    public static void substr(UTF8StringPointable src, int charOffset, int charLength, UTF8StringBuilder builder,
+            GrowableArray out) throws IOException {
+        // Lenient on purpose: out-of-range arguments are clamped instead of rejected, even though they
+        // usually indicate a mistake on the caller's side.
+        if (charOffset < 0) {
+            charOffset = 0;
+        }
+        if (charLength < 0) {
+            charLength = 0;
+        }
+
+        final int utfLen = src.getUTF8Length();
+        final int metaLen = src.getMetaDataLength();
+
+        // Skip the first charOffset characters, tracking the position reached within the content.
+        int chIdx = 0;
+        int byteIdx = 0;
+        while (byteIdx < utfLen && chIdx < charOffset) {
+            byteIdx += src.charSize(metaLen + byteIdx);
+            chIdx++;
+        }
+        if (byteIdx >= utfLen) {
+            // The offset is at or past the end (or src is empty); this is tolerated and yields an empty
+            // string instead of a StringIndexOutOfBoundsException.
+            builder.reset(out, 0);
+            builder.finish();
+            return;
+        }
+
+        // Size estimate for the result: scale charLength by the average character width seen while skipping.
+        // When nothing was skipped there is no average to use (the former floating-point expression divided
+        // 0 by 0 and silently estimated 0), so fall back to charLength itself.
+        final int remaining = utfLen - byteIdx;
+        final long estimate = chIdx == 0 ? charLength : (long) charLength * byteIdx / chIdx;
+        builder.reset(out, (int) Math.min(remaining, estimate));
+
+        // Copy up to charLength characters, stopping early at the end of src.
+        chIdx = 0;
+        while (byteIdx < utfLen && chIdx < charLength) {
+            builder.appendChar(src.charAt(metaLen + byteIdx));
+            chIdx++;
+            byteIdx += src.charSize(metaLen + byteIdx);
+        }
+        builder.finish();
+    }
+
+    /**
+     * Writes the part of this string that precedes {@code match}; shorthand for
+     * {@link #substrBefore(UTF8StringPointable, UTF8StringPointable, UTF8StringBuilder, GrowableArray)} with
+     * this instance as the source.
+     *
+     * @param match   the pattern to search for
+     * @param builder the builder used to write the result
+     * @param out     the storage the result is written to
+     * @throws IOException propagated from the static overload
+     */
+    public void substrBefore(UTF8StringPointable match, UTF8StringBuilder builder, GrowableArray out)
+            throws IOException {
+        substrBefore(this, match, builder, out);
+    }
 
-        while (c < utflen) {
-            char ch = charAt(bytes, sStart + c);
-            h = 31 * h + ch;
-            c += charSize(bytes, sStart + c);
+    /**
+     * Writes the part of {@code src} that precedes {@code match}, as located by
+     * {@code find(src, match, false)}. An empty string is written if that lookup returns a negative offset.
+     *
+     * @param src     the string to take the substring from
+     * @param match   the pattern to search for
+     * @param builder the builder used to write the result
+     * @param out     the storage the result is written to
+     * @throws IOException propagated from the builder calls
+     */
+    public static void substrBefore(
+            UTF8StringPointable src,
+            UTF8StringPointable match,
+            UTF8StringBuilder builder,
+            GrowableArray out) throws IOException {
+
+        // NOTE(review): find(...) is defined outside this view; presumably it returns the byte offset of the
+        // first match within src's content and a negative value when there is none — confirm.
+        int byteOffset = find(src, match, false);
+        if (byteOffset < 0) {
+            builder.reset(out, 0);
+            builder.finish();
+            return;
         }
-        return h;
+
+        final int srcMetaLen = src.getMetaDataLength();
+
+        // copy src character by character until the match position is reached
+        builder.reset(out, byteOffset);
+        for (int idx = 0; idx < byteOffset; ) {
+            builder.appendChar(src.charAt(srcMetaLen + idx));
+            idx += src.charSize(srcMetaLen + idx);
+        }
+        builder.finish();
+    }
+
+    /**
+     * Writes the part of this string that follows {@code match}; shorthand for
+     * {@link #substrAfter(UTF8StringPointable, UTF8StringPointable, UTF8StringBuilder, GrowableArray)} with
+     * this instance as the source.
+     *
+     * @param match   the pattern to search for
+     * @param builder the builder used to write the result
+     * @param out     the storage the result is written to
+     * @throws IOException propagated from the static overload
+     */
+    public void substrAfter(UTF8StringPointable match, UTF8StringBuilder builder, GrowableArray out)
+            throws IOException {
+        substrAfter(this, match, builder, out);
     }
 
-    public static void toString(StringBuilder buffer, byte[] bytes, int start) {
-        int utfLen = getUTFLength(bytes, start);
-        int offset = 2;
-        while (utfLen > 0) {
-            char c = charAt(bytes, start + offset);
-            buffer.append(c);
-            int cLen = UTF8StringPointable.getModifiedUTF8Len(c);
-            offset += cLen;
-            utfLen -= cLen;
+    /**
+     * Writes the part of {@code src} that follows {@code match}, as located by
+     * {@code find(src, match, false)}. An empty string is written if that lookup returns a negative offset.
+     *
+     * @param src     the string to take the substring from
+     * @param match   the pattern to search for
+     * @param builder the builder used to write the result
+     * @param out     the storage the result is written to
+     * @throws IOException propagated from the builder calls
+     */
+    public static void substrAfter(
+            UTF8StringPointable src,
+            UTF8StringPointable match,
+            UTF8StringBuilder builder,
+            GrowableArray out) throws IOException {
+
+        // NOTE(review): find(...) is defined outside this view; presumably it returns the byte offset of the
+        // first match within src's content and a negative value when there is none — confirm.
+        int byteOffset = find(src, match, false);
+        if (byteOffset < 0) {
+            builder.reset(out, 0);
+            builder.finish();
+            return;
         }
+
+        final int srcUtfLen = src.getUTF8Length();
+        final int matchUtfLen = match.getUTF8Length();
+
+        // everything after the match: total length minus the part before the match and the match itself
+        final int resultLen = srcUtfLen - byteOffset - matchUtfLen;
+        builder.reset(out, resultLen);
+        // NOTE(review): the source position is based on getCharStartOffset() here, whereas sibling methods
+        // index from getMetaDataLength(); presumably this overload expects a different kind of offset — confirm.
+        builder.appendUtf8StringPointable(src, src.getCharStartOffset() + byteOffset + matchUtfLen, resultLen);
+        builder.finish();
     }
 
-    public void toString(StringBuilder buffer) {
-        toString(buffer, bytes, start);
+    /**
+     * Writes a lower-cased copy of this string; shorthand for
+     * {@link #lowercase(UTF8StringPointable, UTF8StringBuilder, GrowableArray)} with this instance as the
+     * source.
+     *
+     * @param builder the builder used to write the result
+     * @param out     the storage the result is written to
+     * @throws IOException propagated from the static overload
+     */
+    public void lowercase(UTF8StringBuilder builder, GrowableArray out) throws IOException {
+        lowercase(this, builder, out);
+    }
+
+    /**
+     * Writes a lower-cased copy of {@code src}: each character is read in order, mapped through
+     * {@link Character#toLowerCase(char)} and appended to the builder.
+     *
+     * @param src     the string to convert
+     * @param builder the builder used to write the result
+     * @param out     the storage the result is written to
+     * @throws IOException propagated from the builder calls
+     */
+    public static void lowercase(UTF8StringPointable src, UTF8StringBuilder builder, GrowableArray out)
+            throws IOException {
+        final int contentLen = src.getUTF8Length();
+        final int contentStart = src.getMetaDataLength();
+
+        // The source length serves as the size estimate for the result.
+        builder.reset(out, contentLen);
+        for (int pos = 0; pos < contentLen; pos += src.charSize(contentStart + pos)) {
+            final char lowered = Character.toLowerCase(src.charAt(contentStart + pos));
+            builder.appendChar(lowered);
+        }
+        builder.finish();
     }
+
+    /**
+     * Writes an upper-cased copy of this string; shorthand for
+     * {@link #uppercase(UTF8StringPointable, UTF8StringBuilder, GrowableArray)} with this instance as the
+     * source.
+     *
+     * @param builder the builder used to write the result
+     * @param out     the storage the result is written to
+     * @throws IOException propagated from the static overload
+     */
+    public void uppercase(UTF8StringBuilder builder, GrowableArray out) throws IOException {
+        uppercase(this, builder, out);
+    }
+
+    /**
+     * Writes an upper-cased copy of {@code src}: each character is read in order, mapped through
+     * {@link Character#toUpperCase(char)} and appended to the builder.
+     *
+     * @param src     the string to convert
+     * @param builder the builder used to write the result
+     * @param out     the storage the result is written to
+     * @throws IOException propagated from the builder calls
+     */
+    public static void uppercase(UTF8StringPointable src, UTF8StringBuilder builder, GrowableArray out)
+            throws IOException {
+        final int contentLen = src.getUTF8Length();
+        final int contentStart = src.getMetaDataLength();
+
+        // The source length serves as the size estimate for the result.
+        builder.reset(out, contentLen);
+        for (int pos = 0; pos < contentLen; pos += src.charSize(contentStart + pos)) {
+            final char raised = Character.toUpperCase(src.charAt(contentStart + pos));
+            builder.appendChar(raised);
+        }
+        builder.finish();
+    }
+
 }

http://git-wip-us.apache.org/repos/asf/incubator-asterixdb-hyracks/blob/26c3b536/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/primitive/UTF8StringWriter.java
----------------------------------------------------------------------
diff --git a/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/primitive/UTF8StringWriter.java b/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/primitive/UTF8StringWriter.java
deleted file mode 100644
index ae7e903..0000000
--- a/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/primitive/UTF8StringWriter.java
+++ /dev/null
@@ -1,79 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package org.apache.hyracks.data.std.primitive;
-
-import java.io.DataOutput;
-import java.io.IOException;
-import java.io.UTFDataFormatException;
-
-public class UTF8StringWriter {
-    private byte[] tempBytes;
-
-    public void writeUTF8String(CharSequence str, DataOutput out) throws IOException {
-        int strlen = str.length();
-        int utflen = 0;
-        int c, count = 0;
-
-        for (int i = 0; i < strlen; i++) {
-            c = str.charAt(i);
-            if ((c >= 0x0001) && (c <= 0x007F)) {
-                utflen++;
-            } else if (c > 0x07FF) {
-                utflen += 3;
-            } else {
-                utflen += 2;
-            }
-        }
-
-        if (utflen > 65535) {
-            throw new UTFDataFormatException("encoded string too long: " + utflen + " bytes");
-        }
-
-        if (tempBytes == null || tempBytes.length < utflen + 2) {
-            tempBytes = new byte[utflen + 2];
-        }
-
-        tempBytes[count++] = (byte) ((utflen >>> 8) & 0xFF);
-        tempBytes[count++] = (byte) ((utflen >>> 0) & 0xFF);
-
-        int i = 0;
-        for (i = 0; i < strlen; i++) {
-            c = str.charAt(i);
-            if (!((c >= 0x0001) && (c <= 0x007F))) {
-                break;
-            }
-            tempBytes[count++] = (byte) c;
-        }
-
-        for (; i < strlen; i++) {
-            c = str.charAt(i);
-            if ((c >= 0x0001) && (c <= 0x007F)) {
-                tempBytes[count++] = (byte) c;
-            } else if (c > 0x07FF) {
-                tempBytes[count++] = (byte) (0xE0 | ((c >> 12) & 0x0F));
-                tempBytes[count++] = (byte) (0x80 | ((c >> 6) & 0x3F));
-                tempBytes[count++] = (byte) (0x80 | ((c >> 0) & 0x3F));
-            } else {
-                tempBytes[count++] = (byte) (0xC0 | ((c >> 6) & 0x1F));
-                tempBytes[count++] = (byte) (0x80 | ((c >> 0) & 0x3F));
-            }
-        }
-        out.write(tempBytes, 0, utflen + 2);
-    }
-}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-asterixdb-hyracks/blob/26c3b536/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/util/AbstractVarLenObjectBuilder.java
----------------------------------------------------------------------
diff --git a/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/util/AbstractVarLenObjectBuilder.java b/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/util/AbstractVarLenObjectBuilder.java
new file mode 100644
index 0000000..452710e
--- /dev/null
+++ b/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/util/AbstractVarLenObjectBuilder.java
@@ -0,0 +1,96 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.hyracks.data.std.util;
+
+import java.io.DataOutput;
+import java.io.IOException;
+
+import org.apache.hyracks.util.encoding.VarLenIntEncoderDecoder;
+
+/**
+ * This builder is used to build the variable length encoding object (e.g. UTF8String or ByteArray).
+ * The caller needs to give an estimated length when {@link #reset(GrowableArray, int)}.
+ * Then it can append the content byte by byte.
+ * Since the actual byte length to store the content length is not precise at the beginning, the caller need
+ * to explicitly call the {@link #finish()} function to notify that one object has finished building.
+ * Then internally this builder will take care of storing the actual length field at the beginning of the
+ * given storage array.
+ */
+public abstract class AbstractVarLenObjectBuilder {
+    /** Destination storage; assigned by {@link #reset(GrowableArray, int)}. */
+    protected GrowableArray ary;
+    /** Data output obtained from {@link #ary}; subclasses append the object's content through it. */
+    protected DataOutput out;
+    /** Length of {@link #ary} when the current object was started, i.e. where its length field begins. */
+    protected int startOffset;
+    /** Number of bytes reserved for the length field, derived from the estimated length. */
+    protected int estimateMetaLen;
+
+    /**
+     * Starts building a variable length object at the current end of {@code ary}.
+     * Room for the length field is reserved up front, sized for {@code estimateLength}.
+     *
+     * @param ary            the destination storage array
+     * @param estimateLength the estimated length of this object's content
+     * @throws IOException if writing the placeholder bytes fails
+     */
+    public void reset(GrowableArray ary, int estimateLength) throws IOException {
+        this.ary = ary;
+        this.out = ary.getDataOutput();
+        this.startOffset = ary.getLength();
+        this.estimateMetaLen = VarLenIntEncoderDecoder.getBytesRequired(estimateLength);
+
+        // Reserve the length field with placeholder bytes; finish() overwrites them.
+        for (int i = 0; i < estimateMetaLen; i++) {
+            out.writeByte(0);
+        }
+    }
+
+    /**
+     * Finishes building a variable length object by writing its actual length in front of the content.
+     * If the actual length needs a different number of length-field bytes than was reserved from the
+     * estimate, the content is moved first: leftward (and the storage position rewound by the surplus) when
+     * fewer bytes are needed, rightward (after growing the storage) when more are needed.
+     *
+     * @throws IOException if growing the storage fails
+     */
+    public void finish() throws IOException {
+        final int estimatedDataStart = startOffset + estimateMetaLen;
+        final int actualDataLength = ary.getLength() - estimatedDataStart;
+        final int actualMetaLen = VarLenIntEncoderDecoder.getBytesRequired(actualDataLength);
+        final int actualDataStart = startOffset + actualMetaLen;
+
+        if (actualMetaLen < estimateMetaLen) { // shrink
+            final byte[] bytes = ary.getByteArray();
+            System.arraycopy(bytes, estimatedDataStart, bytes, actualDataStart, actualDataLength);
+            ary.rewindPositionBy(estimateMetaLen - actualMetaLen);
+        } else if (actualMetaLen > estimateMetaLen) { // increase space
+            for (int i = estimateMetaLen; i < actualMetaLen; i++) {
+                out.writeByte(0);
+            }
+            // Fetch the array only after appending: growing the storage may replace the backing array.
+            // The whole content must move, including its first bytes, which the wider length field overwrites.
+            final byte[] bytes = ary.getByteArray();
+            System.arraycopy(bytes, estimatedDataStart, bytes, actualDataStart, actualDataLength);
+        }
+        VarLenIntEncoderDecoder.encode(actualDataLength, ary.getByteArray(), startOffset);
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/incubator-asterixdb-hyracks/blob/26c3b536/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/util/ByteArrayAccessibleOutputStream.java
----------------------------------------------------------------------
diff --git a/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/util/ByteArrayAccessibleOutputStream.java b/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/util/ByteArrayAccessibleOutputStream.java
index 2f1ad1d..287e2f2 100644
--- a/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/util/ByteArrayAccessibleOutputStream.java
+++ b/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/util/ByteArrayAccessibleOutputStream.java
@@ -41,6 +41,26 @@ public class ByteArrayAccessibleOutputStream extends ByteArrayOutputStream {
         count += 1;
     }
 
+    /**
+     * Rewinds the current position by {@code delta} bytes, dropping the last {@code delta} bytes written.
+     * This is for the case where more bytes were written than turned out to be needed, and the position
+     * has to be moved back to the expected one.
+     *
+     * Currently, it is used by the {@link AbstractVarLenObjectBuilder} which may take more space than required
+     * at the beginning, and which shifts the data and fixes the position whenever required.
+     *
+     * Evil function, use with caution.
+     *
+     * @param delta the number of bytes to rewind by; must not be negative nor exceed the bytes written so far
+     * @throws IndexOutOfBoundsException if {@code delta} is negative or greater than the current byte count
+     */
+    public void rewindPositionBy(int delta) {
+        if (delta < 0 || delta > count) {
+            throw new IndexOutOfBoundsException("cannot rewind by " + delta + " bytes, current count is " + count);
+        }
+        count -= delta;
+    }
+
     @Override
     public void write(byte[] b, int off, int len) {
         if ((off < 0) || (off > b.length) || (len < 0) || ((off + len) - b.length > 0)) {
@@ -60,9 +80,8 @@ public class ByteArrayAccessibleOutputStream extends ByteArrayOutputStream {
     /**
      * Increases the capacity to ensure that it can hold at least the
      * number of elements specified by the minimum capacity argument.
-     * 
-     * @param minCapacity
-     *            the desired minimum capacity
+     *
+     * @param minCapacity the desired minimum capacity
      */
     private void grow(int minCapacity) {
         // overflow-conscious code

http://git-wip-us.apache.org/repos/asf/incubator-asterixdb-hyracks/blob/26c3b536/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/util/ByteArrayBuilder.java
----------------------------------------------------------------------
diff --git a/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/util/ByteArrayBuilder.java b/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/util/ByteArrayBuilder.java
new file mode 100644
index 0000000..61b15d4
--- /dev/null
+++ b/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/util/ByteArrayBuilder.java
@@ -0,0 +1,33 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.hyracks.data.std.util;
+
+import java.io.IOException;
+
+/**
+ * An {@link AbstractVarLenObjectBuilder} whose content is appended as raw bytes.
+ * Both append methods write through the inherited {@code out}, which is assigned by
+ * {@code reset(GrowableArray, int)}; call that before appending and {@code finish()} afterwards.
+ */
+public class ByteArrayBuilder extends AbstractVarLenObjectBuilder {
+
+    /**
+     * Appends a single byte to the object being built.
+     *
+     * @param b the byte to append
+     * @throws IOException propagated from the underlying output
+     */
+    public void appendByte(byte b) throws IOException {
+        out.writeByte(b);
+    }
+
+    /**
+     * Appends {@code length} bytes of {@code bytes}, starting at index {@code start}, to the object being built.
+     *
+     * @param bytes  the array to copy from
+     * @param start  index of the first byte to append
+     * @param length number of bytes to append
+     * @throws IOException propagated from the underlying output
+     */
+    public void appendBytes(byte[] bytes, int start, int length) throws IOException {
+        out.write(bytes, start, length);
+    }
+}

http://git-wip-us.apache.org/repos/asf/incubator-asterixdb-hyracks/blob/26c3b536/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/util/GrowableArray.java
----------------------------------------------------------------------
diff --git a/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/util/GrowableArray.java b/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/util/GrowableArray.java
index d08412e..6e329ab 100644
--- a/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/util/GrowableArray.java
+++ b/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/util/GrowableArray.java
@@ -20,7 +20,6 @@
 package org.apache.hyracks.data.std.util;
 
 import java.io.DataOutput;
-import java.io.DataOutputStream;
 import java.io.IOException;
 
 import org.apache.hyracks.data.std.api.IDataOutputProvider;
@@ -28,7 +27,7 @@ import org.apache.hyracks.data.std.api.IValueReference;
 
 public class GrowableArray implements IDataOutputProvider {
     private final ByteArrayAccessibleOutputStream baaos = new ByteArrayAccessibleOutputStream();
-    private final DataOutputStream dos = new DataOutputStream(baaos);
+    private final RewindableDataOutputStream dos = new RewindableDataOutputStream(baaos);
 
     @Override
     public DataOutput getDataOutput() {
@@ -39,6 +38,24 @@ public class GrowableArray implements IDataOutputProvider {
         baaos.reset();
     }
 
+    /**
+     * Rewinds the current position by {@code delta} bytes, dropping the last {@code delta} bytes written.
+     * This is for the case where more bytes were written than turned out to be needed, and the position
+     * has to be moved back to the expected one.
+     *
+     * Currently, it is used by the {@link AbstractVarLenObjectBuilder} which may take more space than required
+     * at the beginning, and which shifts the data and fixes the position whenever required.
+     * The rewind is applied to the underlying byte stream first and then to the data output wrapped around it.
+     * The byte stream throws {@link IndexOutOfBoundsException} if {@code delta} is negative or larger than
+     * the number of bytes it currently holds.
+     * Evil function, use with caution.
+     *
+     * @param delta the number of bytes to rewind by
+     */
+    public void rewindPositionBy(int delta) {
+        baaos.rewindPositionBy(delta);
+        // NOTE(review): presumably keeps the data output's own written-byte bookkeeping in step — confirm
+        dos.rewindWrittenBy(delta);
+    }
+
+    /**
+     * Returns the byte array of the underlying stream.
+     * NOTE(review): presumably this is the live backing array rather than a copy, since
+     * {@link AbstractVarLenObjectBuilder} writes into the returned array in place — confirm.
+     *
+     * @return the array obtained from the underlying stream
+     */
     public byte[] getByteArray() {
         return baaos.getByteArray();
     }

http://git-wip-us.apache.org/repos/asf/incubator-asterixdb-hyracks/blob/26c3b536/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/util/ICharIterator.java
----------------------------------------------------------------------
diff --git a/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/util/ICharIterator.java b/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/util/ICharIterator.java
new file mode 100644
index 0000000..118893b
--- /dev/null
+++ b/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/util/ICharIterator.java
@@ -0,0 +1,27 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.hyracks.data.std.util;
+
+/**
+ * Iterates over a sequence of {@code char} values.
+ * NOTE(review): no implementation is visible here; the contract below is presumed from the method names and
+ * the {@link java.util.Iterator} convention — confirm against the implementing classes.
+ */
+public interface ICharIterator {
+
+    /**
+     * @return presumably {@code true} while {@link #next()} can still supply a character
+     */
+    boolean hasNext();
+
+    /**
+     * @return presumably the next character of the iteration
+     */
+    char next();
+}