You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@asterixdb.apache.org by ji...@apache.org on 2015/10/29 01:25:04 UTC
[2/7] incubator-asterixdb-hyracks git commit: ASTERIXDB-1102: VarSize Encoding to store length of String and ByteArray

http://git-wip-us.apache.org/repos/asf/incubator-asterixdb-hyracks/blob/26c3b536/hyracks/hyracks-tests/hyracks-storage-am-lsm-invertedindex-test/src/test/java/org/apache/hyracks/storage/am/lsm/invertedindex/tokenizers/NGramTokenizerTest.java
----------------------------------------------------------------------
diff --git a/hyracks/hyracks-tests/hyracks-storage-am-lsm-invertedindex-test/src/test/java/org/apache/hyracks/storage/am/lsm/invertedindex/tokenizers/NGramTokenizerTest.java b/hyracks/hyracks-tests/hyracks-storage-am-lsm-invertedindex-test/src/test/java/org/apache/hyracks/storage/am/lsm/invertedindex/tokenizers/NGramTokenizerTest.java
index f372dbe..6e764c3 100644
--- a/hyracks/hyracks-tests/hyracks-storage-am-lsm-invertedindex-test/src/test/java/org/apache/hyracks/storage/am/lsm/invertedindex/tokenizers/NGramTokenizerTest.java
+++ b/hyracks/hyracks-tests/hyracks-storage-am-lsm-invertedindex-test/src/test/java/org/apache/hyracks/storage/am/lsm/invertedindex/tokenizers/NGramTokenizerTest.java
@@ -20,21 +20,19 @@
 package org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers;
 
 import java.io.ByteArrayInputStream;
-import java.io.ByteArrayOutputStream;
 import java.io.DataInput;
 import java.io.DataInputStream;
-import java.io.DataOutput;
-import java.io.DataOutputStream;
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.HashMap;
 
+import org.apache.hyracks.data.std.util.GrowableArray;
+import org.apache.hyracks.util.string.UTF8StringReader;
+import org.apache.hyracks.util.string.UTF8StringUtil;
 import org.junit.Assert;
 import org.junit.Before;
 import org.junit.Test;
 
-import org.apache.hyracks.data.std.util.GrowableArray;
-
 public class NGramTokenizerTest {
 
     private char PRECHAR = '#';
@@ -72,11 +70,7 @@ public class NGramTokenizerTest {
 
     @Before
     public void init() throws Exception {
-        // serialize string into bytes
-        ByteArrayOutputStream baos = new ByteArrayOutputStream();
-        DataOutput dos = new DataOutputStream(baos);
-        dos.writeUTF(str);
-        inputBuffer = baos.toByteArray();
+        inputBuffer = UTF8StringUtil.writeStringToBytes(str);
     }
 
     void runTestNGramTokenizerWithCountedHashedUTF8Tokens(boolean prePost) throws IOException {
@@ -192,7 +186,8 @@ public class NGramTokenizerTest {
             ByteArrayInputStream bais = new ByteArrayInputStream(tokenData.getByteArray());
             DataInput in = new DataInputStream(bais);
 
-            String strGram = in.readUTF();
+            UTF8StringReader reader = new UTF8StringReader();
+            String strGram = reader.readUTF(in);
 
             // System.out.println("\"" + strGram + "\"");
 

http://git-wip-us.apache.org/repos/asf/incubator-asterixdb-hyracks/blob/26c3b536/hyracks/hyracks-tests/hyracks-storage-am-lsm-invertedindex-test/src/test/java/org/apache/hyracks/storage/am/lsm/invertedindex/tokenizers/WordTokenizerTest.java
----------------------------------------------------------------------
diff --git a/hyracks/hyracks-tests/hyracks-storage-am-lsm-invertedindex-test/src/test/java/org/apache/hyracks/storage/am/lsm/invertedindex/tokenizers/WordTokenizerTest.java b/hyracks/hyracks-tests/hyracks-storage-am-lsm-invertedindex-test/src/test/java/org/apache/hyracks/storage/am/lsm/invertedindex/tokenizers/WordTokenizerTest.java
index c42022e..78ba6a3 100644
--- a/hyracks/hyracks-tests/hyracks-storage-am-lsm-invertedindex-test/src/test/java/org/apache/hyracks/storage/am/lsm/invertedindex/tokenizers/WordTokenizerTest.java
+++ b/hyracks/hyracks-tests/hyracks-storage-am-lsm-invertedindex-test/src/test/java/org/apache/hyracks/storage/am/lsm/invertedindex/tokenizers/WordTokenizerTest.java
@@ -20,21 +20,19 @@
 package org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers;
 
 import java.io.ByteArrayInputStream;
-import java.io.ByteArrayOutputStream;
 import java.io.DataInput;
 import java.io.DataInputStream;
-import java.io.DataOutput;
-import java.io.DataOutputStream;
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.HashMap;
 
-import junit.framework.Assert;
-
+import org.apache.hyracks.data.std.util.GrowableArray;
+import org.apache.hyracks.util.string.UTF8StringReader;
+import org.apache.hyracks.util.string.UTF8StringUtil;
 import org.junit.Before;
 import org.junit.Test;
 
-import org.apache.hyracks.data.std.util.GrowableArray;
+import junit.framework.Assert;
 
 public class WordTokenizerTest {
 
@@ -46,7 +44,8 @@ public class WordTokenizerTest {
     private ArrayList<Integer> expectedCountedHashedUTF8Tokens = new ArrayList<Integer>();
 
     private boolean isSeparator(char c) {
-        return !(Character.isLetterOrDigit(c) || Character.getType(c) == Character.OTHER_LETTER || Character.getType(c) == Character.OTHER_NUMBER);
+        return !(Character.isLetterOrDigit(c) || Character.getType(c) == Character.OTHER_LETTER
+                || Character.getType(c) == Character.OTHER_NUMBER);
     }
 
     private void tokenize(String text, ArrayList<String> tokens) {
@@ -78,10 +77,7 @@ public class WordTokenizerTest {
     @Before
     public void init() throws IOException {
         // serialize text into bytes
-        ByteArrayOutputStream baos = new ByteArrayOutputStream();
-        DataOutput dos = new DataOutputStream(baos);
-        dos.writeUTF(text);
-        inputBuffer = baos.toByteArray();
+        inputBuffer = UTF8StringUtil.writeStringToBytes(text);
 
         // init expected string tokens
         tokenize(text, expectedUTF8Tokens);
@@ -144,7 +140,8 @@ public class WordTokenizerTest {
     public void testWordTokenizerWithHashedUTF8Tokens() throws IOException {
 
         HashedUTF8WordTokenFactory tokenFactory = new HashedUTF8WordTokenFactory();
-        DelimitedUTF8StringBinaryTokenizer tokenizer = new DelimitedUTF8StringBinaryTokenizer(true, false, tokenFactory);
+        DelimitedUTF8StringBinaryTokenizer tokenizer = new DelimitedUTF8StringBinaryTokenizer(true, false,
+                tokenFactory);
 
         tokenizer.reset(inputBuffer, 0, inputBuffer.length);
 
@@ -175,7 +172,8 @@ public class WordTokenizerTest {
     public void testWordTokenizerWithUTF8Tokens() throws IOException {
 
         UTF8WordTokenFactory tokenFactory = new UTF8WordTokenFactory();
-        DelimitedUTF8StringBinaryTokenizer tokenizer = new DelimitedUTF8StringBinaryTokenizer(true, false, tokenFactory);
+        DelimitedUTF8StringBinaryTokenizer tokenizer = new DelimitedUTF8StringBinaryTokenizer(true, false,
+                tokenFactory);
 
         tokenizer.reset(inputBuffer, 0, inputBuffer.length);
 
@@ -194,7 +192,8 @@ public class WordTokenizerTest {
             ByteArrayInputStream bais = new ByteArrayInputStream(tokenData.getByteArray());
             DataInput in = new DataInputStream(bais);
 
-            String strToken = in.readUTF();
+            UTF8StringReader reader = new UTF8StringReader();
+            String strToken = reader.readUTF(in);
 
             Assert.assertEquals(expectedUTF8Tokens.get(tokenCount), strToken);
 

http://git-wip-us.apache.org/repos/asf/incubator-asterixdb-hyracks/blob/26c3b536/hyracks/hyracks-tests/hyracks-storage-am-lsm-invertedindex-test/src/test/java/org/apache/hyracks/storage/am/lsm/invertedindex/util/LSMInvertedIndexTestUtils.java
----------------------------------------------------------------------
diff --git a/hyracks/hyracks-tests/hyracks-storage-am-lsm-invertedindex-test/src/test/java/org/apache/hyracks/storage/am/lsm/invertedindex/util/LSMInvertedIndexTestUtils.java b/hyracks/hyracks-tests/hyracks-storage-am-lsm-invertedindex-test/src/test/java/org/apache/hyracks/storage/am/lsm/invertedindex/util/LSMInvertedIndexTestUtils.java
index 36f615f..fd94870 100644
--- a/hyracks/hyracks-tests/hyracks-storage-am-lsm-invertedindex-test/src/test/java/org/apache/hyracks/storage/am/lsm/invertedindex/util/LSMInvertedIndexTestUtils.java
+++ b/hyracks/hyracks-tests/hyracks-storage-am-lsm-invertedindex-test/src/test/java/org/apache/hyracks/storage/am/lsm/invertedindex/util/LSMInvertedIndexTestUtils.java
@@ -88,7 +88,7 @@ public class LSMInvertedIndexTestUtils {
         fieldGens[0] = new DocumentStringFieldValueGenerator(2, 10, 10000, rnd);
         fieldGens[1] = new SortedIntegerFieldValueGenerator(0);
         ISerializerDeserializer[] fieldSerdes = new ISerializerDeserializer[] {
-                UTF8StringSerializerDeserializer.INSTANCE, IntegerSerializerDeserializer.INSTANCE };
+                new UTF8StringSerializerDeserializer(), IntegerSerializerDeserializer.INSTANCE };
         TupleGenerator tupleGen = new TupleGenerator(fieldGens, fieldSerdes, 0);
         return tupleGen;
     }
@@ -98,7 +98,7 @@ public class LSMInvertedIndexTestUtils {
         fieldGens[0] = new PersonNameFieldValueGenerator(rnd, 0.5f);
         fieldGens[1] = new SortedIntegerFieldValueGenerator(0);
         ISerializerDeserializer[] fieldSerdes = new ISerializerDeserializer[] {
-                UTF8StringSerializerDeserializer.INSTANCE, IntegerSerializerDeserializer.INSTANCE };
+                new UTF8StringSerializerDeserializer(), IntegerSerializerDeserializer.INSTANCE };
         TupleGenerator tupleGen = new TupleGenerator(fieldGens, fieldSerdes, 0);
         return tupleGen;
     }
@@ -110,7 +110,7 @@ public class LSMInvertedIndexTestUtils {
             case INMEMORY:
             case ONDISK:
             case LSM: {
-                fieldSerdes = new ISerializerDeserializer[] { UTF8StringSerializerDeserializer.INSTANCE,
+                fieldSerdes = new ISerializerDeserializer[] { new UTF8StringSerializerDeserializer(),
                         IntegerSerializerDeserializer.INSTANCE };
                 break;
             }
@@ -118,7 +118,7 @@ public class LSMInvertedIndexTestUtils {
             case PARTITIONED_ONDISK:
             case PARTITIONED_LSM: {
                 // Such indexes also include the set-size for partitioning.
-                fieldSerdes = new ISerializerDeserializer[] { UTF8StringSerializerDeserializer.INSTANCE,
+                fieldSerdes = new ISerializerDeserializer[] { new UTF8StringSerializerDeserializer(),
                         ShortSerializerDeserializer.INSTANCE, IntegerSerializerDeserializer.INSTANCE };
                 break;
             }

http://git-wip-us.apache.org/repos/asf/incubator-asterixdb-hyracks/blob/26c3b536/hyracks/hyracks-util/pom.xml
----------------------------------------------------------------------
diff --git a/hyracks/hyracks-util/pom.xml b/hyracks/hyracks-util/pom.xml
new file mode 100644
index 0000000..ca38040
--- /dev/null
+++ b/hyracks/hyracks-util/pom.xml
@@ -0,0 +1,58 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one
+  ~ or more contributor license agreements.  See the NOTICE file
+  ~ distributed with this work for additional information
+  ~ regarding copyright ownership.  The ASF licenses this file
+  ~ to you under the Apache License, Version 2.0 (the
+  ~ "License"); you may not use this file except in compliance
+  ~ with the License.  You may obtain a copy of the License at
+  ~
+  ~   http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing,
+  ~ software distributed under the License is distributed on an
+  ~ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  ~ KIND, either express or implied.  See the License for the
+  ~ specific language governing permissions and limitations
+  ~ under the License.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <parent>
+        <artifactId>hyracks</artifactId>
+        <groupId>org.apache.hyracks</groupId>
+        <version>0.2.17-SNAPSHOT</version>
+    </parent>
+
+    <modelVersion>4.0.0</modelVersion>
+
+    <build>
+        <plugins>
+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-jar-plugin</artifactId>
+                <version>2.6</version>
+                <executions>
+                    <execution>
+                        <goals>
+                            <goal>test-jar</goal>
+                        </goals>
+                    </execution>
+                </executions>
+            </plugin>
+        </plugins>
+    </build>
+
+    <artifactId>hyracks-util</artifactId>
+    <dependencies>
+        <dependency>
+            <groupId>junit</groupId>
+            <artifactId>junit</artifactId>
+        </dependency>
+    </dependencies>
+
+
+</project>
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-asterixdb-hyracks/blob/26c3b536/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/bytes/Base64Parser.java
----------------------------------------------------------------------
diff --git a/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/bytes/Base64Parser.java b/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/bytes/Base64Parser.java
new file mode 100644
index 0000000..257daee
--- /dev/null
+++ b/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/bytes/Base64Parser.java
@@ -0,0 +1,250 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.hyracks.util.bytes;
+
+import java.util.Arrays;
+
+public class Base64Parser {
+    private static final byte[] DECODE_MAP = initDecodeMap();
+    private static final byte PADDING = 127;
+
+    private static byte[] initDecodeMap() {
+        byte[] map = new byte[128];
+        Arrays.fill(map, (byte) -1);
+
+        int i;
+        for (i = 'A'; i <= 'Z'; i++) {
+            map[i] = (byte) (i - 'A');
+        }
+        for (i = 'a'; i <= 'z'; i++) {
+            map[i] = (byte) (i - 'a' + 26);
+        }
+        for (i = '0'; i <= '9'; i++) {
+            map[i] = (byte) (i - '0' + 52);
+        }
+        map['+'] = 62;
+        map['/'] = 63;
+        map['='] = PADDING;
+
+        return map;
+    }
+
+    private byte[] quadruplet = new byte[4];
+    private byte[] storage;
+    private int length = 0;
+
+    /**
+     * Parse the Base64 sequence from {@code input} into {@code out}
+     * Note, the out should have enough space by checking the {@link #guessLength(char[], int, int)} first
+     *
+     * @param input
+     * @param start
+     * @param length
+     * @param out
+     * @param offset
+     * @return
+     */
+    public int parseBase64String(char[] input, int start, int length, byte[] out, int offset) {
+        int outLength = 0;
+
+        int i;
+        int q = 0;
+
+        // convert each quadruplet to three bytes.
+        for (i = 0; i < length; i++) {
+            char ch = input[start + i];
+            byte v = DECODE_MAP[ch];
+
+            if (v == -1) {
+                throw new IllegalArgumentException("Invalid Base64 character");
+            }
+            quadruplet[q++] = v;
+
+            if (q == 4) {
+                outLength += dumpQuadruplet(out, offset + outLength);
+                q = 0;
+            }
+        }
+
+        return outLength;
+    }
+
+    /**
+     * Parse the Base64 sequence from {@code input} into {@code out}
+     * Note, the out should have enough space by checking the {@link #guessLength(byte[], int, int)} first
+     *
+     * @param input
+     * @param start
+     * @param length
+     * @param out
+     * @param offset
+     * @return the number of written bytes
+     */
+    public int parseBase64String(byte[] input, int start, int length, byte[] out, int offset) {
+        int outLength = 0;
+
+        int i;
+        int q = 0;
+
+        // convert each quadruplet to three bytes.
+        for (i = 0; i < length; i++) {
+            char ch = (char) input[start + i];
+            byte v = DECODE_MAP[ch];
+
+            if (v == -1) {
+                throw new IllegalArgumentException("Invalid Base64 character");
+            }
+            quadruplet[q++] = v;
+
+            if (q == 4) {
+                outLength += dumpQuadruplet(out, offset + outLength);
+                q = 0;
+            }
+        }
+
+        return outLength;
+    }
+
+    /**
+     * computes the length of binary data speculatively.
+     * Our requirement is to create byte[] of the exact length to store the binary data.
+     * If we do this in a straight-forward way, it takes two passes over the data.
+     * Experiments show that this is a non-trivial overhead (35% or so is spent on
+     * the first pass in calculating the length.)
+     * So the approach here is that we compute the length speculatively, without looking
+     * at the whole contents. The obtained speculative value is never less than the
+     * actual length of the binary data, but it may be bigger. So if the speculation
+     * goes wrong, we'll pay the cost of reallocation and buffer copying.
+     * If the base64 text is tightly packed with no indentation nor illegal char
+     * (like what most web services produce), then the speculation of this method
+     * will be correct, so we get the performance benefit.
+     */
+    public static int guessLength(char[] chars, int start, int length) {
+
+        // compute the tail '=' chars
+        int j = length - 1;
+        for (; j >= 0; j--) {
+            byte code = DECODE_MAP[chars[start + j]];
+            if (code == PADDING) {
+                continue;
+            }
+            if (code == -1) // most likely this base64 text is indented. go with the upper bound
+            {
+                return length / 4 * 3;
+            }
+            break;
+        }
+
+        j++;    // text.charAt(j) is now at some base64 char, so +1 to make it the size
+        int padSize = length - j;
+        if (padSize > 2) // something is wrong with base64. be safe and go with the upper bound
+        {
+            return length / 4 * 3;
+        }
+
+        // so far this base64 looks like it's unindented tightly packed base64.
+        // take a chance and create an array with the expected size
+        return length / 4 * 3 - padSize;
+    }
+
+    public static int guessLength(byte[] chars, int start, int length) {
+
+        // compute the tail '=' chars
+        int j = length - 1;
+        for (; j >= 0; j--) {
+            byte code = DECODE_MAP[chars[start + j]];
+            if (code == PADDING) {
+                continue;
+            }
+            if (code == -1) // most likely this base64 text is indented. go with the upper bound
+            {
+                return length / 4 * 3;
+            }
+            break;
+        }
+
+        j++;    // text.charAt(j) is now at some base64 char, so +1 to make it the size
+        int padSize = length - j;
+        if (padSize > 2) // something is wrong with base64. be safe and go with the upper bound
+        {
+            return length / 4 * 3;
+        }
+
+        // so far this base64 looks like it's unindented tightly packed base64.
+        // take a chance and create an array with the expected size
+        return length / 4 * 3 - padSize;
+    }
+
+    public byte[] getByteArray() {
+        return storage;
+    }
+
+    public int getLength() {
+        return length;
+    }
+
+    /**
+     * Same as {@link #parseBase64String(byte[], int, int, byte[], int)}, but we will provide the storage for caller
+     *
+     * @param input
+     * @param start
+     * @param length
+     */
+    public void generatePureByteArrayFromBase64String(byte[] input, int start, int length) {
+        // The base64 character length equals to utf8length
+        if (length % 4 != 0) {
+            throw new IllegalArgumentException(
+                    "Invalid Base64 string, the length of the string should be a multiple of 4");
+        }
+        final int buflen = guessLength(input, start, length);
+        ensureCapacity(buflen);
+        this.length = parseBase64String(input, start, length, storage, 0);
+    }
+
+    public void generatePureByteArrayFromBase64String(char[] input, int start, int length) {
+        if (length % 4 != 0) {
+            throw new IllegalArgumentException(
+                    "Invalid Base64 string, the length of the string should be a multiple of 4");
+        }
+        final int buflen = guessLength(input, start, length);
+        ensureCapacity(buflen);
+        this.length = parseBase64String(input, start, length, storage, 0);
+    }
+
+    private void ensureCapacity(int length) {
+        if (storage == null || storage.length < length) {
+            storage = new byte[length];
+        }
+    }
+
+    private int dumpQuadruplet(byte[] out, int offset) {
+        int outLength = 0;
+        // quadruplet is now filled.
+        out[offset + outLength++] = (byte) ((quadruplet[0] << 2) | (quadruplet[1] >> 4));
+        if (quadruplet[2] != PADDING) {
+            out[offset + outLength++] = (byte) ((quadruplet[1] << 4) | (quadruplet[2] >> 2));
+        }
+        if (quadruplet[3] != PADDING) {
+            out[offset + outLength++] = (byte) ((quadruplet[2] << 6) | (quadruplet[3]));
+        }
+        return outLength;
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/incubator-asterixdb-hyracks/blob/26c3b536/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/bytes/Base64Printer.java
----------------------------------------------------------------------
diff --git a/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/bytes/Base64Printer.java b/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/bytes/Base64Printer.java
new file mode 100644
index 0000000..0e1c078
--- /dev/null
+++ b/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/bytes/Base64Printer.java
@@ -0,0 +1,125 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.hyracks.util.bytes;
+
+import java.io.IOException;
+
+public class Base64Printer {
+    /**
+     * Encodes a byte array into a {@code Appendable} stream by doing base64 encoding.
+     *
+     * @return the same input stream.
+     */
+    public static Appendable printBase64Binary(byte[] input, int offset, int len, Appendable appendable)
+            throws IOException {
+        // encode elements until only 1 or 2 elements are left to encode
+        int remaining = len;
+        int i;
+        for (i = offset; remaining >= 3; remaining -= 3, i += 3) {
+            appendable.append(encode(input[i] >> 2));
+            appendable.append(encode(
+                    ((input[i] & 0x3) << 4)
+                            | ((input[i + 1] >> 4) & 0xF)));
+            appendable.append(encode(
+                    ((input[i + 1] & 0xF) << 2)
+                            | ((input[i + 2] >> 6) & 0x3)));
+            appendable.append(encode(input[i + 2] & 0x3F));
+        }
+        // encode when exactly 1 element (left) to encode
+        if (remaining == 1) {
+            appendable.append(encode(input[i] >> 2));
+            appendable.append(encode(((input[i]) & 0x3) << 4));
+            appendable.append('=');
+            appendable.append('=');
+        }
+        // encode when exactly 2 elements (left) to encode
+        if (remaining == 2) {
+            appendable.append(encode(input[i] >> 2));
+            appendable.append(encode(((input[i] & 0x3) << 4)
+                    | ((input[i + 1] >> 4) & 0xF)));
+            appendable.append(encode((input[i + 1] & 0xF) << 2));
+            appendable.append('=');
+        }
+        return appendable;
+    }
+
+    /**
+     * Encodes a byte array into a char array by doing base64 encoding.
+     * The caller must supply a big enough buffer.
+     *
+     * @return the value of {@code ptr+((len+2)/3)*4}, which is the new offset
+     * in the output buffer where the further bytes should be placed.
+     */
+    public static int printBase64Binary(byte[] input, int offset, int len, char[] buf, int ptr) {
+        // encode elements until only 1 or 2 elements are left to encode
+        int remaining = len;
+        int i;
+        for (i = offset; remaining >= 3; remaining -= 3, i += 3) {
+            buf[ptr++] = encode(input[i] >> 2);
+            buf[ptr++] = encode(
+                    ((input[i] & 0x3) << 4)
+                            | ((input[i + 1] >> 4) & 0xF));
+            buf[ptr++] = encode(
+                    ((input[i + 1] & 0xF) << 2)
+                            | ((input[i + 2] >> 6) & 0x3));
+            buf[ptr++] = encode(input[i + 2] & 0x3F);
+        }
+        // encode when exactly 1 element (left) to encode
+        if (remaining == 1) {
+            buf[ptr++] = encode(input[i] >> 2);
+            buf[ptr++] = encode(((input[i]) & 0x3) << 4);
+            buf[ptr++] = '=';
+            buf[ptr++] = '=';
+        }
+        // encode when exactly 2 elements (left) to encode
+        if (remaining == 2) {
+            buf[ptr++] = encode(input[i] >> 2);
+            buf[ptr++] = encode(((input[i] & 0x3) << 4)
+                    | ((input[i + 1] >> 4) & 0xF));
+            buf[ptr++] = encode((input[i + 1] & 0xF) << 2);
+            buf[ptr++] = '=';
+        }
+        return ptr;
+    }
+
+    private static final char[] encodeMap = initEncodeMap();
+
+    private static char[] initEncodeMap() {
+        char[] map = new char[64];
+        int i;
+        for (i = 0; i < 26; i++) {
+            map[i] = (char) ('A' + i);
+        }
+        for (i = 26; i < 52; i++) {
+            map[i] = (char) ('a' + (i - 26));
+        }
+        for (i = 52; i < 62; i++) {
+            map[i] = (char) ('0' + (i - 52));
+        }
+        map[62] = '+';
+        map[63] = '/';
+
+        return map;
+    }
+
+    public static char encode(int i) {
+        return encodeMap[i & 0x3F];
+    }
+}

http://git-wip-us.apache.org/repos/asf/incubator-asterixdb-hyracks/blob/26c3b536/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/bytes/HexParser.java
----------------------------------------------------------------------
diff --git a/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/bytes/HexParser.java b/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/bytes/HexParser.java
new file mode 100644
index 0000000..ba7276b
--- /dev/null
+++ b/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/bytes/HexParser.java
@@ -0,0 +1,97 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.hyracks.util.bytes;
+
+public class HexParser {
+    public static boolean isValidHexChar(char c) {
+        if (c >= '0' && c <= '9'
+                || c >= 'a' && c <= 'f'
+                || c >= 'A' && c <= 'F') {
+            return true;
+        }
+        return false;
+    }
+
+    public static int getValueFromValidHexChar(char c) {
+        if (c >= '0' && c <= '9') {
+            return c - '0';
+        }
+        if (c >= 'a' && c <= 'f') {
+            return 10 + c - 'a';
+        }
+        if (c >= 'A' && c <= 'F') {
+            return 10 + c - 'A';
+        }
+        throw new IllegalArgumentException("Invalid hex character : " + c);
+    }
+
+    private byte[] storage;
+    private int length;
+
+    public byte[] getByteArray() {
+        return storage;
+    }
+
+    public int getLength() {
+        return length;
+    }
+
+    public void generateByteArrayFromHexString(char[] input, int start, int length) {
+        if (length % 2 != 0) {
+            throw new IllegalArgumentException(
+                    "Invalid hex string for binary type: the string length should be a muliple of 2.");
+        }
+        this.length = length / 2;
+        ensureCapacity(this.length);
+        generateByteArrayFromHexString(input, start, length, storage, 0);
+    }
+
+    public void generateByteArrayFromHexString(byte[] input, int start, int length) {
+        if (length % 2 != 0) {
+            throw new IllegalArgumentException(
+                    "Invalid hex string for binary type: the string length should be a muliple of 2.");
+        }
+        this.length = length / 2;
+        ensureCapacity(this.length);
+        generateByteArrayFromHexString(input, start, length, storage, 0);
+    }
+
+    private void ensureCapacity(int capacity) {
+        if (storage == null || storage.length < capacity) {
+            storage = new byte[capacity];
+        }
+    }
+
+    public static void generateByteArrayFromHexString(char[] input, int start, int length, byte[] output,
+            int offset) {
+        for (int i = 0; i < length; i += 2) {
+            output[offset + i / 2] = (byte) ((getValueFromValidHexChar(input[start + i]) << 4) +
+                    getValueFromValidHexChar(input[start + i + 1]));
+        }
+    }
+
+    public static void generateByteArrayFromHexString(byte[] input, int start, int length, byte[] output,
+            int offset) {
+        for (int i = 0; i < length; i += 2) {
+            output[offset + i / 2] = (byte) ((getValueFromValidHexChar((char) input[start + i]) << 4) +
+                    getValueFromValidHexChar((char) input[start + i + 1]));
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/incubator-asterixdb-hyracks/blob/26c3b536/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/bytes/HexPrinter.java
----------------------------------------------------------------------
diff --git a/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/bytes/HexPrinter.java b/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/bytes/HexPrinter.java
new file mode 100644
index 0000000..5a9c064
--- /dev/null
+++ b/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/bytes/HexPrinter.java
@@ -0,0 +1,48 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.hyracks.util.bytes;
+
+import java.io.IOException;
+
+public class HexPrinter {
+    public enum CASE {
+        LOWER_CASE,
+        UPPER_CASE,
+    }
+
+    public static byte hex(int i, CASE c) {
+        switch (c) {
+            case LOWER_CASE:
+                return (byte) (i < 10 ? i + '0' : i + ('a' - 10));
+            case UPPER_CASE:
+                return (byte) (i < 10 ? i + '0' : i + ('A' - 10));
+        }
+        return Byte.parseByte(null);
+    }
+
+    public static Appendable printHexString(byte[] bytes, int start, int length, Appendable appendable)
+            throws IOException {
+        for (int i = 0; i < length; ++i) {
+            appendable.append((char) hex((bytes[start + i] >>> 4) & 0x0f, CASE.UPPER_CASE));
+            appendable.append((char) hex((bytes[start + i] & 0x0f), CASE.UPPER_CASE));
+        }
+        return appendable;
+    }
+}

http://git-wip-us.apache.org/repos/asf/incubator-asterixdb-hyracks/blob/26c3b536/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/encoding/VarLenIntEncoderDecoder.java
----------------------------------------------------------------------
diff --git a/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/encoding/VarLenIntEncoderDecoder.java b/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/encoding/VarLenIntEncoderDecoder.java
new file mode 100644
index 0000000..5a716b4
--- /dev/null
+++ b/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/encoding/VarLenIntEncoderDecoder.java
@@ -0,0 +1,145 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.hyracks.util.encoding;
+
+import java.io.DataInput;
+import java.io.IOException;
+
+/**
+ * Encodes positive integers in a variable-bytes format.
+ *
+ * Each byte stores seven bits of the number. The first bit of each byte notifies if it is the last byte.
+ * Specifically, if the first bit is set, then we need to shift the current value by seven and
+ * continue to read the next byte util we meet a byte whose first byte is unset.
+ *
+ * e.g. if the number is < 128, it will be stored using one byte and the byte value keeps as original.
+ * To store the number 255 (0xff) , it will be encoded as [0x81,0x7f]. To decode that value, it reads the 0x81
+ * to know that the current value is (0x81 & 0x7f)= 0x01, and the first bit tells that there are more bytes to
+ * be read. When it meets 0x7f, whose first flag is unset, it knows that it is the final byte to decode.
+ * Finally it will return ( 0x01 << 7) + 0x7f === 255.
+ *
+ */
+public class VarLenIntEncoderDecoder {
+    // sometimes the dec number is easier to get the sense of how big it is.
+    public static final int BOUND_ONE_BYTE = 128; // 1 << 7
+    public static final int BOUND_TWO_BYTE = 16384; // 1 << 14
+    public static final int BOUND_THREE_BYTE = 2097152; // 1 << 21
+    public static final int BOUND_FOUR_BYTE = 268435456; // 1 << 28
+    public static final int BOUND_FIVE_BYTE = Integer.MAX_VALUE;
+
+    public static final int ENCODE_MASK = 0x0000007F;
+    public static final byte CONTINUE_CHUNK = (byte) 0x80;
+    public static final byte DECODE_MASK = 0x7F;
+
+    // calculate the number of bytes needed for encoding
+    public static int getBytesRequired(int length) {
+        if (length < 0) {
+            throw new IllegalArgumentException("The length must be an non-negative value");
+        }
+
+        int byteCount = 0;
+        while (length > ENCODE_MASK) {
+            length = length >>> 7;
+            byteCount++;
+        }
+        return byteCount + 1;
+    }
+
+    public static int decode(DataInput in) throws IOException {
+        int sum = 0;
+        byte b = in.readByte();
+        while ((b & CONTINUE_CHUNK) == CONTINUE_CHUNK) {
+            sum = (sum + (b & DECODE_MASK)) << 7;
+            b = in.readByte();
+        }
+        sum += b;
+        return sum;
+    }
+
+    public static int decode(byte[] srcBytes, int startPos) {
+        int sum = 0;
+        while ((srcBytes[startPos] & CONTINUE_CHUNK) == CONTINUE_CHUNK) {
+            sum = (sum + (srcBytes[startPos] & DECODE_MASK)) << 7;
+            startPos++;
+        }
+        sum += srcBytes[startPos++];
+        return sum;
+    }
+
+    public static int encode(int lengthVal, byte[] destBytes, int startPos) {
+        if (lengthVal < 0) {
+            throw new IllegalArgumentException("The length must be an non-negative value");
+        }
+        int nextPos = startPos;
+        while (lengthVal > ENCODE_MASK) {
+            destBytes[nextPos++] = (byte) (lengthVal & ENCODE_MASK);
+            lengthVal = lengthVal >>> 7;
+        }
+        destBytes[nextPos++] = (byte) lengthVal;
+
+        // reverse order to optimize for decoding speed
+        int length = nextPos - startPos;
+        int i = 0;
+        for (; i < length / 2; i++) {
+            byte b = destBytes[startPos + i];
+            destBytes[startPos + i] = (byte) (destBytes[startPos + length - 1 - i] | CONTINUE_CHUNK);
+            destBytes[startPos + length - 1 - i] = (byte) (b | CONTINUE_CHUNK);
+        }
+        destBytes[startPos + i] |= CONTINUE_CHUNK;
+        destBytes[nextPos - 1] &= ENCODE_MASK;
+        return length;
+    }
+
+    public static VarLenIntDecoder createDecoder() {
+        return new VarLenIntDecoder();
+    }
+
+    // keep the stateful version for the ease of the continuously decoding behaviors.
+    public static class VarLenIntDecoder {
+
+        private byte[] bytes = null;
+        private int pos = 0;
+
+        public VarLenIntDecoder reset(byte[] bytes, int pos) {
+            this.bytes = bytes;
+            this.pos = pos;
+            return this;
+        }
+
+        /**
+         * @return the int value
+         */
+        public int decode() {
+            int sum = 0;
+            while ((bytes[pos] & CONTINUE_CHUNK) == CONTINUE_CHUNK) {
+                sum = (sum + (bytes[pos] & DECODE_MASK)) << 7;
+                pos++;
+            }
+            sum += bytes[pos++];
+            return sum;
+        }
+
+        public int getPos() {
+            return pos;
+        }
+
+    }
+
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-asterixdb-hyracks/blob/26c3b536/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringReader.java
----------------------------------------------------------------------
diff --git a/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringReader.java b/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringReader.java
new file mode 100644
index 0000000..3cd0300
--- /dev/null
+++ b/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringReader.java
@@ -0,0 +1,131 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.hyracks.util.string;
+
+import java.io.DataInput;
+import java.io.EOFException;
+import java.io.IOException;
+import java.io.Serializable;
+import java.io.UTFDataFormatException;
+
+import org.apache.hyracks.util.encoding.VarLenIntEncoderDecoder;
+
+public class UTF8StringReader implements Serializable{
+
+    private byte[] bytearr = null;
+    private char[] chararr = null;
+
+    /**
+     * Reads from the
+     * stream <code>in</code> a representation
+     * of a Unicode  character string encoded in
+     * <a href="DataInput.html#modified-utf-8">modified UTF-8</a> format;
+     * this string of characters is then returned as a <code>String</code>.
+     * The details of the modified UTF-8 representation
+     * are  exactly the same as for the <code>readUTF</code>
+     * method of <code>DataInput</code>.
+     *
+     * @param in a data input stream.
+     * @return a Unicode string.
+     * @throws EOFException           if the input stream reaches the end
+     *                                before all the bytes.
+     * @throws IOException            the stream has been closed and the contained
+     *                                input stream does not support reading after close, or
+     *                                another I/O error occurs.
+     * @throws UTFDataFormatException if the bytes do not represent a
+     *                                valid modified UTF-8 encoding of a Unicode string.
+     * @see java.io.DataInputStream#readUnsignedShort()
+     */
+    public final String readUTF(DataInput in) throws IOException {
+        int utflen = VarLenIntEncoderDecoder.decode(in);
+
+        if (bytearr == null || bytearr.length < utflen) {
+            bytearr = new byte[utflen * 2];
+            chararr = new char[utflen * 2];
+        }
+
+        int c, char2, char3;
+        int count = 0;
+        int chararr_count = 0;
+
+        in.readFully(bytearr, 0, utflen);
+
+        while (count < utflen) {
+            c = (int) bytearr[count] & 0xff;
+            if (c > 127)
+                break;
+            count++;
+            chararr[chararr_count++] = (char) c;
+        }
+
+        while (count < utflen) {
+            c = (int) bytearr[count] & 0xff;
+            switch (c >> 4) {
+                case 0:
+                case 1:
+                case 2:
+                case 3:
+                case 4:
+                case 5:
+                case 6:
+                case 7:
+                    /* 0xxxxxxx*/
+                    count++;
+                    chararr[chararr_count++] = (char) c;
+                    break;
+                case 12:
+                case 13:
+                    /* 110x xxxx   10xx xxxx*/
+                    count += 2;
+                    if (count > utflen)
+                        throw new UTFDataFormatException(
+                                "malformed input: partial character at end");
+                    char2 = (int) bytearr[count - 1];
+                    if ((char2 & 0xC0) != 0x80)
+                        throw new UTFDataFormatException(
+                                "malformed input around byte " + count);
+                    chararr[chararr_count++] = (char) (((c & 0x1F) << 6) |
+                            (char2 & 0x3F));
+                    break;
+                case 14:
+                    /* 1110 xxxx  10xx xxxx  10xx xxxx */
+                    count += 3;
+                    if (count > utflen)
+                        throw new UTFDataFormatException(
+                                "malformed input: partial character at end");
+                    char2 = (int) bytearr[count - 2];
+                    char3 = (int) bytearr[count - 1];
+                    if (((char2 & 0xC0) != 0x80) || ((char3 & 0xC0) != 0x80))
+                        throw new UTFDataFormatException(
+                                "malformed input around byte " + (count - 1));
+                    chararr[chararr_count++] = (char) (((c & 0x0F) << 12) |
+                            ((char2 & 0x3F) << 6) |
+                            ((char3 & 0x3F) << 0));
+                    break;
+                default:
+                    /* 10xx xxxx,  1111 xxxx */
+                    throw new UTFDataFormatException(
+                            "malformed input around byte " + count);
+            }
+        }
+        // The number of chars produced may be less than utflen
+        return new String(chararr, 0, chararr_count);
+    }
+}

http://git-wip-us.apache.org/repos/asf/incubator-asterixdb-hyracks/blob/26c3b536/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java
----------------------------------------------------------------------
diff --git a/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java b/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java
new file mode 100644
index 0000000..7929691
--- /dev/null
+++ b/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java
@@ -0,0 +1,422 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.hyracks.util.string;
+
+import java.io.ByteArrayOutputStream;
+import java.io.DataOutput;
+import java.io.DataOutputStream;
+import java.io.IOException;
+import java.io.OutputStream;
+
+import org.apache.hyracks.util.encoding.VarLenIntEncoderDecoder;
+
+/**
+ * A helper package to operate the UTF8String in Hyracks.
+ * Most of the codes were migrated from asterix-fuzzyjoin and hyracks-storage-am-invertedindex
+ */
+public class UTF8StringUtil {
+    public static char charAt(byte[] b, int s) {
+        if (s >= b.length) {
+            throw new ArrayIndexOutOfBoundsException("Are you crazy?");
+        }
+        int c = b[s] & 0xff;
+        switch (c >> 4) {
+            case 0:
+            case 1:
+            case 2:
+            case 3:
+            case 4:
+            case 5:
+            case 6:
+            case 7:
+                return (char) c;
+
+            case 12:
+            case 13:
+                return (char) (((c & 0x1F) << 6) | ((b[s + 1]) & 0x3F));
+
+            case 14:
+                return (char) (((c & 0x0F) << 12) | (((b[s + 1]) & 0x3F) << 6) | (((b[s + 2]) & 0x3F) << 0));
+
+            default:
+                throw new IllegalArgumentException();
+        }
+    }
+
+    public static int charSize(byte[] b, int s) {
+        int c = b[s] & 0xff;
+        switch (c >> 4) {
+            case 0:
+            case 1:
+            case 2:
+            case 3:
+            case 4:
+            case 5:
+            case 6:
+            case 7:
+                return 1;
+
+            case 12:
+            case 13:
+                return 2;
+
+            case 14:
+                return 3;
+        }
+        throw new IllegalStateException();
+    }
+
+    public static int getModifiedUTF8Len(char c) {
+        if (c >= 0x0001 && c <= 0x007F) {
+            return 1;
+        } else if (c <= 0x07FF) {
+            return 2;
+        } else {
+            return 3;
+        }
+    }
+
+    public static int writeCharAsModifiedUTF8(char c, DataOutput dos) throws IOException {
+        if (c >= 0x0001 && c <= 0x007F) {
+            dos.writeByte(c);
+            return 1;
+        } else if (c <= 0x07FF) {
+            dos.writeByte((byte) (0xC0 | ((c >> 6) & 0x3F)));
+            dos.writeByte((byte) (0x80 | (c & 0x3F)));
+            return 2;
+        } else {
+            dos.writeByte((byte) (0xE0 | ((c >> 12) & 0x0F)));
+            dos.writeByte((byte) (0x80 | ((c >> 6) & 0x3F)));
+            dos.writeByte((byte) (0x80 | (c & 0x3F)));
+            return 3;
+        }
+    }
+
+    public static int writeCharAsModifiedUTF8(char c, OutputStream dos) throws IOException {
+        if (c >= 0x0001 && c <= 0x007F) {
+            dos.write(c);
+            return 1;
+        } else if (c <= 0x07FF) {
+            dos.write((byte) (0xC0 | ((c >> 6) & 0x3F)));
+            dos.write((byte) (0x80 | (c & 0x3F)));
+            return 2;
+        } else {
+            dos.write((byte) (0xE0 | ((c >> 12) & 0x0F)));
+            dos.write((byte) (0x80 | ((c >> 6) & 0x3F)));
+            dos.write((byte) (0x80 | (c & 0x3F)));
+            return 3;
+        }
+    }
+
+    public static int getStringLength(byte[] b, int s) {
+        int len = getUTFLength(b, s);
+        int pos = s + getNumBytesToStoreLength(len);
+        int end = pos + len;
+        int charCount = 0;
+        while (pos < end) {
+            charCount++;
+            pos += charSize(b, pos);
+        }
+        return charCount;
+    }
+
+    public static int getUTFLength(byte[] b, int s) {
+        return VarLenIntEncoderDecoder.decode(b, s);
+    }
+
+    public static int getNumBytesToStoreLength(int strlen) {
+        return VarLenIntEncoderDecoder.getBytesRequired(strlen);
+    }
+
+    public static int UTF8ToCodePoint(byte[] b, int s) {
+        if (b[s] >> 7 == 0) {
+            // 1 byte
+            return b[s];
+        } else if ((b[s] & 0xe0) == 0xc0) { /*0xe0 = 0b1110000*/
+            // 2 bytes
+            return ((int) (b[s] & 0x1f)) << 6 | /*0x3f = 0b00111111*/
+                    ((int) (b[s + 1] & 0x3f));
+        } else if ((b[s] & 0xf0) == 0xe0) {
+            // 3bytes
+            return ((int) (b[s] & 0xf)) << 12 | ((int) (b[s + 1] & 0x3f)) << 6
+                    | ((int) (b[s + 2] & 0x3f));
+        } else if ((b[s] & 0xf8) == 0xf0) {
+            // 4bytes
+            return ((int) (b[s] & 0x7)) << 18 | ((int) (b[s + 1] & 0x3f)) << 12
+                    | ((int) (b[s + 2] & 0x3f)) << 6 | ((int) (b[s + 3] & 0x3f));
+        } else if ((b[s] & 0xfc) == 0xf8) {
+            // 5bytes
+            return ((int) (b[s] & 0x3)) << 24 | ((int) (b[s + 1] & 0x3f)) << 18
+                    | ((int) (b[s + 2] & 0x3f)) << 12 | ((int) (b[s + 3] & 0x3f)) << 6
+                    | ((int) (b[s + 4] & 0x3f));
+        } else if ((b[s] & 0xfe) == 0xfc) {
+            // 6bytes
+            return ((int) (b[s] & 0x1)) << 30 | ((int) (b[s + 1] & 0x3f)) << 24
+                    | ((int) (b[s + 2] & 0x3f)) << 18 | ((int) (b[s + 3] & 0x3f)) << 12
+                    | ((int) (b[s + 4] & 0x3f)) << 6 | ((int) (b[s + 5] & 0x3f));
+        }
+        return 0;
+    }
+
+    public static int codePointToUTF8(int c, byte[] outputUTF8) {
+        if (c < 0x80) {
+            outputUTF8[0] = (byte) (c & 0x7F /* mask 7 lsb: 0b1111111 */);
+            return 1;
+        } else if (c < 0x0800) {
+            outputUTF8[0] = (byte) (c >> 6 & 0x1F | 0xC0);
+            outputUTF8[1] = (byte) (c & 0x3F | 0x80);
+            return 2;
+        } else if (c < 0x010000) {
+            outputUTF8[0] = (byte) (c >> 12 & 0x0F | 0xE0);
+            outputUTF8[1] = (byte) (c >> 6 & 0x3F | 0x80);
+            outputUTF8[2] = (byte) (c & 0x3F | 0x80);
+            return 3;
+        } else if (c < 0x200000) {
+            outputUTF8[0] = (byte) (c >> 18 & 0x07 | 0xF0);
+            outputUTF8[1] = (byte) (c >> 12 & 0x3F | 0x80);
+            outputUTF8[2] = (byte) (c >> 6 & 0x3F | 0x80);
+            outputUTF8[3] = (byte) (c & 0x3F | 0x80);
+            return 4;
+        } else if (c < 0x4000000) {
+            outputUTF8[0] = (byte) (c >> 24 & 0x03 | 0xF8);
+            outputUTF8[1] = (byte) (c >> 18 & 0x3F | 0x80);
+            outputUTF8[2] = (byte) (c >> 12 & 0x3F | 0x80);
+            outputUTF8[3] = (byte) (c >> 6 & 0x3F | 0x80);
+            outputUTF8[4] = (byte) (c & 0x3F | 0x80);
+            return 5;
+        } else if (c < 0x80000000) {
+            outputUTF8[0] = (byte) (c >> 30 & 0x01 | 0xFC);
+            outputUTF8[1] = (byte) (c >> 24 & 0x3F | 0x80);
+            outputUTF8[2] = (byte) (c >> 18 & 0x3F | 0x80);
+            outputUTF8[3] = (byte) (c >> 12 & 0x3F | 0x80);
+            outputUTF8[4] = (byte) (c >> 6 & 0x3F | 0x80);
+            outputUTF8[5] = (byte) (c & 0x3F | 0x80);
+            return 6;
+        }
+        return 0;
+    }
+
+    /**
+     * Compute the normalized key of the UTF8 string.
+     * The normalized key in Hyracks is mainly used to speedup the comparison between pointable data.
+     * In the UTF8StringPTR case, we compute the integer value by using the first 2 chars.
+     * The comparator will first use this integer to get the result ( <,>, or =), it will check
+     * the actual bytes only if the normalized key is equal. Thus this normalized key must be
+     * consistent with the comparison result.
+     */
+    public static int normalize(byte[] bytes, int start) {
+        int len = getUTFLength(bytes, start);
+        long nk = 0;
+        int offset = start + getNumBytesToStoreLength(len);
+        for (int i = 0; i < 2; ++i) {
+            nk <<= 16;
+            if (i < len) {
+                nk += ((int) charAt(bytes, offset)) & 0xffff;
+                offset += charSize(bytes, offset);
+            }
+        }
+        return (int) (nk >> 1); // make it always positive.
+    }
+
+    public static int compareTo(byte[] thisBytes, int thisStart, byte[] thatBytes, int thatStart) {
+        return compareTo(thisBytes, thisStart, thatBytes, thatStart, false, false);
+    }
+
+    /**
+     * This function provides the raw bytes-based comparison for UTF8 strings.
+     * Note that the comparison may not deliver the correct ordering for certain languages that include 2 or 3 bytes characters.
+     * But it works for single-byte character languages.
+     */
+    public static int rawByteCompareTo(byte[] thisBytes, int thisStart, byte[] thatBytes, int thatStart) {
+        return compareTo(thisBytes, thisStart, thatBytes, thatStart, false, true);
+    }
+
+    public static int lowerCaseCompareTo(byte[] thisBytes, int thisStart, byte[] thatBytes, int thatStart) {
+        return compareTo(thisBytes, thisStart, thatBytes, thatStart, true, false);
+    }
+
+    public static int hash(byte[] bytes, int start, int coefficient, int r) {
+        return hash(bytes, start, false, false, coefficient, r);
+    }
+
+    public static int hash(byte[] bytes, int start) {
+        return hash(bytes, start, false, false, 31, Integer.MAX_VALUE);
+    }
+
+    /**
+     * This function provides the raw bytes-based hash function for UTF8 strings.
+     * Note that the hash values may not deliver the correct ordering for certain languages that include 2 or 3 bytes characters.
+     * But it works for single-byte character languages.
+     */
+    public static int rawBytehash(byte[] bytes, int start) {
+        return hash(bytes, start, false, true, 31, Integer.MAX_VALUE);
+    }
+
+    public static int lowerCaseHash(byte[] bytes, int start) {
+        return hash(bytes, start, true, false, 31, Integer.MAX_VALUE);
+    }
+
+    public static StringBuilder toString(StringBuilder builder, byte[] bytes, int start) {
+        int utfLen = getUTFLength(bytes, start);
+        int offset = getNumBytesToStoreLength(utfLen);
+        while (utfLen > 0) {
+            char c = charAt(bytes, start + offset);
+            builder.append(c);
+            int cLen = getModifiedUTF8Len(c);
+            offset += cLen;
+            utfLen -= cLen;
+        }
+        return builder;
+    }
+
+    public static void printUTF8StringWithQuotes(byte[] b, int s, int l, OutputStream os) throws IOException {
+        printUTF8String(b, s, l, os, true);
+    }
+
+    public static void printUTF8StringNoQuotes(byte[] b, int s, int l, OutputStream os) throws IOException {
+        printUTF8String(b, s, l, os, false);
+    }
+
+    public static void printUTF8StringWithQuotes(String str, OutputStream os) throws IOException {
+        printUTF8String(str, os, true);
+    }
+
+    public static void printUTF8StringNoQuotes(String str, OutputStream os) throws IOException {
+        printUTF8String(str, os, false);
+    }
+
+    public static int encodeUTF8Length(int length, byte[] bytes, int start) {
+        return VarLenIntEncoderDecoder.encode(length, bytes, start);
+    }
+
+    public static int writeUTF8Length(int length, byte[] bytes, DataOutput out) throws IOException {
+        int nbytes = encodeUTF8Length(length, bytes, 0);
+        out.write(bytes, 0, nbytes);
+        return nbytes;
+    }
+
+    private static void printUTF8String(byte[] b, int s, int l, OutputStream os, boolean useQuotes) throws IOException {
+        int stringLength = getUTFLength(b, s);
+        int position = s + getNumBytesToStoreLength(stringLength);
+        int maxPosition = position + stringLength;
+        if (useQuotes) {
+            os.write('\"');
+        }
+        while (position < maxPosition) {
+            char c = charAt(b, position);
+            switch (c) {
+                // escape
+                case '\\':
+                case '"':
+                    os.write('\\');
+                    break;
+            }
+            int sz = charSize(b, position);
+            while (sz > 0) {
+                os.write(b[position]);
+                position++;
+                sz--;
+            }
+        }
+        if (useQuotes) {
+            os.write('\"');
+        }
+    }
+
+    private static void printUTF8String(String string, OutputStream os, boolean useQuotes) throws IOException {
+        if (useQuotes) {
+            os.write('\"');
+        }
+        for (int i = 0; i < string.length(); i++) {
+            char ch = string.charAt(i);
+            writeCharAsModifiedUTF8(ch, os);
+        }
+        if (useQuotes) {
+            os.write('\"');
+        }
+    }
+
+    private static int compareTo(byte[] thisBytes, int thisStart, byte[] thatBytes, int thatStart,
+            boolean useLowerCase, boolean useRawByte) {
+        int utflen1 = getUTFLength(thisBytes, thisStart);
+        int utflen2 = getUTFLength(thatBytes, thatStart);
+
+        int c1 = 0;
+        int c2 = 0;
+
+        int s1Start = thisStart + getNumBytesToStoreLength(utflen1);
+        int s2Start = thatStart + getNumBytesToStoreLength(utflen2);
+
+        while (c1 < utflen1 && c2 < utflen2) {
+            char ch1, ch2;
+            if (useRawByte) {
+                ch1 = (char) thisBytes[s1Start + c1];
+                ch2 = (char) thatBytes[s2Start + c2];
+            } else {
+                ch1 = (charAt(thisBytes, s1Start + c1));
+                ch2 = (charAt(thatBytes, s2Start + c2));
+
+                if (useLowerCase) {
+                    ch1 = Character.toLowerCase(ch1);
+                    ch2 = Character.toLowerCase(ch2);
+                }
+            }
+
+            if (ch1 != ch2) {
+                return ch1 - ch2;
+            }
+            c1 += charSize(thisBytes, s1Start + c1);
+            c2 += charSize(thatBytes, s2Start + c2);
+        }
+        return utflen1 - utflen2;
+    }
+
+    private static int hash(byte[] bytes, int start, boolean useLowerCase, boolean useRawByte, int coefficient, int r) {
+        int h = 0;
+        int utflen = getUTFLength(bytes, start);
+        int sStart = start + getNumBytesToStoreLength(utflen);
+        int c = 0;
+
+        while (c < utflen) {
+            char ch;
+            if (useRawByte) {
+                ch = (char) bytes[sStart + c];
+            } else {
+                ch = charAt(bytes, sStart + c);
+                if (useLowerCase) {
+                    ch = Character.toLowerCase(ch);
+                }
+            }
+            h = (coefficient * h + ch) % r;
+            c += charSize(bytes, sStart + c);
+        }
+        return h;
+    }
+
+    public static byte[] writeStringToBytes(String string) {
+        UTF8StringWriter writer = new UTF8StringWriter();
+        ByteArrayOutputStream bos = new ByteArrayOutputStream();
+        DataOutputStream dos = new DataOutputStream(bos);
+        try {
+            writer.writeUTF8(string, dos);
+        } catch (IOException e) {
+            throw new RuntimeException(e);
+        }
+        return bos.toByteArray();
+    }
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-asterixdb-hyracks/blob/26c3b536/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringWriter.java
----------------------------------------------------------------------
diff --git a/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringWriter.java b/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringWriter.java
new file mode 100644
index 0000000..021c02f
--- /dev/null
+++ b/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringWriter.java
@@ -0,0 +1,113 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.hyracks.util.string;
+
+import java.io.DataOutput;
+import java.io.IOException;
+import java.io.Serializable;
+
+import org.apache.hyracks.util.encoding.VarLenIntEncoderDecoder;
+
+public class UTF8StringWriter implements Serializable{
+    private byte[] tempBytes;
+
+    public final void writeUTF8(CharSequence str, DataOutput out) throws IOException {
+        int strlen = str.length();
+        int utflen = 0;
+        char c;
+        int count = 0;
+
+        for (int i = 0; i < strlen; i++) {
+            c = str.charAt(i);
+            utflen += UTF8StringUtil.getModifiedUTF8Len(c);
+        }
+
+        ensureTempSize(utflen);
+
+        count += VarLenIntEncoderDecoder.encode(utflen, tempBytes, count);
+
+        int i = 0;
+        for (; i < strlen; i++) {
+            c = str.charAt(i);
+            if (!((c >= 0x0001) && (c <= 0x007F))) {
+                break;
+            }
+            tempBytes[count++] = (byte) c;
+        }
+
+        for (; i < strlen; i++) {
+            c = str.charAt(i);
+            count += writeToBytes(tempBytes, count, c);
+        }
+        out.write(tempBytes, 0, count);
+    }
+
+    public final void writeUTF8(char[] buffer, int start, int length, DataOutput out) throws IOException {
+        int utflen = 0;
+        int count = 0;
+        char c;
+
+        for (int i = 0; i < length; i++) {
+            c = buffer[i + start];
+            utflen += UTF8StringUtil.getModifiedUTF8Len(c);
+        }
+
+        ensureTempSize(utflen);
+
+        count += VarLenIntEncoderDecoder.encode(utflen, tempBytes, count);
+
+        int i = 0;
+        for (; i < length; i++) {
+            c = buffer[i + start];
+            if (!((c >= 0x0001) && (c <= 0x007F))) {
+                break;
+            }
+            tempBytes[count++] = (byte) c;
+        }
+
+        for (; i < length; i++) {
+            c = buffer[i + start];
+            count += writeToBytes(tempBytes, count, c);
+        }
+        out.write(tempBytes, 0, count);
+    }
+
+    private static int writeToBytes(byte[] tempBytes, int count, char c) {
+        int orig = count;
+        if ((c >= 0x0001) && (c <= 0x007F)) {
+            tempBytes[count++] = (byte) c;
+        } else if (c > 0x07FF) {
+            tempBytes[count++] = (byte) (0xE0 | ((c >> 12) & 0x0F));
+            tempBytes[count++] = (byte) (0x80 | ((c >> 6) & 0x3F));
+            tempBytes[count++] = (byte) (0x80 | ((c >> 0) & 0x3F));
+        } else {
+            tempBytes[count++] = (byte) (0xC0 | ((c >> 6) & 0x1F));
+            tempBytes[count++] = (byte) (0x80 | ((c >> 0) & 0x3F));
+        }
+        return count - orig;
+    }
+
+    private void ensureTempSize(int utflen) {
+        if (tempBytes == null || tempBytes.length < utflen + 5) {
+            tempBytes = new byte[utflen + 5];
+        }
+
+    }
+
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-asterixdb-hyracks/blob/26c3b536/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/encoding/VarLenIntEncoderDecoderTest.java
----------------------------------------------------------------------
diff --git a/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/encoding/VarLenIntEncoderDecoderTest.java b/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/encoding/VarLenIntEncoderDecoderTest.java
new file mode 100644
index 0000000..193dca6
--- /dev/null
+++ b/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/encoding/VarLenIntEncoderDecoderTest.java
@@ -0,0 +1,87 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.hyracks.util.encoding;
+
+import static org.apache.hyracks.util.encoding.VarLenIntEncoderDecoder.BOUND_FIVE_BYTE;
+import static org.apache.hyracks.util.encoding.VarLenIntEncoderDecoder.BOUND_FOUR_BYTE;
+import static org.apache.hyracks.util.encoding.VarLenIntEncoderDecoder.BOUND_ONE_BYTE;
+import static org.apache.hyracks.util.encoding.VarLenIntEncoderDecoder.BOUND_THREE_BYTE;
+import static org.apache.hyracks.util.encoding.VarLenIntEncoderDecoder.BOUND_TWO_BYTE;
+import static org.apache.hyracks.util.encoding.VarLenIntEncoderDecoder.VarLenIntDecoder;
+import static org.apache.hyracks.util.encoding.VarLenIntEncoderDecoder.createDecoder;
+import static org.apache.hyracks.util.encoding.VarLenIntEncoderDecoder.decode;
+import static org.apache.hyracks.util.encoding.VarLenIntEncoderDecoder.encode;
+import static org.apache.hyracks.util.encoding.VarLenIntEncoderDecoder.getBytesRequired;
+import static org.junit.Assert.assertEquals;
+
+import java.io.ByteArrayInputStream;
+import java.io.DataInputStream;
+import java.io.IOException;
+
+import org.junit.Test;
+
+public class VarLenIntEncoderDecoderTest {
+
+    int[] bounds = new int[] { 0, BOUND_ONE_BYTE, BOUND_TWO_BYTE, BOUND_THREE_BYTE, BOUND_FOUR_BYTE, BOUND_FIVE_BYTE };
+
+    @Test
+    public void testGetBytesRequired() throws Exception {
+        for (int bound = 0; bound < bounds.length - 1; bound++) {
+            assertEquals(bound + 1, getBytesRequired(bounds[bound]));
+            assertEquals(bound + 1, getBytesRequired(bounds[bound + 1] - 1));
+        }
+    }
+
+    @Test
+    public void testEncodeDecode() throws Exception {
+        byte[] bytes = new byte[10];
+        int startPos = 3;
+        for (int i = 1; i < bounds.length - 1; i++) {
+            testEncodeDecode(i, bounds[i] - 1, bytes, startPos);
+            testEncodeDecode(i + 1, bounds[i], bytes, startPos);
+            testEncodeDecode(i + 1, bounds[i] + 1, bytes, startPos);
+        }
+        // Integer.Max
+        testEncodeDecode(5, BOUND_FIVE_BYTE, bytes, startPos);
+    }
+
+    @Test
+    public void testCreateDecoder() throws Exception {
+        VarLenIntDecoder decoder = createDecoder();
+        byte[] bytes = new byte[100];
+        int pos = 1;
+        for (int b : bounds) {
+            pos += encode(b, bytes, pos);
+        }
+        decoder.reset(bytes, 1);
+        for (int b : bounds) {
+            assertEquals(b, decoder.decode());
+        }
+    }
+
+    protected void testEncodeDecode(int expectedBytes, int value, byte[] bytes, int startPos) throws IOException {
+        assertEquals(expectedBytes, encode(value, bytes, startPos));
+        assertEquals(value, decode(bytes, startPos));
+
+        ByteArrayInputStream bis = new ByteArrayInputStream(bytes, startPos, bytes.length - startPos);
+        DataInputStream dis = new DataInputStream(bis);
+        assertEquals(value, decode(dis));
+    }
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-asterixdb-hyracks/blob/26c3b536/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/string/UTF8StringReaderWriterTest.java
----------------------------------------------------------------------
diff --git a/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/string/UTF8StringReaderWriterTest.java b/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/string/UTF8StringReaderWriterTest.java
new file mode 100644
index 0000000..bfc1fa8
--- /dev/null
+++ b/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/string/UTF8StringReaderWriterTest.java
@@ -0,0 +1,90 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.hyracks.util.string;
+
+import static org.apache.hyracks.util.string.UTF8StringSample.EMPTY_STRING;
+import static org.apache.hyracks.util.string.UTF8StringSample.STRING_LEN_127;
+import static org.apache.hyracks.util.string.UTF8StringSample.STRING_LEN_128;
+import static org.apache.hyracks.util.string.UTF8StringSample.STRING_LEN_3;
+import static org.apache.hyracks.util.string.UTF8StringSample.STRING_LEN_LARGE;
+import static org.apache.hyracks.util.string.UTF8StringSample.STRING_LEN_LARGE_SUB_1;
+import static org.apache.hyracks.util.string.UTF8StringSample.STRING_LEN_MEDIUM;
+import static org.apache.hyracks.util.string.UTF8StringSample.STRING_LEN_MEDIUM_SUB_1;
+import static org.apache.hyracks.util.string.UTF8StringSample.STRING_UTF8_3;
+import static org.apache.hyracks.util.string.UTF8StringSample.STRING_UTF8_MIX;
+import static org.junit.Assert.assertEquals;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.DataInputStream;
+import java.io.DataOutputStream;
+import java.io.IOException;
+
+import org.junit.Test;
+
+public class UTF8StringReaderWriterTest {
+
+    UTF8StringWriter writer = new UTF8StringWriter();
+    UTF8StringReader reader = new UTF8StringReader();
+
+    @Test
+    public void testWriterReader() throws IOException {
+        writeAndReadOneString(EMPTY_STRING);
+        writeAndReadOneString(STRING_LEN_3);
+
+        writeAndReadOneString(STRING_LEN_127);
+        writeAndReadOneString(STRING_LEN_128);
+        writeAndReadOneString(STRING_LEN_MEDIUM_SUB_1);
+    }
+
+    @Test
+    public void testMedium() throws IOException {
+        writeAndReadOneString(STRING_LEN_MEDIUM);
+        writeAndReadOneString(STRING_LEN_LARGE_SUB_1);
+    }
+
+    @Test
+    public void testLarge() throws IOException {
+        writeAndReadOneString(STRING_LEN_LARGE);
+    }
+
+    @Test
+    public void testUTF8() throws IOException {
+        writeAndReadOneString(STRING_UTF8_3);
+        writeAndReadOneString(STRING_UTF8_MIX);
+    }
+
+    private void writeAndReadOneString(String testString) throws IOException {
+        ByteArrayOutputStream bos = new ByteArrayOutputStream();
+        DataOutputStream dos = new DataOutputStream(bos);
+        writer.writeUTF8(testString, dos);
+
+        ByteArrayInputStream bis = new ByteArrayInputStream(bos.toByteArray(), 0, bos.size());
+        assertEquals(testString, reader.readUTF(new DataInputStream(bis)));
+
+        int lastOffset = bos.size();
+        char[] charArray = testString.toCharArray();
+        writer.writeUTF8(charArray, 0, charArray.length, dos);
+
+        bis = new ByteArrayInputStream(bos.toByteArray(), lastOffset, bos.size());
+        assertEquals(testString, reader.readUTF(new DataInputStream(bis)));
+    }
+
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-asterixdb-hyracks/blob/26c3b536/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/string/UTF8StringSample.java
----------------------------------------------------------------------
diff --git a/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/string/UTF8StringSample.java b/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/string/UTF8StringSample.java
new file mode 100644
index 0000000..3e6e984
--- /dev/null
+++ b/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/string/UTF8StringSample.java
@@ -0,0 +1,56 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.hyracks.util.string;
+
+import static org.apache.hyracks.util.encoding.VarLenIntEncoderDecoder.BOUND_THREE_BYTE;
+import static org.apache.hyracks.util.encoding.VarLenIntEncoderDecoder.BOUND_TWO_BYTE;
+
+import java.util.Arrays;
+
+/**
+ * Util class to provide the sample test string
+ */
+public class UTF8StringSample {
+    public static String EMPTY_STRING = "";
+
+    public static char ONE_ASCII_CHAR = 'x';
+    public static char ONE_UTF8_CHAR = 'à';
+
+    public static String STRING_LEN_3 = "xyz";
+    public static String STRING_UTF8_3 = "锟斤拷";
+    public static String STRING_UTF8_MIX = "\uD841\uDF0E\uD841\uDF31锟X斤Y拷Zà"; // one, two, three, and four bytes
+    public static String STRING_UTF8_MIX_LOWERCASE = "\uD841\uDF0E\uD841\uDF31锟x斤y拷zà";
+
+    public static String STRING_LEN_127 = generateStringRepeatBy(ONE_ASCII_CHAR, 127);
+    public static String STRING_LEN_128 = generateStringRepeatBy(ONE_ASCII_CHAR, 128);
+
+    public static String STRING_LEN_MEDIUM_SUB_1 = generateStringRepeatBy(ONE_ASCII_CHAR, BOUND_TWO_BYTE - 1);
+    public static String STRING_LEN_MEDIUM = generateStringRepeatBy(ONE_ASCII_CHAR, BOUND_TWO_BYTE);
+
+    public static String STRING_LEN_LARGE_SUB_1 = generateStringRepeatBy(ONE_ASCII_CHAR, BOUND_THREE_BYTE - 1);
+    public static String STRING_LEN_LARGE = generateStringRepeatBy(ONE_ASCII_CHAR, BOUND_THREE_BYTE);
+
+    public static String generateStringRepeatBy(char c, int times) {
+        char[] chars = new char[times];
+        Arrays.fill(chars, c);
+        return new String(chars);
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/incubator-asterixdb-hyracks/blob/26c3b536/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/string/UTF8StringUtilTest.java
----------------------------------------------------------------------
diff --git a/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/string/UTF8StringUtilTest.java b/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/string/UTF8StringUtilTest.java
new file mode 100644
index 0000000..0e3ed5c
--- /dev/null
+++ b/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/string/UTF8StringUtilTest.java
@@ -0,0 +1,144 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.hyracks.util.string;
+
+import static org.apache.hyracks.util.string.UTF8StringUtil.writeStringToBytes;
+import static org.apache.hyracks.util.string.UTF8StringSample.STRING_LEN_127;
+import static org.apache.hyracks.util.string.UTF8StringSample.STRING_LEN_128;
+import static org.apache.hyracks.util.string.UTF8StringSample.STRING_LEN_MEDIUM;
+import static org.apache.hyracks.util.string.UTF8StringSample.STRING_UTF8_3;
+import static org.apache.hyracks.util.string.UTF8StringSample.STRING_UTF8_MIX;
+import static org.apache.hyracks.util.string.UTF8StringSample.STRING_UTF8_MIX_LOWERCASE;
+import static org.apache.hyracks.util.string.UTF8StringUtil.charAt;
+import static org.apache.hyracks.util.string.UTF8StringUtil.charSize;
+import static org.apache.hyracks.util.string.UTF8StringUtil.compareTo;
+import static org.apache.hyracks.util.string.UTF8StringUtil.getModifiedUTF8Len;
+import static org.apache.hyracks.util.string.UTF8StringUtil.getNumBytesToStoreLength;
+import static org.apache.hyracks.util.string.UTF8StringUtil.getStringLength;
+import static org.apache.hyracks.util.string.UTF8StringUtil.getUTFLength;
+import static org.apache.hyracks.util.string.UTF8StringUtil.lowerCaseCompareTo;
+import static org.apache.hyracks.util.string.UTF8StringUtil.lowerCaseHash;
+import static org.apache.hyracks.util.string.UTF8StringUtil.normalize;
+import static org.apache.hyracks.util.string.UTF8StringUtil.rawByteCompareTo;
+import static org.apache.hyracks.util.string.UTF8StringUtil.hash;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+import java.io.IOException;
+
+import org.junit.Test;
+
+public class UTF8StringUtilTest {
+
+    @Test
+    public void testCharAtCharSizeGetLen() throws Exception {
+        char[] utf8Mix = STRING_UTF8_MIX.toCharArray();
+        byte[] buffer = writeStringToBytes(STRING_UTF8_MIX);
+        int pos = getNumBytesToStoreLength(getUTFLength(buffer, 0));
+        for (char c : utf8Mix) {
+            assertEquals(c, charAt(buffer, pos));
+            assertEquals(getModifiedUTF8Len(c), charSize(buffer, pos));
+            pos += charSize(buffer, pos);
+        }
+    }
+
+    @Test
+    public void testGetStringLength() throws Exception {
+        byte[] buffer = writeStringToBytes(STRING_UTF8_MIX);
+        assertEquals(STRING_UTF8_MIX.length(), getStringLength(buffer, 0));
+    }
+
+    @Test
+    public void testCompareToAndNormolize() throws Exception {
+        testCompare(STRING_UTF8_MIX, STRING_UTF8_MIX, OPTION.STANDARD);
+        testCompare(STRING_UTF8_3, STRING_UTF8_MIX, OPTION.STANDARD);
+        testCompare(STRING_LEN_MEDIUM, STRING_UTF8_MIX, OPTION.STANDARD);
+    }
+
+    public boolean isSameSign(int r1, int r2) {
+        if (r1 > 0) {
+            return r2 > 0;
+        }
+        if (r1 < 0) {
+            return r2 < 0;
+        }
+        return r2 == 0;
+    }
+
+    enum OPTION {STANDARD, RAW_BYTE, LOWERCASE}
+
+    public void testCompare(String str1, String str2, OPTION option) throws IOException {
+        byte[] buffer1 = writeStringToBytes(str1);
+        byte[] buffer2 = writeStringToBytes(str2);
+
+        switch (option) {
+            case STANDARD:
+                assertEquals(str1.compareTo(str2), compareTo(buffer1, 0, buffer2, 0));
+                int n1 = normalize(buffer1, 0);
+                int n2 = normalize(buffer2, 0);
+                assertTrue(isSameSign(str1.compareTo(str2), n1 - n2));
+                break;
+            case RAW_BYTE:
+                assertEquals(str1.compareTo(str2), rawByteCompareTo(buffer1, 0, buffer2, 0));
+                break;
+            case LOWERCASE:
+                assertEquals(str1.compareToIgnoreCase(str2), lowerCaseCompareTo(buffer1, 0, buffer2, 0));
+                break;
+        }
+
+    }
+
+    @Test
+    public void testRawByteCompareTo() throws Exception {
+        testCompare(STRING_LEN_MEDIUM, STRING_LEN_MEDIUM, OPTION.RAW_BYTE);
+        testCompare(STRING_LEN_127, STRING_LEN_128, OPTION.RAW_BYTE);
+    }
+
+    @Test
+    public void testLowerCaseCompareTo() throws Exception {
+        testCompare(STRING_LEN_127, STRING_LEN_128, OPTION.LOWERCASE);
+        testCompare(STRING_LEN_127, STRING_UTF8_MIX, OPTION.LOWERCASE);
+        testCompare(STRING_UTF8_MIX, STRING_UTF8_MIX_LOWERCASE, OPTION.LOWERCASE);
+        testCompare(STRING_UTF8_MIX_LOWERCASE, STRING_UTF8_MIX, OPTION.LOWERCASE);
+    }
+
+    @Test
+    public void testToString() throws Exception {
+
+        StringBuilder sb = new StringBuilder();
+        byte[] buffer = writeStringToBytes(STRING_UTF8_MIX);
+        assertEquals(STRING_UTF8_MIX, UTF8StringUtil.toString(sb, buffer, 0).toString());
+    }
+
+    @Test
+    public void testHash() throws IOException {
+        byte[] buffer = writeStringToBytes(STRING_UTF8_MIX_LOWERCASE);
+        int lowerHash = hash(buffer, 0);
+
+        buffer = writeStringToBytes(STRING_UTF8_MIX_LOWERCASE);
+        int upperHash = lowerCaseHash(buffer, 0);
+        assertEquals(lowerHash, upperHash);
+
+        int familyOne = hash(buffer, 0, 7, 297);
+        int familyTwo = hash(buffer, 0, 8, 297);
+        assertTrue(familyOne != familyTwo);
+    }
+
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-asterixdb-hyracks/blob/26c3b536/hyracks/pom.xml
----------------------------------------------------------------------
diff --git a/hyracks/pom.xml b/hyracks/pom.xml
index c1af7b9..61e06e4 100644
--- a/hyracks/pom.xml
+++ b/hyracks/pom.xml
@@ -96,6 +96,7 @@
   </pluginRepositories>
 
   <modules>
+    <module>hyracks-util</module>
     <module>hyracks-ipc</module>
     <module>hyracks-api</module>
     <module>hyracks-comm</module>