You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@asterixdb.apache.org by ji...@apache.org on 2015/10/29 01:25:04 UTC
[2/7] incubator-asterixdb-hyracks git commit: ASTERIXDB-1102: VarSize
Encoding to store length of String and ByteArray
http://git-wip-us.apache.org/repos/asf/incubator-asterixdb-hyracks/blob/26c3b536/hyracks/hyracks-tests/hyracks-storage-am-lsm-invertedindex-test/src/test/java/org/apache/hyracks/storage/am/lsm/invertedindex/tokenizers/NGramTokenizerTest.java
----------------------------------------------------------------------
diff --git a/hyracks/hyracks-tests/hyracks-storage-am-lsm-invertedindex-test/src/test/java/org/apache/hyracks/storage/am/lsm/invertedindex/tokenizers/NGramTokenizerTest.java b/hyracks/hyracks-tests/hyracks-storage-am-lsm-invertedindex-test/src/test/java/org/apache/hyracks/storage/am/lsm/invertedindex/tokenizers/NGramTokenizerTest.java
index f372dbe..6e764c3 100644
--- a/hyracks/hyracks-tests/hyracks-storage-am-lsm-invertedindex-test/src/test/java/org/apache/hyracks/storage/am/lsm/invertedindex/tokenizers/NGramTokenizerTest.java
+++ b/hyracks/hyracks-tests/hyracks-storage-am-lsm-invertedindex-test/src/test/java/org/apache/hyracks/storage/am/lsm/invertedindex/tokenizers/NGramTokenizerTest.java
@@ -20,21 +20,19 @@
package org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers;
import java.io.ByteArrayInputStream;
-import java.io.ByteArrayOutputStream;
import java.io.DataInput;
import java.io.DataInputStream;
-import java.io.DataOutput;
-import java.io.DataOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
+import org.apache.hyracks.data.std.util.GrowableArray;
+import org.apache.hyracks.util.string.UTF8StringReader;
+import org.apache.hyracks.util.string.UTF8StringUtil;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;
-import org.apache.hyracks.data.std.util.GrowableArray;
-
public class NGramTokenizerTest {
private char PRECHAR = '#';
@@ -72,11 +70,7 @@ public class NGramTokenizerTest {
@Before
public void init() throws Exception {
- // serialize string into bytes
- ByteArrayOutputStream baos = new ByteArrayOutputStream();
- DataOutput dos = new DataOutputStream(baos);
- dos.writeUTF(str);
- inputBuffer = baos.toByteArray();
+ inputBuffer = UTF8StringUtil.writeStringToBytes(str);
}
void runTestNGramTokenizerWithCountedHashedUTF8Tokens(boolean prePost) throws IOException {
@@ -192,7 +186,8 @@ public class NGramTokenizerTest {
ByteArrayInputStream bais = new ByteArrayInputStream(tokenData.getByteArray());
DataInput in = new DataInputStream(bais);
- String strGram = in.readUTF();
+ UTF8StringReader reader = new UTF8StringReader();
+ String strGram = reader.readUTF(in);
// System.out.println("\"" + strGram + "\"");
http://git-wip-us.apache.org/repos/asf/incubator-asterixdb-hyracks/blob/26c3b536/hyracks/hyracks-tests/hyracks-storage-am-lsm-invertedindex-test/src/test/java/org/apache/hyracks/storage/am/lsm/invertedindex/tokenizers/WordTokenizerTest.java
----------------------------------------------------------------------
diff --git a/hyracks/hyracks-tests/hyracks-storage-am-lsm-invertedindex-test/src/test/java/org/apache/hyracks/storage/am/lsm/invertedindex/tokenizers/WordTokenizerTest.java b/hyracks/hyracks-tests/hyracks-storage-am-lsm-invertedindex-test/src/test/java/org/apache/hyracks/storage/am/lsm/invertedindex/tokenizers/WordTokenizerTest.java
index c42022e..78ba6a3 100644
--- a/hyracks/hyracks-tests/hyracks-storage-am-lsm-invertedindex-test/src/test/java/org/apache/hyracks/storage/am/lsm/invertedindex/tokenizers/WordTokenizerTest.java
+++ b/hyracks/hyracks-tests/hyracks-storage-am-lsm-invertedindex-test/src/test/java/org/apache/hyracks/storage/am/lsm/invertedindex/tokenizers/WordTokenizerTest.java
@@ -20,21 +20,19 @@
package org.apache.hyracks.storage.am.lsm.invertedindex.tokenizers;
import java.io.ByteArrayInputStream;
-import java.io.ByteArrayOutputStream;
import java.io.DataInput;
import java.io.DataInputStream;
-import java.io.DataOutput;
-import java.io.DataOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
-import junit.framework.Assert;
-
+import org.apache.hyracks.data.std.util.GrowableArray;
+import org.apache.hyracks.util.string.UTF8StringReader;
+import org.apache.hyracks.util.string.UTF8StringUtil;
import org.junit.Before;
import org.junit.Test;
-import org.apache.hyracks.data.std.util.GrowableArray;
+import junit.framework.Assert;
public class WordTokenizerTest {
@@ -46,7 +44,8 @@ public class WordTokenizerTest {
private ArrayList<Integer> expectedCountedHashedUTF8Tokens = new ArrayList<Integer>();
private boolean isSeparator(char c) {
- return !(Character.isLetterOrDigit(c) || Character.getType(c) == Character.OTHER_LETTER || Character.getType(c) == Character.OTHER_NUMBER);
+ return !(Character.isLetterOrDigit(c) || Character.getType(c) == Character.OTHER_LETTER
+ || Character.getType(c) == Character.OTHER_NUMBER);
}
private void tokenize(String text, ArrayList<String> tokens) {
@@ -78,10 +77,7 @@ public class WordTokenizerTest {
@Before
public void init() throws IOException {
// serialize text into bytes
- ByteArrayOutputStream baos = new ByteArrayOutputStream();
- DataOutput dos = new DataOutputStream(baos);
- dos.writeUTF(text);
- inputBuffer = baos.toByteArray();
+ inputBuffer = UTF8StringUtil.writeStringToBytes(text);
// init expected string tokens
tokenize(text, expectedUTF8Tokens);
@@ -144,7 +140,8 @@ public class WordTokenizerTest {
public void testWordTokenizerWithHashedUTF8Tokens() throws IOException {
HashedUTF8WordTokenFactory tokenFactory = new HashedUTF8WordTokenFactory();
- DelimitedUTF8StringBinaryTokenizer tokenizer = new DelimitedUTF8StringBinaryTokenizer(true, false, tokenFactory);
+ DelimitedUTF8StringBinaryTokenizer tokenizer = new DelimitedUTF8StringBinaryTokenizer(true, false,
+ tokenFactory);
tokenizer.reset(inputBuffer, 0, inputBuffer.length);
@@ -175,7 +172,8 @@ public class WordTokenizerTest {
public void testWordTokenizerWithUTF8Tokens() throws IOException {
UTF8WordTokenFactory tokenFactory = new UTF8WordTokenFactory();
- DelimitedUTF8StringBinaryTokenizer tokenizer = new DelimitedUTF8StringBinaryTokenizer(true, false, tokenFactory);
+ DelimitedUTF8StringBinaryTokenizer tokenizer = new DelimitedUTF8StringBinaryTokenizer(true, false,
+ tokenFactory);
tokenizer.reset(inputBuffer, 0, inputBuffer.length);
@@ -194,7 +192,8 @@ public class WordTokenizerTest {
ByteArrayInputStream bais = new ByteArrayInputStream(tokenData.getByteArray());
DataInput in = new DataInputStream(bais);
- String strToken = in.readUTF();
+ UTF8StringReader reader = new UTF8StringReader();
+ String strToken = reader.readUTF(in);
Assert.assertEquals(expectedUTF8Tokens.get(tokenCount), strToken);
http://git-wip-us.apache.org/repos/asf/incubator-asterixdb-hyracks/blob/26c3b536/hyracks/hyracks-tests/hyracks-storage-am-lsm-invertedindex-test/src/test/java/org/apache/hyracks/storage/am/lsm/invertedindex/util/LSMInvertedIndexTestUtils.java
----------------------------------------------------------------------
diff --git a/hyracks/hyracks-tests/hyracks-storage-am-lsm-invertedindex-test/src/test/java/org/apache/hyracks/storage/am/lsm/invertedindex/util/LSMInvertedIndexTestUtils.java b/hyracks/hyracks-tests/hyracks-storage-am-lsm-invertedindex-test/src/test/java/org/apache/hyracks/storage/am/lsm/invertedindex/util/LSMInvertedIndexTestUtils.java
index 36f615f..fd94870 100644
--- a/hyracks/hyracks-tests/hyracks-storage-am-lsm-invertedindex-test/src/test/java/org/apache/hyracks/storage/am/lsm/invertedindex/util/LSMInvertedIndexTestUtils.java
+++ b/hyracks/hyracks-tests/hyracks-storage-am-lsm-invertedindex-test/src/test/java/org/apache/hyracks/storage/am/lsm/invertedindex/util/LSMInvertedIndexTestUtils.java
@@ -88,7 +88,7 @@ public class LSMInvertedIndexTestUtils {
fieldGens[0] = new DocumentStringFieldValueGenerator(2, 10, 10000, rnd);
fieldGens[1] = new SortedIntegerFieldValueGenerator(0);
ISerializerDeserializer[] fieldSerdes = new ISerializerDeserializer[] {
- UTF8StringSerializerDeserializer.INSTANCE, IntegerSerializerDeserializer.INSTANCE };
+ new UTF8StringSerializerDeserializer(), IntegerSerializerDeserializer.INSTANCE };
TupleGenerator tupleGen = new TupleGenerator(fieldGens, fieldSerdes, 0);
return tupleGen;
}
@@ -98,7 +98,7 @@ public class LSMInvertedIndexTestUtils {
fieldGens[0] = new PersonNameFieldValueGenerator(rnd, 0.5f);
fieldGens[1] = new SortedIntegerFieldValueGenerator(0);
ISerializerDeserializer[] fieldSerdes = new ISerializerDeserializer[] {
- UTF8StringSerializerDeserializer.INSTANCE, IntegerSerializerDeserializer.INSTANCE };
+ new UTF8StringSerializerDeserializer(), IntegerSerializerDeserializer.INSTANCE };
TupleGenerator tupleGen = new TupleGenerator(fieldGens, fieldSerdes, 0);
return tupleGen;
}
@@ -110,7 +110,7 @@ public class LSMInvertedIndexTestUtils {
case INMEMORY:
case ONDISK:
case LSM: {
- fieldSerdes = new ISerializerDeserializer[] { UTF8StringSerializerDeserializer.INSTANCE,
+ fieldSerdes = new ISerializerDeserializer[] { new UTF8StringSerializerDeserializer(),
IntegerSerializerDeserializer.INSTANCE };
break;
}
@@ -118,7 +118,7 @@ public class LSMInvertedIndexTestUtils {
case PARTITIONED_ONDISK:
case PARTITIONED_LSM: {
// Such indexes also include the set-size for partitioning.
- fieldSerdes = new ISerializerDeserializer[] { UTF8StringSerializerDeserializer.INSTANCE,
+ fieldSerdes = new ISerializerDeserializer[] { new UTF8StringSerializerDeserializer(),
ShortSerializerDeserializer.INSTANCE, IntegerSerializerDeserializer.INSTANCE };
break;
}
http://git-wip-us.apache.org/repos/asf/incubator-asterixdb-hyracks/blob/26c3b536/hyracks/hyracks-util/pom.xml
----------------------------------------------------------------------
diff --git a/hyracks/hyracks-util/pom.xml b/hyracks/hyracks-util/pom.xml
new file mode 100644
index 0000000..ca38040
--- /dev/null
+++ b/hyracks/hyracks-util/pom.xml
@@ -0,0 +1,58 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ ~ Licensed to the Apache Software Foundation (ASF) under one
+ ~ or more contributor license agreements. See the NOTICE file
+ ~ distributed with this work for additional information
+ ~ regarding copyright ownership. The ASF licenses this file
+ ~ to you under the Apache License, Version 2.0 (the
+ ~ "License"); you may not use this file except in compliance
+ ~ with the License. You may obtain a copy of the License at
+ ~
+ ~ http://www.apache.org/licenses/LICENSE-2.0
+ ~
+ ~ Unless required by applicable law or agreed to in writing,
+ ~ software distributed under the License is distributed on an
+ ~ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ ~ KIND, either express or implied. See the License for the
+ ~ specific language governing permissions and limitations
+ ~ under the License.
+ -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+ xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <parent>
+ <artifactId>hyracks</artifactId>
+ <groupId>org.apache.hyracks</groupId>
+ <version>0.2.17-SNAPSHOT</version>
+ </parent>
+
+ <modelVersion>4.0.0</modelVersion>
+
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-jar-plugin</artifactId>
+ <version>2.6</version>
+ <executions>
+ <execution>
+ <goals>
+ <goal>test-jar</goal>
+ </goals>
+ </execution>
+ </executions>
+ </plugin>
+ </plugins>
+ </build>
+
+ <artifactId>hyracks-util</artifactId>
+ <dependencies>
+ <dependency>
+ <groupId>junit</groupId>
+ <artifactId>junit</artifactId>
+ </dependency>
+ </dependencies>
+
+
+</project>
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/incubator-asterixdb-hyracks/blob/26c3b536/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/bytes/Base64Parser.java
----------------------------------------------------------------------
diff --git a/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/bytes/Base64Parser.java b/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/bytes/Base64Parser.java
new file mode 100644
index 0000000..257daee
--- /dev/null
+++ b/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/bytes/Base64Parser.java
@@ -0,0 +1,250 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.hyracks.util.bytes;
+
+import java.util.Arrays;
+
+public class Base64Parser {
+ private static final byte[] DECODE_MAP = initDecodeMap();
+ private static final byte PADDING = 127;
+
+ private static byte[] initDecodeMap() {
+ byte[] map = new byte[128];
+ Arrays.fill(map, (byte) -1);
+
+ int i;
+ for (i = 'A'; i <= 'Z'; i++) {
+ map[i] = (byte) (i - 'A');
+ }
+ for (i = 'a'; i <= 'z'; i++) {
+ map[i] = (byte) (i - 'a' + 26);
+ }
+ for (i = '0'; i <= '9'; i++) {
+ map[i] = (byte) (i - '0' + 52);
+ }
+ map['+'] = 62;
+ map['/'] = 63;
+ map['='] = PADDING;
+
+ return map;
+ }
+
+ private byte[] quadruplet = new byte[4];
+ private byte[] storage;
+ private int length = 0;
+
+ /**
+     * Parses the Base64 sequence in {@code input} into {@code out}.
+     * Note: {@code out} must have enough space; check {@link #guessLength(char[], int, int)} first.
+ *
+ * @param input
+ * @param start
+ * @param length
+ * @param out
+ * @param offset
+     * @return the number of written bytes
+ */
+ public int parseBase64String(char[] input, int start, int length, byte[] out, int offset) {
+ int outLength = 0;
+
+ int i;
+ int q = 0;
+
+ // convert each quadruplet to three bytes.
+ for (i = 0; i < length; i++) {
+ char ch = input[start + i];
+ byte v = DECODE_MAP[ch];
+
+ if (v == -1) {
+ throw new IllegalArgumentException("Invalid Base64 character");
+ }
+ quadruplet[q++] = v;
+
+ if (q == 4) {
+ outLength += dumpQuadruplet(out, offset + outLength);
+ q = 0;
+ }
+ }
+
+ return outLength;
+ }
+
+ /**
+     * Parses the Base64 sequence in {@code input} into {@code out}.
+     * Note: {@code out} must have enough space; check {@link #guessLength(byte[], int, int)} first.
+ *
+ * @param input
+ * @param start
+ * @param length
+ * @param out
+ * @param offset
+ * @return the number of written bytes
+ */
+ public int parseBase64String(byte[] input, int start, int length, byte[] out, int offset) {
+ int outLength = 0;
+
+ int i;
+ int q = 0;
+
+ // convert each quadruplet to three bytes.
+ for (i = 0; i < length; i++) {
+ char ch = (char) input[start + i];
+ byte v = DECODE_MAP[ch];
+
+ if (v == -1) {
+ throw new IllegalArgumentException("Invalid Base64 character");
+ }
+ quadruplet[q++] = v;
+
+ if (q == 4) {
+ outLength += dumpQuadruplet(out, offset + outLength);
+ q = 0;
+ }
+ }
+
+ return outLength;
+ }
+
+ /**
+ * computes the length of binary data speculatively.
+ * Our requirement is to create byte[] of the exact length to store the binary data.
+ * If we do this in a straight-forward way, it takes two passes over the data.
+ * Experiments show that this is a non-trivial overhead (35% or so is spent on
+ * the first pass in calculating the length.)
+ * So the approach here is that we compute the length speculatively, without looking
+ * at the whole contents. The obtained speculative value is never less than the
+ * actual length of the binary data, but it may be bigger. So if the speculation
+ * goes wrong, we'll pay the cost of reallocation and buffer copying.
+ * If the base64 text is tightly packed with no indentation nor illegal char
+ * (like what most web services produce), then the speculation of this method
+ * will be correct, so we get the performance benefit.
+ */
+ public static int guessLength(char[] chars, int start, int length) {
+
+ // compute the tail '=' chars
+ int j = length - 1;
+ for (; j >= 0; j--) {
+ byte code = DECODE_MAP[chars[start + j]];
+ if (code == PADDING) {
+ continue;
+ }
+ if (code == -1) // most likely this base64 text is indented. go with the upper bound
+ {
+ return length / 4 * 3;
+ }
+ break;
+ }
+
+ j++; // text.charAt(j) is now at some base64 char, so +1 to make it the size
+ int padSize = length - j;
+ if (padSize > 2) // something is wrong with base64. be safe and go with the upper bound
+ {
+ return length / 4 * 3;
+ }
+
+ // so far this base64 looks like it's unindented tightly packed base64.
+ // take a chance and create an array with the expected size
+ return length / 4 * 3 - padSize;
+ }
+
+ public static int guessLength(byte[] chars, int start, int length) {
+
+ // compute the tail '=' chars
+ int j = length - 1;
+ for (; j >= 0; j--) {
+ byte code = DECODE_MAP[chars[start + j]];
+ if (code == PADDING) {
+ continue;
+ }
+ if (code == -1) // most likely this base64 text is indented. go with the upper bound
+ {
+ return length / 4 * 3;
+ }
+ break;
+ }
+
+ j++; // text.charAt(j) is now at some base64 char, so +1 to make it the size
+ int padSize = length - j;
+ if (padSize > 2) // something is wrong with base64. be safe and go with the upper bound
+ {
+ return length / 4 * 3;
+ }
+
+ // so far this base64 looks like it's unindented tightly packed base64.
+ // take a chance and create an array with the expected size
+ return length / 4 * 3 - padSize;
+ }
+
+ public byte[] getByteArray() {
+ return storage;
+ }
+
+ public int getLength() {
+ return length;
+ }
+
+ /**
+     * Same as {@link #parseBase64String(byte[], int, int, byte[], int)}, but decodes into this parser's own storage.
+ *
+ * @param input
+ * @param start
+ * @param length
+ */
+ public void generatePureByteArrayFromBase64String(byte[] input, int start, int length) {
+        // Base64 characters are all ASCII, so the character count equals the UTF-8 byte length
+ if (length % 4 != 0) {
+ throw new IllegalArgumentException(
+ "Invalid Base64 string, the length of the string should be a multiple of 4");
+ }
+ final int buflen = guessLength(input, start, length);
+ ensureCapacity(buflen);
+ this.length = parseBase64String(input, start, length, storage, 0);
+ }
+
+ public void generatePureByteArrayFromBase64String(char[] input, int start, int length) {
+ if (length % 4 != 0) {
+ throw new IllegalArgumentException(
+ "Invalid Base64 string, the length of the string should be a multiple of 4");
+ }
+ final int buflen = guessLength(input, start, length);
+ ensureCapacity(buflen);
+ this.length = parseBase64String(input, start, length, storage, 0);
+ }
+
+ private void ensureCapacity(int length) {
+ if (storage == null || storage.length < length) {
+ storage = new byte[length];
+ }
+ }
+
+ private int dumpQuadruplet(byte[] out, int offset) {
+ int outLength = 0;
+ // quadruplet is now filled.
+ out[offset + outLength++] = (byte) ((quadruplet[0] << 2) | (quadruplet[1] >> 4));
+ if (quadruplet[2] != PADDING) {
+ out[offset + outLength++] = (byte) ((quadruplet[1] << 4) | (quadruplet[2] >> 2));
+ }
+ if (quadruplet[3] != PADDING) {
+ out[offset + outLength++] = (byte) ((quadruplet[2] << 6) | (quadruplet[3]));
+ }
+ return outLength;
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/incubator-asterixdb-hyracks/blob/26c3b536/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/bytes/Base64Printer.java
----------------------------------------------------------------------
diff --git a/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/bytes/Base64Printer.java b/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/bytes/Base64Printer.java
new file mode 100644
index 0000000..0e1c078
--- /dev/null
+++ b/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/bytes/Base64Printer.java
@@ -0,0 +1,125 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.hyracks.util.bytes;
+
+import java.io.IOException;
+
+public class Base64Printer {
+ /**
+ * Encodes a byte array into a {@code Appendable} stream by doing base64 encoding.
+ *
+     * @return the {@code appendable} that was passed in.
+ */
+ public static Appendable printBase64Binary(byte[] input, int offset, int len, Appendable appendable)
+ throws IOException {
+ // encode elements until only 1 or 2 elements are left to encode
+ int remaining = len;
+ int i;
+ for (i = offset; remaining >= 3; remaining -= 3, i += 3) {
+ appendable.append(encode(input[i] >> 2));
+ appendable.append(encode(
+ ((input[i] & 0x3) << 4)
+ | ((input[i + 1] >> 4) & 0xF)));
+ appendable.append(encode(
+ ((input[i + 1] & 0xF) << 2)
+ | ((input[i + 2] >> 6) & 0x3)));
+ appendable.append(encode(input[i + 2] & 0x3F));
+ }
+ // encode when exactly 1 element (left) to encode
+ if (remaining == 1) {
+ appendable.append(encode(input[i] >> 2));
+ appendable.append(encode(((input[i]) & 0x3) << 4));
+ appendable.append('=');
+ appendable.append('=');
+ }
+ // encode when exactly 2 elements (left) to encode
+ if (remaining == 2) {
+ appendable.append(encode(input[i] >> 2));
+ appendable.append(encode(((input[i] & 0x3) << 4)
+ | ((input[i + 1] >> 4) & 0xF)));
+ appendable.append(encode((input[i + 1] & 0xF) << 2));
+ appendable.append('=');
+ }
+ return appendable;
+ }
+
+ /**
+ * Encodes a byte array into a char array by doing base64 encoding.
+ * The caller must supply a big enough buffer.
+ *
+ * @return the value of {@code ptr+((len+2)/3)*4}, which is the new offset
+ * in the output buffer where the further bytes should be placed.
+ */
+ public static int printBase64Binary(byte[] input, int offset, int len, char[] buf, int ptr) {
+ // encode elements until only 1 or 2 elements are left to encode
+ int remaining = len;
+ int i;
+ for (i = offset; remaining >= 3; remaining -= 3, i += 3) {
+ buf[ptr++] = encode(input[i] >> 2);
+ buf[ptr++] = encode(
+ ((input[i] & 0x3) << 4)
+ | ((input[i + 1] >> 4) & 0xF));
+ buf[ptr++] = encode(
+ ((input[i + 1] & 0xF) << 2)
+ | ((input[i + 2] >> 6) & 0x3));
+ buf[ptr++] = encode(input[i + 2] & 0x3F);
+ }
+ // encode when exactly 1 element (left) to encode
+ if (remaining == 1) {
+ buf[ptr++] = encode(input[i] >> 2);
+ buf[ptr++] = encode(((input[i]) & 0x3) << 4);
+ buf[ptr++] = '=';
+ buf[ptr++] = '=';
+ }
+ // encode when exactly 2 elements (left) to encode
+ if (remaining == 2) {
+ buf[ptr++] = encode(input[i] >> 2);
+ buf[ptr++] = encode(((input[i] & 0x3) << 4)
+ | ((input[i + 1] >> 4) & 0xF));
+ buf[ptr++] = encode((input[i + 1] & 0xF) << 2);
+ buf[ptr++] = '=';
+ }
+ return ptr;
+ }
+
+ private static final char[] encodeMap = initEncodeMap();
+
+ private static char[] initEncodeMap() {
+ char[] map = new char[64];
+ int i;
+ for (i = 0; i < 26; i++) {
+ map[i] = (char) ('A' + i);
+ }
+ for (i = 26; i < 52; i++) {
+ map[i] = (char) ('a' + (i - 26));
+ }
+ for (i = 52; i < 62; i++) {
+ map[i] = (char) ('0' + (i - 52));
+ }
+ map[62] = '+';
+ map[63] = '/';
+
+ return map;
+ }
+
+ public static char encode(int i) {
+ return encodeMap[i & 0x3F];
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-asterixdb-hyracks/blob/26c3b536/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/bytes/HexParser.java
----------------------------------------------------------------------
diff --git a/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/bytes/HexParser.java b/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/bytes/HexParser.java
new file mode 100644
index 0000000..ba7276b
--- /dev/null
+++ b/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/bytes/HexParser.java
@@ -0,0 +1,97 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.hyracks.util.bytes;
+
+public class HexParser {
+ public static boolean isValidHexChar(char c) {
+ if (c >= '0' && c <= '9'
+ || c >= 'a' && c <= 'f'
+ || c >= 'A' && c <= 'F') {
+ return true;
+ }
+ return false;
+ }
+
+ public static int getValueFromValidHexChar(char c) {
+ if (c >= '0' && c <= '9') {
+ return c - '0';
+ }
+ if (c >= 'a' && c <= 'f') {
+ return 10 + c - 'a';
+ }
+ if (c >= 'A' && c <= 'F') {
+ return 10 + c - 'A';
+ }
+ throw new IllegalArgumentException("Invalid hex character : " + c);
+ }
+
+ private byte[] storage;
+ private int length;
+
+ public byte[] getByteArray() {
+ return storage;
+ }
+
+ public int getLength() {
+ return length;
+ }
+
+ public void generateByteArrayFromHexString(char[] input, int start, int length) {
+ if (length % 2 != 0) {
+ throw new IllegalArgumentException(
+ "Invalid hex string for binary type: the string length should be a muliple of 2.");
+ }
+ this.length = length / 2;
+ ensureCapacity(this.length);
+ generateByteArrayFromHexString(input, start, length, storage, 0);
+ }
+
+ public void generateByteArrayFromHexString(byte[] input, int start, int length) {
+ if (length % 2 != 0) {
+ throw new IllegalArgumentException(
+ "Invalid hex string for binary type: the string length should be a muliple of 2.");
+ }
+ this.length = length / 2;
+ ensureCapacity(this.length);
+ generateByteArrayFromHexString(input, start, length, storage, 0);
+ }
+
+ private void ensureCapacity(int capacity) {
+ if (storage == null || storage.length < capacity) {
+ storage = new byte[capacity];
+ }
+ }
+
+ public static void generateByteArrayFromHexString(char[] input, int start, int length, byte[] output,
+ int offset) {
+ for (int i = 0; i < length; i += 2) {
+ output[offset + i / 2] = (byte) ((getValueFromValidHexChar(input[start + i]) << 4) +
+ getValueFromValidHexChar(input[start + i + 1]));
+ }
+ }
+
+ public static void generateByteArrayFromHexString(byte[] input, int start, int length, byte[] output,
+ int offset) {
+ for (int i = 0; i < length; i += 2) {
+ output[offset + i / 2] = (byte) ((getValueFromValidHexChar((char) input[start + i]) << 4) +
+ getValueFromValidHexChar((char) input[start + i + 1]));
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-asterixdb-hyracks/blob/26c3b536/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/bytes/HexPrinter.java
----------------------------------------------------------------------
diff --git a/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/bytes/HexPrinter.java b/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/bytes/HexPrinter.java
new file mode 100644
index 0000000..5a9c064
--- /dev/null
+++ b/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/bytes/HexPrinter.java
@@ -0,0 +1,48 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.hyracks.util.bytes;
+
+import java.io.IOException;
+
+public class HexPrinter {
+ public enum CASE {
+ LOWER_CASE,
+ UPPER_CASE,
+ }
+
+ public static byte hex(int i, CASE c) {
+ switch (c) {
+ case LOWER_CASE:
+ return (byte) (i < 10 ? i + '0' : i + ('a' - 10));
+ case UPPER_CASE:
+ return (byte) (i < 10 ? i + '0' : i + ('A' - 10));
+ }
+ return Byte.parseByte(null);
+ }
+
+ public static Appendable printHexString(byte[] bytes, int start, int length, Appendable appendable)
+ throws IOException {
+ for (int i = 0; i < length; ++i) {
+ appendable.append((char) hex((bytes[start + i] >>> 4) & 0x0f, CASE.UPPER_CASE));
+ appendable.append((char) hex((bytes[start + i] & 0x0f), CASE.UPPER_CASE));
+ }
+ return appendable;
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-asterixdb-hyracks/blob/26c3b536/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/encoding/VarLenIntEncoderDecoder.java
----------------------------------------------------------------------
diff --git a/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/encoding/VarLenIntEncoderDecoder.java b/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/encoding/VarLenIntEncoderDecoder.java
new file mode 100644
index 0000000..5a716b4
--- /dev/null
+++ b/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/encoding/VarLenIntEncoderDecoder.java
@@ -0,0 +1,145 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.hyracks.util.encoding;
+
+import java.io.DataInput;
+import java.io.IOException;
+
+/**
+ * Encodes non-negative integers in a variable-length byte format.
+ *
+ * Each byte stores seven bits of the number. The first bit of each byte tells whether it is the last byte.
+ * Specifically, if the first bit is set, then we need to shift the current value by seven and
+ * continue to read the next byte until we meet a byte whose first bit is unset.
+ *
+ * e.g. if the number is < 128, it will be stored using one byte and the byte value is kept as the original.
+ * To store the number 255 (0xff), it will be encoded as [0x81,0x7f]. To decode that value, it reads the 0x81
+ * to know that the current value is (0x81 & 0x7f) = 0x01, and the first bit tells that there are more bytes to
+ * be read. When it meets 0x7f, whose first bit is unset, it knows that it is the final byte to decode.
+ * Finally it will return (0x01 << 7) + 0x7f == 255.
+ *
+ */
+public class VarLenIntEncoderDecoder {
+    // The boundaries are spelled out in decimal because that makes their magnitude easier to grasp.
+    public static final int BOUND_ONE_BYTE = 128; // 1 << 7
+    public static final int BOUND_TWO_BYTE = 16384; // 1 << 14
+    public static final int BOUND_THREE_BYTE = 2097152; // 1 << 21
+    public static final int BOUND_FOUR_BYTE = 268435456; // 1 << 28
+    public static final int BOUND_FIVE_BYTE = Integer.MAX_VALUE;
+
+    public static final int ENCODE_MASK = 0x0000007F;
+    public static final byte CONTINUE_CHUNK = (byte) 0x80;
+    public static final byte DECODE_MASK = 0x7F;
+
+    /**
+     * Computes how many bytes the encoded form of a value occupies.
+     *
+     * @param length the value to be encoded
+     * @return the number of bytes (1 to 5) that {@link #encode(int, byte[], int)} writes for it
+     * @throws IllegalArgumentException if {@code length} is negative
+     */
+    public static int getBytesRequired(int length) {
+        if (length < 0) {
+            throw new IllegalArgumentException("The length must be an non-negative value");
+        }
+
+        // One byte is always needed; every further 7-bit group costs one more.
+        int byteCount = 1;
+        for (int rest = length; rest > ENCODE_MASK; rest >>>= 7) {
+            byteCount++;
+        }
+        return byteCount;
+    }
+
+    /**
+     * Reads one encoded value from a stream.
+     *
+     * @param in the stream positioned at the first byte of the encoded value
+     * @return the decoded value
+     * @throws IOException if reading from {@code in} fails
+     */
+    public static int decode(DataInput in) throws IOException {
+        int sum = 0;
+        byte b = in.readByte();
+        // A set top bit makes the byte negative; such a byte is followed by more bytes.
+        while (b < 0) {
+            sum = (sum + (b & DECODE_MASK)) << 7;
+            b = in.readByte();
+        }
+        return sum + b;
+    }
+
+    /**
+     * Reads one encoded value from a byte array.
+     *
+     * @param srcBytes the array holding the encoded value
+     * @param startPos offset of the first byte of the encoded value
+     * @return the decoded value
+     */
+    public static int decode(byte[] srcBytes, int startPos) {
+        int sum = 0;
+        int cursor = startPos;
+        byte b = srcBytes[cursor];
+        while (b < 0) {
+            sum = (sum + (b & DECODE_MASK)) << 7;
+            b = srcBytes[++cursor];
+        }
+        return sum + b;
+    }
+
+    /**
+     * Writes the encoded form of a value into a byte array, most significant group first.
+     *
+     * @param lengthVal the value to encode
+     * @param destBytes the destination array
+     * @param startPos offset at which the first encoded byte is written
+     * @return the number of bytes written
+     * @throws IllegalArgumentException if {@code lengthVal} is negative
+     */
+    public static int encode(int lengthVal, byte[] destBytes, int startPos) {
+        if (lengthVal < 0) {
+            throw new IllegalArgumentException("The length must be an non-negative value");
+        }
+
+        // Emit the 7-bit groups, least significant group first.
+        int end = startPos;
+        int remaining = lengthVal;
+        while (remaining > ENCODE_MASK) {
+            destBytes[end++] = (byte) (remaining & ENCODE_MASK);
+            remaining >>>= 7;
+        }
+        destBytes[end++] = (byte) remaining;
+
+        // Reverse in place so that decoding can accumulate from the most significant group.
+        for (int lo = startPos, hi = end - 1; lo < hi; lo++, hi--) {
+            byte tmp = destBytes[lo];
+            destBytes[lo] = destBytes[hi];
+            destBytes[hi] = tmp;
+        }
+
+        // Every byte except the last carries the continuation flag.
+        int last = end - 1;
+        for (int p = startPos; p < last; p++) {
+            destBytes[p] |= CONTINUE_CHUNK;
+        }
+        destBytes[last] &= ENCODE_MASK;
+        return end - startPos;
+    }
+
+    /**
+     * @return a new stateful decoder; call {@link VarLenIntDecoder#reset(byte[], int)} before using it
+     */
+    public static VarLenIntDecoder createDecoder() {
+        return new VarLenIntDecoder();
+    }
+
+    /**
+     * Stateful decoder that remembers its position, which makes it convenient to decode several
+     * consecutive values from the same array.
+     */
+    public static class VarLenIntDecoder {
+
+        private byte[] bytes = null;
+        private int pos = 0;
+
+        /**
+         * Points the decoder at a new array and offset.
+         *
+         * @param bytes the array to decode from
+         * @param pos offset of the first encoded value
+         * @return this decoder
+         */
+        public VarLenIntDecoder reset(byte[] bytes, int pos) {
+            this.bytes = bytes;
+            this.pos = pos;
+            return this;
+        }
+
+        /**
+         * Decodes the value at the current position and advances past it.
+         *
+         * @return the int value
+         */
+        public int decode() {
+            int sum = 0;
+            byte b = bytes[pos];
+            while (b < 0) {
+                sum = (sum + (b & DECODE_MASK)) << 7;
+                b = bytes[++pos];
+            }
+            pos++;
+            return sum + b;
+        }
+
+        /**
+         * @return the offset of the next byte to be decoded
+         */
+        public int getPos() {
+            return pos;
+        }
+
+    }
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/incubator-asterixdb-hyracks/blob/26c3b536/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringReader.java
----------------------------------------------------------------------
diff --git a/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringReader.java b/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringReader.java
new file mode 100644
index 0000000..3cd0300
--- /dev/null
+++ b/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringReader.java
@@ -0,0 +1,131 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.hyracks.util.string;
+
+import java.io.DataInput;
+import java.io.EOFException;
+import java.io.IOException;
+import java.io.Serializable;
+import java.io.UTFDataFormatException;
+
+import org.apache.hyracks.util.encoding.VarLenIntEncoderDecoder;
+
+public class UTF8StringReader implements Serializable {
+
+    // Scratch buffers, reused between calls and grown on demand.
+    private byte[] bytearr = null;
+    private char[] chararr = null;
+
+    /**
+     * Reads a string from <code>in</code>. The string is stored as a variable-length encoded
+     * byte count (see {@link VarLenIntEncoderDecoder}) followed by that many bytes of
+     * <a href="DataInput.html#modified-utf-8">modified UTF-8</a> data, i.e. the same character
+     * encoding that <code>DataInput.readUTF</code> uses, but with a different length prefix.
+     *
+     * @param in a data input stream.
+     * @return the decoded string.
+     * @throws EOFException if the input stream reaches the end
+     *             before all the bytes have been read.
+     * @throws IOException if another I/O error occurs.
+     * @throws UTFDataFormatException if the bytes do not represent a
+     *             valid modified UTF-8 encoding of a string.
+     */
+    public final String readUTF(DataInput in) throws IOException {
+        final int utflen = VarLenIntEncoderDecoder.decode(in);
+
+        if (bytearr == null || bytearr.length < utflen) {
+            // Over-allocate so that slightly longer strings do not force another allocation.
+            bytearr = new byte[utflen * 2];
+            chararr = new char[utflen * 2];
+        }
+
+        in.readFully(bytearr, 0, utflen);
+
+        int count = 0;
+        int chararr_count = 0;
+        while (count < utflen) {
+            final int c = bytearr[count] & 0xff;
+            if (c < 0x80) {
+                /* 0xxxxxxx*/
+                count++;
+                chararr[chararr_count++] = (char) c;
+            } else if ((c & 0xE0) == 0xC0) {
+                /* 110x xxxx 10xx xxxx*/
+                count += 2;
+                if (count > utflen) {
+                    throw new UTFDataFormatException("malformed input: partial character at end");
+                }
+                final int char2 = bytearr[count - 1];
+                if ((char2 & 0xC0) != 0x80) {
+                    throw new UTFDataFormatException("malformed input around byte " + count);
+                }
+                chararr[chararr_count++] = (char) (((c & 0x1F) << 6) | (char2 & 0x3F));
+            } else if ((c & 0xF0) == 0xE0) {
+                /* 1110 xxxx 10xx xxxx 10xx xxxx */
+                count += 3;
+                if (count > utflen) {
+                    throw new UTFDataFormatException("malformed input: partial character at end");
+                }
+                final int char2 = bytearr[count - 2];
+                final int char3 = bytearr[count - 1];
+                if ((char2 & 0xC0) != 0x80 || (char3 & 0xC0) != 0x80) {
+                    throw new UTFDataFormatException("malformed input around byte " + (count - 1));
+                }
+                chararr[chararr_count++] = (char) (((c & 0x0F) << 12) | ((char2 & 0x3F) << 6) | (char3 & 0x3F));
+            } else {
+                /* 10xx xxxx, 1111 xxxx */
+                throw new UTFDataFormatException("malformed input around byte " + count);
+            }
+        }
+        // The number of chars produced may be less than utflen
+        return new String(chararr, 0, chararr_count);
+    }
+}
http://git-wip-us.apache.org/repos/asf/incubator-asterixdb-hyracks/blob/26c3b536/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java
----------------------------------------------------------------------
diff --git a/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java b/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java
new file mode 100644
index 0000000..7929691
--- /dev/null
+++ b/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java
@@ -0,0 +1,422 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.hyracks.util.string;
+
+import java.io.ByteArrayOutputStream;
+import java.io.DataOutput;
+import java.io.DataOutputStream;
+import java.io.IOException;
+import java.io.OutputStream;
+
+import org.apache.hyracks.util.encoding.VarLenIntEncoderDecoder;
+
+/**
+ * A helper class for operating on UTF8 strings in Hyracks.
+ * Most of the code was migrated from asterix-fuzzyjoin and hyracks-storage-am-invertedindex.
+ */
+public class UTF8StringUtil {
+    /**
+     * Decodes the modified UTF-8 character whose first byte is at offset {@code s}.
+     *
+     * @param b the array holding the encoded characters
+     * @param s offset of the first byte of the character
+     * @return the decoded char
+     * @throws ArrayIndexOutOfBoundsException if {@code s} is at or beyond the end of {@code b}
+     * @throws IllegalArgumentException if the byte at {@code s} does not start a 1-, 2- or 3-byte character
+     */
+    public static char charAt(byte[] b, int s) {
+        if (s >= b.length) {
+            throw new ArrayIndexOutOfBoundsException(
+                    "Start offset " + s + " is beyond the end of the array (length " + b.length + ")");
+        }
+        int c = b[s] & 0xff;
+        switch (c >> 4) {
+            case 0:
+            case 1:
+            case 2:
+            case 3:
+            case 4:
+            case 5:
+            case 6:
+            case 7:
+                return (char) c;
+
+            case 12:
+            case 13:
+                return (char) (((c & 0x1F) << 6) | ((b[s + 1]) & 0x3F));
+
+            case 14:
+                return (char) (((c & 0x0F) << 12) | (((b[s + 1]) & 0x3F) << 6) | (((b[s + 2]) & 0x3F) << 0));
+
+            default:
+                throw new IllegalArgumentException();
+        }
+    }
+
+    /**
+     * Returns the number of bytes occupied by the modified UTF-8 character that starts at offset {@code s}.
+     *
+     * @param b the array holding the encoded characters
+     * @param s offset of the first byte of the character
+     * @return 1, 2 or 3
+     * @throws IllegalStateException if the byte at {@code s} does not start a 1-, 2- or 3-byte character
+     */
+    public static int charSize(byte[] b, int s) {
+        int c = b[s] & 0xff;
+        switch (c >> 4) {
+            case 0:
+            case 1:
+            case 2:
+            case 3:
+            case 4:
+            case 5:
+            case 6:
+            case 7:
+                return 1;
+
+            case 12:
+            case 13:
+                return 2;
+
+            case 14:
+                return 3;
+        }
+        throw new IllegalStateException();
+    }
+
+    /**
+     * Returns the number of bytes needed to store {@code c} in modified UTF-8.
+     * The char 0 takes two bytes, as in the other writers of this class.
+     */
+    public static int getModifiedUTF8Len(char c) {
+        if (c >= 0x0001 && c <= 0x007F) {
+            return 1;
+        } else if (c <= 0x07FF) {
+            return 2;
+        } else {
+            return 3;
+        }
+    }
+
+    /**
+     * Writes {@code c} to {@code dos} in modified UTF-8.
+     *
+     * @return the number of bytes written (1, 2 or 3)
+     * @throws IOException if the sink fails
+     */
+    public static int writeCharAsModifiedUTF8(char c, DataOutput dos) throws IOException {
+        if (c >= 0x0001 && c <= 0x007F) {
+            dos.writeByte(c);
+            return 1;
+        } else if (c <= 0x07FF) {
+            // A two-byte lead carries five payload bits; the value is unchanged because c <= 0x07FF here.
+            dos.writeByte((byte) (0xC0 | ((c >> 6) & 0x1F)));
+            dos.writeByte((byte) (0x80 | (c & 0x3F)));
+            return 2;
+        } else {
+            dos.writeByte((byte) (0xE0 | ((c >> 12) & 0x0F)));
+            dos.writeByte((byte) (0x80 | ((c >> 6) & 0x3F)));
+            dos.writeByte((byte) (0x80 | (c & 0x3F)));
+            return 3;
+        }
+    }
+
+    /**
+     * Writes {@code c} to {@code dos} in modified UTF-8.
+     *
+     * @return the number of bytes written (1, 2 or 3)
+     * @throws IOException if the sink fails
+     */
+    public static int writeCharAsModifiedUTF8(char c, OutputStream dos) throws IOException {
+        if (c >= 0x0001 && c <= 0x007F) {
+            dos.write(c);
+            return 1;
+        } else if (c <= 0x07FF) {
+            // A two-byte lead carries five payload bits; the value is unchanged because c <= 0x07FF here.
+            dos.write((byte) (0xC0 | ((c >> 6) & 0x1F)));
+            dos.write((byte) (0x80 | (c & 0x3F)));
+            return 2;
+        } else {
+            dos.write((byte) (0xE0 | ((c >> 12) & 0x0F)));
+            dos.write((byte) (0x80 | ((c >> 6) & 0x3F)));
+            dos.write((byte) (0x80 | (c & 0x3F)));
+            return 3;
+        }
+    }
+
+    /**
+     * Counts the chars of the length-prefixed string that starts at offset {@code s}.
+     *
+     * @return the number of chars (not bytes) in the string
+     */
+    public static int getStringLength(byte[] b, int s) {
+        int len = getUTFLength(b, s);
+        int pos = s + getNumBytesToStoreLength(len);
+        int end = pos + len;
+        int charCount = 0;
+        while (pos < end) {
+            charCount++;
+            pos += charSize(b, pos);
+        }
+        return charCount;
+    }
+
+    /**
+     * Decodes the length prefix at offset {@code s}.
+     *
+     * @return the number of bytes of character data that follow the prefix
+     */
+    public static int getUTFLength(byte[] b, int s) {
+        return VarLenIntEncoderDecoder.decode(b, s);
+    }
+
+    /**
+     * @return the number of bytes the length prefix occupies for a string of {@code strlen} bytes
+     */
+    public static int getNumBytesToStoreLength(int strlen) {
+        return VarLenIntEncoderDecoder.getBytesRequired(strlen);
+    }
+
+    /**
+     * Decodes the UTF-8 sequence (1 to 6 bytes) that starts at offset {@code s} into a code point.
+     *
+     * @return the code point, or 0 if the first byte is not a valid lead byte
+     */
+    public static int UTF8ToCodePoint(byte[] b, int s) {
+        if (b[s] >> 7 == 0) {
+            // 1 byte
+            return b[s];
+        } else if ((b[s] & 0xe0) == 0xc0) { /*0xe0 = 0b11100000*/
+            // 2 bytes
+            return ((int) (b[s] & 0x1f)) << 6 | /*0x3f = 0b00111111*/
+                    ((int) (b[s + 1] & 0x3f));
+        } else if ((b[s] & 0xf0) == 0xe0) {
+            // 3bytes
+            return ((int) (b[s] & 0xf)) << 12 | ((int) (b[s + 1] & 0x3f)) << 6 | ((int) (b[s + 2] & 0x3f));
+        } else if ((b[s] & 0xf8) == 0xf0) {
+            // 4bytes
+            return ((int) (b[s] & 0x7)) << 18 | ((int) (b[s + 1] & 0x3f)) << 12 | ((int) (b[s + 2] & 0x3f)) << 6
+                    | ((int) (b[s + 3] & 0x3f));
+        } else if ((b[s] & 0xfc) == 0xf8) {
+            // 5bytes
+            return ((int) (b[s] & 0x3)) << 24 | ((int) (b[s + 1] & 0x3f)) << 18 | ((int) (b[s + 2] & 0x3f)) << 12
+                    | ((int) (b[s + 3] & 0x3f)) << 6 | ((int) (b[s + 4] & 0x3f));
+        } else if ((b[s] & 0xfe) == 0xfc) {
+            // 6bytes
+            return ((int) (b[s] & 0x1)) << 30 | ((int) (b[s + 1] & 0x3f)) << 24 | ((int) (b[s + 2] & 0x3f)) << 18
+                    | ((int) (b[s + 3] & 0x3f)) << 12 | ((int) (b[s + 4] & 0x3f)) << 6
+                    | ((int) (b[s + 5] & 0x3f));
+        }
+        return 0;
+    }
+
+    /**
+     * Encodes a code point as a UTF-8 sequence of 1 to 6 bytes, written at the start of {@code outputUTF8}.
+     *
+     * @param c the code point; expected to be non-negative (a negative value is not rejected and takes
+     *            the one-byte path)
+     * @param outputUTF8 the destination; must have room for the encoded sequence
+     * @return the number of bytes written
+     */
+    public static int codePointToUTF8(int c, byte[] outputUTF8) {
+        if (c < 0x80) {
+            outputUTF8[0] = (byte) (c & 0x7F /* mask 7 lsb: 0b1111111 */);
+            return 1;
+        } else if (c < 0x0800) {
+            outputUTF8[0] = (byte) (c >> 6 & 0x1F | 0xC0);
+            outputUTF8[1] = (byte) (c & 0x3F | 0x80);
+            return 2;
+        } else if (c < 0x010000) {
+            outputUTF8[0] = (byte) (c >> 12 & 0x0F | 0xE0);
+            outputUTF8[1] = (byte) (c >> 6 & 0x3F | 0x80);
+            outputUTF8[2] = (byte) (c & 0x3F | 0x80);
+            return 3;
+        } else if (c < 0x200000) {
+            outputUTF8[0] = (byte) (c >> 18 & 0x07 | 0xF0);
+            outputUTF8[1] = (byte) (c >> 12 & 0x3F | 0x80);
+            outputUTF8[2] = (byte) (c >> 6 & 0x3F | 0x80);
+            outputUTF8[3] = (byte) (c & 0x3F | 0x80);
+            return 4;
+        } else if (c < 0x4000000) {
+            outputUTF8[0] = (byte) (c >> 24 & 0x03 | 0xF8);
+            outputUTF8[1] = (byte) (c >> 18 & 0x3F | 0x80);
+            outputUTF8[2] = (byte) (c >> 12 & 0x3F | 0x80);
+            outputUTF8[3] = (byte) (c >> 6 & 0x3F | 0x80);
+            outputUTF8[4] = (byte) (c & 0x3F | 0x80);
+            return 5;
+        }
+        // Every remaining int (0x4000000 .. Integer.MAX_VALUE) needs six bytes. This used to be guarded by
+        // "c < 0x80000000", but that int literal is Integer.MIN_VALUE, so the branch could never be taken
+        // and such values were silently reported as 0 bytes written.
+        outputUTF8[0] = (byte) (c >> 30 & 0x01 | 0xFC);
+        outputUTF8[1] = (byte) (c >> 24 & 0x3F | 0x80);
+        outputUTF8[2] = (byte) (c >> 18 & 0x3F | 0x80);
+        outputUTF8[3] = (byte) (c >> 12 & 0x3F | 0x80);
+        outputUTF8[4] = (byte) (c >> 6 & 0x3F | 0x80);
+        outputUTF8[5] = (byte) (c & 0x3F | 0x80);
+        return 6;
+    }
+
+    /**
+     * Compute the normalized key of the UTF8 string.
+     * The normalized key in Hyracks is mainly used to speed up the comparison between pointable data.
+     * In the UTF8StringPTR case, we compute the integer value by using the first 2 chars.
+     * The comparator will first use this integer to get the result ( <,>, or =), it will check
+     * the actual bytes only if the normalized key is equal. Thus this normalized key must be
+     * consistent with the comparison result.
+     *
+     * @param bytes the array holding the length-prefixed string
+     * @param start offset of the length prefix
+     * @return a non-negative key built from the first two chars; missing chars count as 0
+     */
+    public static int normalize(byte[] bytes, int start) {
+        int len = getUTFLength(bytes, start);
+        long nk = 0;
+        int offset = start + getNumBytesToStoreLength(len);
+        // len is a byte count, so the end of the string has to be tracked as a byte offset. Comparing the
+        // char index with len would read past the string when it holds a single multi-byte char.
+        int end = offset + len;
+        for (int i = 0; i < 2; ++i) {
+            nk <<= 16;
+            if (offset < end) {
+                nk += ((int) charAt(bytes, offset)) & 0xffff;
+                offset += charSize(bytes, offset);
+            }
+        }
+        return (int) (nk >> 1); // make it always positive.
+    }
+
+    /**
+     * Compares two length-prefixed strings char by char.
+     *
+     * @return a negative value, zero or a positive value, as for {@code Comparable.compareTo}
+     */
+    public static int compareTo(byte[] thisBytes, int thisStart, byte[] thatBytes, int thatStart) {
+        return compareTo(thisBytes, thisStart, thatBytes, thatStart, false, false);
+    }
+
+    /**
+     * This function provides the raw bytes-based comparison for UTF8 strings.
+     * Note that the comparison may not deliver the correct ordering for certain languages that include 2 or 3 bytes characters.
+     * But it works for single-byte character languages.
+     */
+    public static int rawByteCompareTo(byte[] thisBytes, int thisStart, byte[] thatBytes, int thatStart) {
+        return compareTo(thisBytes, thisStart, thatBytes, thatStart, false, true);
+    }
+
+    /**
+     * Compares two length-prefixed strings char by char after lower-casing each char.
+     */
+    public static int lowerCaseCompareTo(byte[] thisBytes, int thisStart, byte[] thatBytes, int thatStart) {
+        return compareTo(thisBytes, thisStart, thatBytes, thatStart, true, false);
+    }
+
+    /**
+     * Hashes a length-prefixed string as {@code h = (coefficient * h + ch) % r} over its chars.
+     */
+    public static int hash(byte[] bytes, int start, int coefficient, int r) {
+        return hash(bytes, start, false, false, coefficient, r);
+    }
+
+    /**
+     * Hashes a length-prefixed string with coefficient 31 and modulus {@code Integer.MAX_VALUE}.
+     */
+    public static int hash(byte[] bytes, int start) {
+        return hash(bytes, start, false, false, 31, Integer.MAX_VALUE);
+    }
+
+    /**
+     * This function provides the raw bytes-based hash function for UTF8 strings.
+     * Note that the hash values may not deliver the correct ordering for certain languages that include 2 or 3 bytes characters.
+     * But it works for single-byte character languages.
+     */
+    public static int rawBytehash(byte[] bytes, int start) {
+        return hash(bytes, start, false, true, 31, Integer.MAX_VALUE);
+    }
+
+    /**
+     * Hashes a length-prefixed string after lower-casing each char.
+     */
+    public static int lowerCaseHash(byte[] bytes, int start) {
+        return hash(bytes, start, true, false, 31, Integer.MAX_VALUE);
+    }
+
+    /**
+     * Appends the chars of a length-prefixed string to {@code builder}.
+     *
+     * @return {@code builder}, to allow chaining
+     */
+    public static StringBuilder toString(StringBuilder builder, byte[] bytes, int start) {
+        int utfLen = getUTFLength(bytes, start);
+        int offset = getNumBytesToStoreLength(utfLen);
+        while (utfLen > 0) {
+            char c = charAt(bytes, start + offset);
+            builder.append(c);
+            // Advance by the bytes actually consumed rather than by the length re-derived from the char.
+            int cLen = charSize(bytes, start + offset);
+            offset += cLen;
+            utfLen -= cLen;
+        }
+        return builder;
+    }
+
+    /** Writes a length-prefixed string surrounded by double quotes; {@code l} is not used. */
+    public static void printUTF8StringWithQuotes(byte[] b, int s, int l, OutputStream os) throws IOException {
+        printUTF8String(b, s, l, os, true);
+    }
+
+    /** Writes a length-prefixed string without surrounding quotes; {@code l} is not used. */
+    public static void printUTF8StringNoQuotes(byte[] b, int s, int l, OutputStream os) throws IOException {
+        printUTF8String(b, s, l, os, false);
+    }
+
+    /** Writes {@code str} in modified UTF-8 surrounded by double quotes. */
+    public static void printUTF8StringWithQuotes(String str, OutputStream os) throws IOException {
+        printUTF8String(str, os, true);
+    }
+
+    /** Writes {@code str} in modified UTF-8 without surrounding quotes. */
+    public static void printUTF8StringNoQuotes(String str, OutputStream os) throws IOException {
+        printUTF8String(str, os, false);
+    }
+
+    /**
+     * Encodes a length prefix into {@code bytes} at offset {@code start}.
+     *
+     * @return the number of bytes written
+     */
+    public static int encodeUTF8Length(int length, byte[] bytes, int start) {
+        return VarLenIntEncoderDecoder.encode(length, bytes, start);
+    }
+
+    /**
+     * Encodes a length prefix and writes it to {@code out}.
+     *
+     * @param bytes scratch array used for the encoding; it is overwritten from offset 0
+     * @return the number of bytes written
+     */
+    public static int writeUTF8Length(int length, byte[] bytes, DataOutput out) throws IOException {
+        int nbytes = encodeUTF8Length(length, bytes, 0);
+        out.write(bytes, 0, nbytes);
+        return nbytes;
+    }
+
+    /**
+     * Copies the character bytes of a length-prefixed string to {@code os}, putting a backslash in front of
+     * every backslash and double quote. The parameter {@code l} is not used; the length comes from the prefix.
+     */
+    private static void printUTF8String(byte[] b, int s, int l, OutputStream os, boolean useQuotes) throws IOException {
+        int stringLength = getUTFLength(b, s);
+        int position = s + getNumBytesToStoreLength(stringLength);
+        int maxPosition = position + stringLength;
+        if (useQuotes) {
+            os.write('\"');
+        }
+        while (position < maxPosition) {
+            char c = charAt(b, position);
+            switch (c) {
+                // escape
+                case '\\':
+                case '"':
+                    os.write('\\');
+                    break;
+            }
+            int sz = charSize(b, position);
+            while (sz > 0) {
+                os.write(b[position]);
+                position++;
+                sz--;
+            }
+        }
+        if (useQuotes) {
+            os.write('\"');
+        }
+    }
+
+    /** Writes the chars of {@code string} in modified UTF-8, optionally surrounded by double quotes. */
+    private static void printUTF8String(String string, OutputStream os, boolean useQuotes) throws IOException {
+        if (useQuotes) {
+            os.write('\"');
+        }
+        for (int i = 0; i < string.length(); i++) {
+            char ch = string.charAt(i);
+            writeCharAsModifiedUTF8(ch, os);
+        }
+        if (useQuotes) {
+            os.write('\"');
+        }
+    }
+
+    /**
+     * Shared implementation of the comparison methods. The strings are walked one character at a time;
+     * when all compared characters match, the difference of the byte lengths decides.
+     *
+     * NOTE(review): in raw-byte mode only the first byte of every character is inspected, yet the cursor
+     * still advances by the whole character. Two strings that differ only in a continuation byte therefore
+     * compare as equal. Left unchanged here because fixing it alters the ordering - confirm with callers.
+     */
+    private static int compareTo(byte[] thisBytes, int thisStart, byte[] thatBytes, int thatStart,
+            boolean useLowerCase, boolean useRawByte) {
+        int utflen1 = getUTFLength(thisBytes, thisStart);
+        int utflen2 = getUTFLength(thatBytes, thatStart);
+
+        int c1 = 0;
+        int c2 = 0;
+
+        int s1Start = thisStart + getNumBytesToStoreLength(utflen1);
+        int s2Start = thatStart + getNumBytesToStoreLength(utflen2);
+
+        while (c1 < utflen1 && c2 < utflen2) {
+            char ch1, ch2;
+            if (useRawByte) {
+                ch1 = (char) thisBytes[s1Start + c1];
+                ch2 = (char) thatBytes[s2Start + c2];
+            } else {
+                ch1 = (charAt(thisBytes, s1Start + c1));
+                ch2 = (charAt(thatBytes, s2Start + c2));
+
+                if (useLowerCase) {
+                    ch1 = Character.toLowerCase(ch1);
+                    ch2 = Character.toLowerCase(ch2);
+                }
+            }
+
+            if (ch1 != ch2) {
+                return ch1 - ch2;
+            }
+            c1 += charSize(thisBytes, s1Start + c1);
+            c2 += charSize(thatBytes, s2Start + c2);
+        }
+        return utflen1 - utflen2;
+    }
+
+    /**
+     * Shared implementation of the hash methods: folds every character into
+     * {@code h = (coefficient * h + ch) % r}. In raw-byte mode only the first byte of each character is used.
+     */
+    private static int hash(byte[] bytes, int start, boolean useLowerCase, boolean useRawByte, int coefficient, int r) {
+        int h = 0;
+        int utflen = getUTFLength(bytes, start);
+        int sStart = start + getNumBytesToStoreLength(utflen);
+        int c = 0;
+
+        while (c < utflen) {
+            char ch;
+            if (useRawByte) {
+                ch = (char) bytes[sStart + c];
+            } else {
+                ch = charAt(bytes, sStart + c);
+                if (useLowerCase) {
+                    ch = Character.toLowerCase(ch);
+                }
+            }
+            h = (coefficient * h + ch) % r;
+            c += charSize(bytes, sStart + c);
+        }
+        return h;
+    }
+
+    /**
+     * Serializes {@code string} with a {@link UTF8StringWriter}.
+     *
+     * @return the length prefix followed by the modified UTF-8 bytes
+     */
+    public static byte[] writeStringToBytes(String string) {
+        UTF8StringWriter writer = new UTF8StringWriter();
+        ByteArrayOutputStream bos = new ByteArrayOutputStream();
+        DataOutputStream dos = new DataOutputStream(bos);
+        try {
+            writer.writeUTF8(string, dos);
+        } catch (IOException e) {
+            throw new RuntimeException(e);
+        }
+        return bos.toByteArray();
+    }
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/incubator-asterixdb-hyracks/blob/26c3b536/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringWriter.java
----------------------------------------------------------------------
diff --git a/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringWriter.java b/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringWriter.java
new file mode 100644
index 0000000..021c02f
--- /dev/null
+++ b/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringWriter.java
@@ -0,0 +1,113 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.hyracks.util.string;
+
+import java.io.DataOutput;
+import java.io.IOException;
+import java.io.Serializable;
+
+import org.apache.hyracks.util.encoding.VarLenIntEncoderDecoder;
+
+public class UTF8StringWriter implements Serializable {
+    // Scratch buffer holding the length prefix plus the encoded chars; reused between calls.
+    private byte[] tempBytes;
+
+    /**
+     * Writes {@code str} as a variable-length encoded byte count followed by its modified UTF-8 bytes.
+     *
+     * @param str the chars to write
+     * @param out the sink that receives the prefix and the encoded chars in a single write
+     * @throws IOException if the sink fails
+     */
+    public final void writeUTF8(CharSequence str, DataOutput out) throws IOException {
+        final int strlen = str.length();
+
+        // First pass: size of the encoded chars, needed for the prefix and the buffer.
+        int utflen = 0;
+        for (int i = 0; i < strlen; i++) {
+            utflen += UTF8StringUtil.getModifiedUTF8Len(str.charAt(i));
+        }
+        ensureTempSize(utflen);
+
+        // Second pass: prefix first, then every char.
+        int count = VarLenIntEncoderDecoder.encode(utflen, tempBytes, 0);
+        for (int i = 0; i < strlen; i++) {
+            count += writeToBytes(tempBytes, count, str.charAt(i));
+        }
+        out.write(tempBytes, 0, count);
+    }
+
+    /**
+     * Writes a range of a char array as a variable-length encoded byte count followed by its modified
+     * UTF-8 bytes.
+     *
+     * @param buffer the array holding the chars
+     * @param start index of the first char to write
+     * @param length number of chars to write
+     * @param out the sink that receives the prefix and the encoded chars in a single write
+     * @throws IOException if the sink fails
+     */
+    public final void writeUTF8(char[] buffer, int start, int length, DataOutput out) throws IOException {
+        final int limit = start + length;
+
+        // First pass: size of the encoded chars, needed for the prefix and the buffer.
+        int utflen = 0;
+        for (int i = start; i < limit; i++) {
+            utflen += UTF8StringUtil.getModifiedUTF8Len(buffer[i]);
+        }
+        ensureTempSize(utflen);
+
+        // Second pass: prefix first, then every char.
+        int count = VarLenIntEncoderDecoder.encode(utflen, tempBytes, 0);
+        for (int i = start; i < limit; i++) {
+            count += writeToBytes(tempBytes, count, buffer[i]);
+        }
+        out.write(tempBytes, 0, count);
+    }
+
+    /**
+     * Encodes one char into {@code tempBytes} at offset {@code count}.
+     *
+     * @return the number of bytes written (1, 2 or 3)
+     */
+    private static int writeToBytes(byte[] tempBytes, int count, char c) {
+        if ((c >= 0x0001) && (c <= 0x007F)) {
+            tempBytes[count] = (byte) c;
+            return 1;
+        }
+        if (c <= 0x07FF) {
+            // Also taken for the char 0, which modified UTF-8 stores in two bytes.
+            tempBytes[count] = (byte) (0xC0 | ((c >> 6) & 0x1F));
+            tempBytes[count + 1] = (byte) (0x80 | (c & 0x3F));
+            return 2;
+        }
+        tempBytes[count] = (byte) (0xE0 | ((c >> 12) & 0x0F));
+        tempBytes[count + 1] = (byte) (0x80 | ((c >> 6) & 0x3F));
+        tempBytes[count + 2] = (byte) (0x80 | (c & 0x3F));
+        return 3;
+    }
+
+    /** Makes sure the scratch buffer can hold {@code utflen} bytes plus a length prefix of up to five bytes. */
+    private void ensureTempSize(int utflen) {
+        final int required = utflen + 5;
+        if (tempBytes == null || tempBytes.length < required) {
+            tempBytes = new byte[required];
+        }
+    }
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/incubator-asterixdb-hyracks/blob/26c3b536/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/encoding/VarLenIntEncoderDecoderTest.java
----------------------------------------------------------------------
diff --git a/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/encoding/VarLenIntEncoderDecoderTest.java b/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/encoding/VarLenIntEncoderDecoderTest.java
new file mode 100644
index 0000000..193dca6
--- /dev/null
+++ b/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/encoding/VarLenIntEncoderDecoderTest.java
@@ -0,0 +1,87 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.hyracks.util.encoding;
+
+import static org.apache.hyracks.util.encoding.VarLenIntEncoderDecoder.BOUND_FIVE_BYTE;
+import static org.apache.hyracks.util.encoding.VarLenIntEncoderDecoder.BOUND_FOUR_BYTE;
+import static org.apache.hyracks.util.encoding.VarLenIntEncoderDecoder.BOUND_ONE_BYTE;
+import static org.apache.hyracks.util.encoding.VarLenIntEncoderDecoder.BOUND_THREE_BYTE;
+import static org.apache.hyracks.util.encoding.VarLenIntEncoderDecoder.BOUND_TWO_BYTE;
+import static org.apache.hyracks.util.encoding.VarLenIntEncoderDecoder.VarLenIntDecoder;
+import static org.apache.hyracks.util.encoding.VarLenIntEncoderDecoder.createDecoder;
+import static org.apache.hyracks.util.encoding.VarLenIntEncoderDecoder.decode;
+import static org.apache.hyracks.util.encoding.VarLenIntEncoderDecoder.encode;
+import static org.apache.hyracks.util.encoding.VarLenIntEncoderDecoder.getBytesRequired;
+import static org.junit.Assert.assertEquals;
+
+import java.io.ByteArrayInputStream;
+import java.io.DataInputStream;
+import java.io.IOException;
+
+import org.junit.Test;
+
+public class VarLenIntEncoderDecoderTest {
+
+    // bounds[i] is the smallest value that needs i + 1 bytes; the last entry is the largest encodable value.
+    int[] bounds = new int[] { 0, BOUND_ONE_BYTE, BOUND_TWO_BYTE, BOUND_THREE_BYTE, BOUND_FOUR_BYTE, BOUND_FIVE_BYTE };
+
+    /** Both ends of every size class must report the same byte count. */
+    @Test
+    public void testGetBytesRequired() throws Exception {
+        for (int i = 0; i + 1 < bounds.length; i++) {
+            int expectedBytes = i + 1;
+            assertEquals(expectedBytes, getBytesRequired(bounds[i]));
+            assertEquals(expectedBytes, getBytesRequired(bounds[i + 1] - 1));
+        }
+    }
+
+    /** Round-trips the values just below, at and just above every boundary, plus Integer.MAX_VALUE. */
+    @Test
+    public void testEncodeDecode() throws Exception {
+        final int startPos = 3;
+        final byte[] buffer = new byte[10];
+        for (int sizeClass = 1; sizeClass < bounds.length - 1; sizeClass++) {
+            int boundary = bounds[sizeClass];
+            testEncodeDecode(sizeClass, boundary - 1, buffer, startPos);
+            testEncodeDecode(sizeClass + 1, boundary, buffer, startPos);
+            testEncodeDecode(sizeClass + 1, boundary + 1, buffer, startPos);
+        }
+        // Integer.Max
+        testEncodeDecode(5, BOUND_FIVE_BYTE, buffer, startPos);
+    }
+
+    /** The stateful decoder must return consecutive values in the order they were encoded. */
+    @Test
+    public void testCreateDecoder() throws Exception {
+        final int firstPos = 1;
+        final byte[] buffer = new byte[100];
+        int writePos = firstPos;
+        for (int value : bounds) {
+            writePos += encode(value, buffer, writePos);
+        }
+
+        VarLenIntDecoder decoder = createDecoder();
+        decoder.reset(buffer, firstPos);
+        for (int value : bounds) {
+            assertEquals(value, decoder.decode());
+        }
+    }
+
+    /**
+     * Encodes {@code value} at {@code startPos}, checks the encoded size, and checks that both the
+     * array-based and the stream-based decoder give the value back.
+     */
+    protected void testEncodeDecode(int expectedBytes, int value, byte[] bytes, int startPos) throws IOException {
+        assertEquals(expectedBytes, encode(value, bytes, startPos));
+        assertEquals(value, decode(bytes, startPos));
+
+        DataInputStream stream = new DataInputStream(
+                new ByteArrayInputStream(bytes, startPos, bytes.length - startPos));
+        assertEquals(value, decode(stream));
+    }
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/incubator-asterixdb-hyracks/blob/26c3b536/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/string/UTF8StringReaderWriterTest.java
----------------------------------------------------------------------
diff --git a/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/string/UTF8StringReaderWriterTest.java b/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/string/UTF8StringReaderWriterTest.java
new file mode 100644
index 0000000..bfc1fa8
--- /dev/null
+++ b/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/string/UTF8StringReaderWriterTest.java
@@ -0,0 +1,90 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.hyracks.util.string;
+
+import static org.apache.hyracks.util.string.UTF8StringSample.EMPTY_STRING;
+import static org.apache.hyracks.util.string.UTF8StringSample.STRING_LEN_127;
+import static org.apache.hyracks.util.string.UTF8StringSample.STRING_LEN_128;
+import static org.apache.hyracks.util.string.UTF8StringSample.STRING_LEN_3;
+import static org.apache.hyracks.util.string.UTF8StringSample.STRING_LEN_LARGE;
+import static org.apache.hyracks.util.string.UTF8StringSample.STRING_LEN_LARGE_SUB_1;
+import static org.apache.hyracks.util.string.UTF8StringSample.STRING_LEN_MEDIUM;
+import static org.apache.hyracks.util.string.UTF8StringSample.STRING_LEN_MEDIUM_SUB_1;
+import static org.apache.hyracks.util.string.UTF8StringSample.STRING_UTF8_3;
+import static org.apache.hyracks.util.string.UTF8StringSample.STRING_UTF8_MIX;
+import static org.junit.Assert.assertEquals;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.DataInputStream;
+import java.io.DataOutputStream;
+import java.io.IOException;
+
+import org.junit.Test;
+
+/**
+ * Round-trip tests for {@link UTF8StringWriter} and {@link UTF8StringReader}: every sample string is
+ * written through both writer entry points (whole {@code String} and {@code char[]} range) and must
+ * read back unchanged.
+ */
+public class UTF8StringReaderWriterTest {
+
+    UTF8StringWriter writer = new UTF8StringWriter();
+    UTF8StringReader reader = new UTF8StringReader();
+
+    /** Empty, short, and the samples up to one character below the "medium" length. */
+    @Test
+    public void testWriterReader() throws IOException {
+        writeAndReadOneString(EMPTY_STRING);
+        writeAndReadOneString(STRING_LEN_3);
+
+        writeAndReadOneString(STRING_LEN_127);
+        writeAndReadOneString(STRING_LEN_128);
+        writeAndReadOneString(STRING_LEN_MEDIUM_SUB_1);
+    }
+
+    /** The "medium" sample and the sample one character below the "large" length. */
+    @Test
+    public void testMedium() throws IOException {
+        writeAndReadOneString(STRING_LEN_MEDIUM);
+        writeAndReadOneString(STRING_LEN_LARGE_SUB_1);
+    }
+
+    /** The "large" sample. */
+    @Test
+    public void testLarge() throws IOException {
+        writeAndReadOneString(STRING_LEN_LARGE);
+    }
+
+    /** Samples containing non-ASCII characters. */
+    @Test
+    public void testUTF8() throws IOException {
+        writeAndReadOneString(STRING_UTF8_3);
+        writeAndReadOneString(STRING_UTF8_MIX);
+    }
+
+    /**
+     * Writes {@code testString} twice into one output stream, first as a {@code String} and then as
+     * a {@code char[]} range, and asserts that reading each written region yields the original.
+     *
+     * @param testString the string to round-trip
+     * @throws IOException propagated from the writer or reader
+     */
+    private void writeAndReadOneString(String testString) throws IOException {
+        ByteArrayOutputStream bos = new ByteArrayOutputStream();
+        DataOutputStream dos = new DataOutputStream(bos);
+        writer.writeUTF8(testString, dos);
+
+        ByteArrayInputStream bis = new ByteArrayInputStream(bos.toByteArray(), 0, bos.size());
+        assertEquals(testString, reader.readUTF(new DataInputStream(bis)));
+
+        // The second copy is appended to the same stream, so it starts where the first one ended.
+        int lastOffset = bos.size();
+        char[] charArray = testString.toCharArray();
+        writer.writeUTF8(charArray, 0, charArray.length, dos);
+
+        // The third constructor argument is a length, not an end offset: pass only the number of
+        // bytes that belong to the second copy.
+        bis = new ByteArrayInputStream(bos.toByteArray(), lastOffset, bos.size() - lastOffset);
+        assertEquals(testString, reader.readUTF(new DataInputStream(bis)));
+    }
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/incubator-asterixdb-hyracks/blob/26c3b536/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/string/UTF8StringSample.java
----------------------------------------------------------------------
diff --git a/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/string/UTF8StringSample.java b/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/string/UTF8StringSample.java
new file mode 100644
index 0000000..3e6e984
--- /dev/null
+++ b/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/string/UTF8StringSample.java
@@ -0,0 +1,56 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.hyracks.util.string;
+
+import static org.apache.hyracks.util.encoding.VarLenIntEncoderDecoder.BOUND_THREE_BYTE;
+import static org.apache.hyracks.util.encoding.VarLenIntEncoderDecoder.BOUND_TWO_BYTE;
+
+import java.util.Arrays;
+
+/**
+ * Sample characters and strings shared by the UTF-8 string tests.
+ * <p>
+ * All samples are constants ({@code final}) so that one test cannot reassign a value another test
+ * relies on.
+ */
+public class UTF8StringSample {
+    public static final String EMPTY_STRING = "";
+
+    public static final char ONE_ASCII_CHAR = 'x';
+    public static final char ONE_UTF8_CHAR = 'à';
+
+    public static final String STRING_LEN_3 = "xyz";
+    public static final String STRING_UTF8_3 = "锟斤拷";
+    public static final String STRING_UTF8_MIX = "\uD841\uDF0E\uD841\uDF31锟X斤Y拷Zà"; // one, two, three, and four bytes
+    /** Same as {@link #STRING_UTF8_MIX} with the ASCII letters X, Y, Z in lower case. */
+    public static final String STRING_UTF8_MIX_LOWERCASE = "\uD841\uDF0E\uD841\uDF31锟x斤y拷zà";
+
+    public static final String STRING_LEN_127 = generateStringRepeatBy(ONE_ASCII_CHAR, 127);
+    public static final String STRING_LEN_128 = generateStringRepeatBy(ONE_ASCII_CHAR, 128);
+
+    // Lengths on and just below BOUND_TWO_BYTE.
+    public static final String STRING_LEN_MEDIUM_SUB_1 = generateStringRepeatBy(ONE_ASCII_CHAR, BOUND_TWO_BYTE - 1);
+    public static final String STRING_LEN_MEDIUM = generateStringRepeatBy(ONE_ASCII_CHAR, BOUND_TWO_BYTE);
+
+    // Lengths on and just below BOUND_THREE_BYTE.
+    public static final String STRING_LEN_LARGE_SUB_1 = generateStringRepeatBy(ONE_ASCII_CHAR, BOUND_THREE_BYTE - 1);
+    public static final String STRING_LEN_LARGE = generateStringRepeatBy(ONE_ASCII_CHAR, BOUND_THREE_BYTE);
+
+    /**
+     * Builds a string consisting of {@code times} copies of {@code c}.
+     *
+     * @param c     the character to repeat
+     * @param times the length of the resulting string; must not be negative
+     * @return a string of length {@code times} in which every character is {@code c}
+     * @throws NegativeArraySizeException if {@code times} is negative
+     */
+    public static String generateStringRepeatBy(char c, int times) {
+        char[] chars = new char[times];
+        Arrays.fill(chars, c);
+        return new String(chars);
+    }
+
+}
http://git-wip-us.apache.org/repos/asf/incubator-asterixdb-hyracks/blob/26c3b536/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/string/UTF8StringUtilTest.java
----------------------------------------------------------------------
diff --git a/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/string/UTF8StringUtilTest.java b/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/string/UTF8StringUtilTest.java
new file mode 100644
index 0000000..0e3ed5c
--- /dev/null
+++ b/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/string/UTF8StringUtilTest.java
@@ -0,0 +1,144 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.hyracks.util.string;
+
+import static org.apache.hyracks.util.string.UTF8StringUtil.writeStringToBytes;
+import static org.apache.hyracks.util.string.UTF8StringSample.STRING_LEN_127;
+import static org.apache.hyracks.util.string.UTF8StringSample.STRING_LEN_128;
+import static org.apache.hyracks.util.string.UTF8StringSample.STRING_LEN_MEDIUM;
+import static org.apache.hyracks.util.string.UTF8StringSample.STRING_UTF8_3;
+import static org.apache.hyracks.util.string.UTF8StringSample.STRING_UTF8_MIX;
+import static org.apache.hyracks.util.string.UTF8StringSample.STRING_UTF8_MIX_LOWERCASE;
+import static org.apache.hyracks.util.string.UTF8StringUtil.charAt;
+import static org.apache.hyracks.util.string.UTF8StringUtil.charSize;
+import static org.apache.hyracks.util.string.UTF8StringUtil.compareTo;
+import static org.apache.hyracks.util.string.UTF8StringUtil.getModifiedUTF8Len;
+import static org.apache.hyracks.util.string.UTF8StringUtil.getNumBytesToStoreLength;
+import static org.apache.hyracks.util.string.UTF8StringUtil.getStringLength;
+import static org.apache.hyracks.util.string.UTF8StringUtil.getUTFLength;
+import static org.apache.hyracks.util.string.UTF8StringUtil.lowerCaseCompareTo;
+import static org.apache.hyracks.util.string.UTF8StringUtil.lowerCaseHash;
+import static org.apache.hyracks.util.string.UTF8StringUtil.normalize;
+import static org.apache.hyracks.util.string.UTF8StringUtil.rawByteCompareTo;
+import static org.apache.hyracks.util.string.UTF8StringUtil.hash;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+import java.io.IOException;
+
+import org.junit.Test;
+
+/**
+ * Tests for the static helpers of {@link UTF8StringUtil}, run against byte buffers produced by
+ * {@code writeStringToBytes} and cross-checked with the equivalent {@link String} operations.
+ */
+public class UTF8StringUtilTest {
+
+    /**
+     * Walks the encoded form of the mixed sample character by character and checks that
+     * {@code charAt} and {@code charSize} agree with the Java {@code char} sequence.
+     */
+    @Test
+    public void testCharAtCharSizeGetLen() throws Exception {
+        char[] utf8Mix = STRING_UTF8_MIX.toCharArray();
+        byte[] buffer = writeStringToBytes(STRING_UTF8_MIX);
+        // Start right after the bytes that store the length.
+        int pos = getNumBytesToStoreLength(getUTFLength(buffer, 0));
+        for (char c : utf8Mix) {
+            assertEquals(c, charAt(buffer, pos));
+            assertEquals(getModifiedUTF8Len(c), charSize(buffer, pos));
+            pos += charSize(buffer, pos);
+        }
+    }
+
+    /** {@code getStringLength} must report the same number of chars as {@link String#length()}. */
+    @Test
+    public void testGetStringLength() throws Exception {
+        byte[] buffer = writeStringToBytes(STRING_UTF8_MIX);
+        assertEquals(STRING_UTF8_MIX.length(), getStringLength(buffer, 0));
+    }
+
+    /** Checks {@code compareTo} and {@code normalize} against {@link String#compareTo(String)}. */
+    @Test
+    public void testCompareToAndNormolize() throws Exception {
+        testCompare(STRING_UTF8_MIX, STRING_UTF8_MIX, OPTION.STANDARD);
+        testCompare(STRING_UTF8_3, STRING_UTF8_MIX, OPTION.STANDARD);
+        testCompare(STRING_LEN_MEDIUM, STRING_UTF8_MIX, OPTION.STANDARD);
+    }
+
+    /**
+     * Tells whether two comparison results point in the same direction.
+     *
+     * @param r1 first comparison result
+     * @param r2 second comparison result
+     * @return {@code true} if both are positive, both are negative, or both are zero
+     */
+    public boolean isSameSign(int r1, int r2) {
+        return Integer.signum(r1) == Integer.signum(r2);
+    }
+
+    /** Selects which comparison of {@link UTF8StringUtil} {@code testCompare} exercises. */
+    enum OPTION {STANDARD, RAW_BYTE, LOWERCASE}
+
+    /**
+     * Encodes both strings and asserts that the selected {@link UTF8StringUtil} comparison returns
+     * the same value as the corresponding {@link String} comparison.
+     *
+     * @param str1   left operand
+     * @param str2   right operand
+     * @param option which comparison to exercise
+     * @throws IOException propagated from {@code writeStringToBytes}
+     */
+    public void testCompare(String str1, String str2, OPTION option) throws IOException {
+        byte[] buffer1 = writeStringToBytes(str1);
+        byte[] buffer2 = writeStringToBytes(str2);
+
+        switch (option) {
+            case STANDARD:
+                int expected = str1.compareTo(str2);
+                assertEquals(expected, compareTo(buffer1, 0, buffer2, 0));
+                int n1 = normalize(buffer1, 0);
+                int n2 = normalize(buffer2, 0);
+                // Integer.compare instead of n1 - n2: a subtraction can overflow and flip the sign.
+                assertTrue(isSameSign(expected, Integer.compare(n1, n2)));
+                break;
+            case RAW_BYTE:
+                assertEquals(str1.compareTo(str2), rawByteCompareTo(buffer1, 0, buffer2, 0));
+                break;
+            case LOWERCASE:
+                assertEquals(str1.compareToIgnoreCase(str2), lowerCaseCompareTo(buffer1, 0, buffer2, 0));
+                break;
+            default:
+                throw new IllegalArgumentException("Unknown option: " + option);
+        }
+
+    }
+
+    /** Checks {@code rawByteCompareTo} on ASCII-only samples. */
+    @Test
+    public void testRawByteCompareTo() throws Exception {
+        testCompare(STRING_LEN_MEDIUM, STRING_LEN_MEDIUM, OPTION.RAW_BYTE);
+        testCompare(STRING_LEN_127, STRING_LEN_128, OPTION.RAW_BYTE);
+    }
+
+    /** Checks {@code lowerCaseCompareTo} against {@link String#compareToIgnoreCase(String)}. */
+    @Test
+    public void testLowerCaseCompareTo() throws Exception {
+        testCompare(STRING_LEN_127, STRING_LEN_128, OPTION.LOWERCASE);
+        testCompare(STRING_LEN_127, STRING_UTF8_MIX, OPTION.LOWERCASE);
+        testCompare(STRING_UTF8_MIX, STRING_UTF8_MIX_LOWERCASE, OPTION.LOWERCASE);
+        testCompare(STRING_UTF8_MIX_LOWERCASE, STRING_UTF8_MIX, OPTION.LOWERCASE);
+    }
+
+    /** Decoding the encoded mixed sample into a {@link StringBuilder} must reproduce it. */
+    @Test
+    public void testToString() throws Exception {
+
+        StringBuilder sb = new StringBuilder();
+        byte[] buffer = writeStringToBytes(STRING_UTF8_MIX);
+        assertEquals(STRING_UTF8_MIX, UTF8StringUtil.toString(sb, buffer, 0).toString());
+    }
+
+    /**
+     * Checks that {@code lowerCaseHash} agrees with {@code hash} of the lower-case sample, both for
+     * the lower-case sample itself and for its mixed-case counterpart, and that two members of the
+     * parameterized hash family give different results for the same input.
+     */
+    @Test
+    public void testHash() throws IOException {
+        byte[] buffer = writeStringToBytes(STRING_UTF8_MIX_LOWERCASE);
+        int lowerHash = hash(buffer, 0);
+
+        // Already lower-case input: case folding must not change the hash.
+        assertEquals(lowerHash, lowerCaseHash(buffer, 0));
+
+        // Mixed-case input: this is the case that actually exercises the lower-casing.
+        byte[] mixedCaseBuffer = writeStringToBytes(STRING_UTF8_MIX);
+        int upperHash = lowerCaseHash(mixedCaseBuffer, 0);
+        assertEquals(lowerHash, upperHash);
+
+        // NOTE(review): 7/8 and 297 are presumably the family's coefficient and modulus; confirm
+        // against UTF8StringUtil.hash(byte[], int, int, int).
+        int familyOne = hash(buffer, 0, 7, 297);
+        int familyTwo = hash(buffer, 0, 8, 297);
+        assertTrue(familyOne != familyTwo);
+    }
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/incubator-asterixdb-hyracks/blob/26c3b536/hyracks/pom.xml
----------------------------------------------------------------------
diff --git a/hyracks/pom.xml b/hyracks/pom.xml
index c1af7b9..61e06e4 100644
--- a/hyracks/pom.xml
+++ b/hyracks/pom.xml
@@ -96,6 +96,7 @@
</pluginRepositories>
<modules>
+ <module>hyracks-util</module>
<module>hyracks-ipc</module>
<module>hyracks-api</module>
<module>hyracks-comm</module>