You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by rx...@apache.org on 2016/03/02 00:39:17 UTC
[3/4] spark git commit: [SPARK-13548][BUILD] Move tags and unsafe
modules into common
http://git-wip-us.apache.org/repos/asf/spark/blob/b0ee7d43/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
----------------------------------------------------------------------
diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
new file mode 100644
index 0000000..87706d0
--- /dev/null
+++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
@@ -0,0 +1,1023 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.unsafe.types;
+
+import javax.annotation.Nonnull;
+import java.io.*;
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
+import java.nio.charset.StandardCharsets;
+import java.util.Arrays;
+import java.util.Map;
+
+import com.esotericsoftware.kryo.Kryo;
+import com.esotericsoftware.kryo.KryoSerializable;
+import com.esotericsoftware.kryo.io.Input;
+import com.esotericsoftware.kryo.io.Output;
+
+import org.apache.spark.unsafe.Platform;
+import org.apache.spark.unsafe.array.ByteArrayMethods;
+import org.apache.spark.unsafe.hash.Murmur3_x86_32;
+
+import static org.apache.spark.unsafe.Platform.*;
+
+
+/**
+ * A UTF-8 String for internal Spark use.
+ * <p>
+ * A String encoded in UTF-8 as an Array[Byte], which can be used for comparison and
+ * search; see http://en.wikipedia.org/wiki/UTF-8 for details.
+ * <p>
+ * Note: This is not designed for general use cases and should not be used outside SQL.
+ */
+public final class UTF8String implements Comparable<UTF8String>, Externalizable, KryoSerializable {
+
+ // Location of the UTF-8 data: `base` is the holding object (the no-arg constructor sets it to
+ // null), `offset` is the position passed to the Platform accessors, and `numBytes` is the
+ // length of the data in bytes.
+ // These are only updated by readExternal() or read()
+ @Nonnull
+ private Object base;
+ private long offset;
+ private int numBytes;
+
+ // Accessors for the (base, offset) pair that locates the bytes of this string.
+ public Object getBaseObject() { return base; }
+ public long getBaseOffset() { return offset; }
+
+ // Width in bytes of an encoded code point, indexed by `(firstByte & 0xFF) - 192`. The 62
+ // entries therefore cover first bytes 0xC0 through 0xFD; numBytesForFirstByte() treats first
+ // bytes below 0xC0 as one byte wide.
+ private static int[] bytesOfCodePointInUTF8 = {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 4, 4, 4, 4, 4, 4, 4, 4,
+ 5, 5, 5, 5,
+ 6, 6};
+
+ // Cached native byte order; getPrefix() uses it to decide whether fetched words need reversing.
+ private static boolean isLittleEndian = ByteOrder.nativeOrder() == ByteOrder.LITTLE_ENDIAN;
+
+ // Shared constants: "," is used by findInSet(), "" is returned for empty results.
+ private static final UTF8String COMMA_UTF8 = UTF8String.fromString(",");
+ public static final UTF8String EMPTY_UTF8 = UTF8String.fromString("");
+
+  /**
+   * Wraps a byte array, which should be encoded in UTF-8, in an UTF8String.
+   *
+   * Note: `bytes` is not copied; the returned UTF8String holds on to it.
+   *
+   * @param bytes the UTF-8 encoded content, may be null
+   * @return an UTF8String covering the whole array, or null if `bytes` is null
+   */
+  public static UTF8String fromBytes(byte[] bytes) {
+    return (bytes == null) ? null : new UTF8String(bytes, BYTE_ARRAY_OFFSET, bytes.length);
+  }
+
+  /**
+   * Wraps a slice of a byte array, which should be encoded in UTF-8, in an UTF8String.
+   *
+   * Note: `bytes` is not copied; the returned UTF8String holds on to it.
+   *
+   * @param bytes the array containing the UTF-8 encoded content, may be null
+   * @param offset index in `bytes` of the first byte of the string
+   * @param numBytes length of the string in bytes
+   * @return an UTF8String covering the slice, or null if `bytes` is null
+   */
+  public static UTF8String fromBytes(byte[] bytes, int offset, int numBytes) {
+    if (bytes == null) {
+      return null;
+    }
+    return new UTF8String(bytes, BYTE_ARRAY_OFFSET + offset, numBytes);
+  }
+
+ /**
+ * Creates an UTF8String from given address (base and offset) and length.
+ * The three values are stored as given; nothing is copied.
+ *
+ * @param base the object holding the bytes
+ * @param offset the offset of the first byte, as passed to Platform accessors
+ * @param numBytes the length of the string in bytes
+ */
+ public static UTF8String fromAddress(Object base, long offset, int numBytes) {
+ return new UTF8String(base, offset, numBytes);
+ }
+
+  /**
+   * Creates an UTF8String from String.
+   *
+   * @param str the string to encode as UTF-8, may be null
+   * @return the encoded UTF8String, or null if `str` is null
+   */
+  public static UTF8String fromString(String str) {
+    if (str == null) {
+      return null;
+    }
+    // Using the Charset constant avoids the by-name charset lookup and the checked
+    // UnsupportedEncodingException, which can never be thrown for UTF-8.
+    return fromBytes(str.getBytes(StandardCharsets.UTF_8));
+  }
+
+  /**
+   * Creates an UTF8String that contains `length` spaces.
+   *
+   * @param length the number of space characters in the result
+   */
+  public static UTF8String blankString(int length) {
+    final byte space = (byte) ' ';
+    final byte[] blanks = new byte[length];
+    Arrays.fill(blanks, space);
+    // `blanks` is never null here, so build the string directly.
+    return new UTF8String(blanks, BYTE_ARRAY_OFFSET, blanks.length);
+  }
+
+ // Stores the location (base, offset) and byte length as given, without copying anything.
+ protected UTF8String(Object base, long offset, int numBytes) {
+ this.base = base;
+ this.offset = offset;
+ this.numBytes = numBytes;
+ }
+
+ // for serialization
+ // Starts with a null base and zero length; readExternal() or read() fill in the fields.
+ public UTF8String() {
+ this(null, 0, 0);
+ }
+
+ /**
+ * Writes the content of this string into a memory address, identified by an object and an offset.
+ * The target memory address must already have been allocated, and have enough space to hold all
+ * the bytes in this string.
+ *
+ * @param target the object to copy into
+ * @param targetOffset the offset within `target` at which the first byte is written
+ */
+ public void writeToMemory(Object target, long targetOffset) {
+ Platform.copyMemory(base, offset, target, targetOffset, numBytes);
+ }
+
+  /**
+   * Copies the bytes of this string into the backing array of `buffer` at the buffer's current
+   * position, then advances the position by the number of bytes written.
+   *
+   * @param buffer the destination; asserted to be backed by an accessible array
+   */
+  public void writeTo(ByteBuffer buffer) {
+    assert(buffer.hasArray());
+    final byte[] backing = buffer.array();
+    final int arrayStart = buffer.arrayOffset();
+    final int position = buffer.position();
+    Platform.copyMemory(
+      base, offset, backing, Platform.BYTE_ARRAY_OFFSET + arrayStart + position, numBytes);
+    buffer.position(position + numBytes);
+  }
+
+  /**
+   * Returns the number of bytes for a code point with the first byte as `b`.
+   * Bytes that are not covered by the lookup table are reported as one byte wide.
+   *
+   * @param b The first byte of a code point
+   * @return the width in bytes of the code point starting with `b`, always at least 1
+   */
+  private static int numBytesForFirstByte(final byte b) {
+    final int offset = (b & 0xFF) - 192;
+    // 0xFE and 0xFF never occur in UTF-8 and lie past the end of the lookup table. Treat them
+    // as one byte wide, like other non-lead bytes, instead of failing with an
+    // ArrayIndexOutOfBoundsException on malformed input.
+    if (offset < 0 || offset >= bytesOfCodePointInUTF8.length) {
+      return 1;
+    }
+    return bytesOfCodePointInUTF8[offset];
+  }
+
+ /**
+ * Returns the length of this string in bytes (see numChars() for the length in code points).
+ */
+ public int numBytes() {
+ return numBytes;
+ }
+
+  /**
+   * Returns the number of code points in it, found by hopping from one first byte to the next.
+   */
+  public int numChars() {
+    int count = 0;
+    int pos = 0;
+    while (pos < numBytes) {
+      count++;
+      pos += numBytesForFirstByte(getByte(pos));
+    }
+    return count;
+  }
+
+ /**
+ * Returns a 64-bit integer that can be used as the prefix used in sorting.
+ * <p>
+ * Up to the first 8 bytes of the string are packed into the result with the first byte in the
+ * most significant position; the bits that lie beyond `numBytes` are cleared to zero.
+ */
+ public long getPrefix() {
+ // Since JVMs are either 4-byte aligned or 8-byte aligned, we check the size of the string.
+ // If size is 0, just return 0.
+ // If size is between 1 and 4 (inclusive), assume data is 4-byte aligned under the hood and
+ // use a getInt to fetch the prefix.
+ // If size is greater than 4, assume we have at least 8 bytes of data to fetch.
+ // After getting the data, we use a mask to mask out data that is not part of the string.
+ long p;
+ // `mask` marks the low-order bits that do not belong to the string; 0 keeps everything.
+ long mask = 0;
+ if (isLittleEndian) {
+ if (numBytes >= 8) {
+ p = Platform.getLong(base, offset);
+ } else if (numBytes > 4) {
+ p = Platform.getLong(base, offset);
+ mask = (1L << (8 - numBytes) * 8) - 1;
+ } else if (numBytes > 0) {
+ p = (long) Platform.getInt(base, offset);
+ mask = (1L << (8 - numBytes) * 8) - 1;
+ } else {
+ p = 0;
+ }
+ // Reverse so that the first byte in memory becomes the most significant byte.
+ p = java.lang.Long.reverseBytes(p);
+ } else {
+ // byteOrder == ByteOrder.BIG_ENDIAN
+ if (numBytes >= 8) {
+ p = Platform.getLong(base, offset);
+ } else if (numBytes > 4) {
+ p = Platform.getLong(base, offset);
+ mask = (1L << (8 - numBytes) * 8) - 1;
+ } else if (numBytes > 0) {
+ // Move the four fetched bytes into the upper half of the long.
+ p = ((long) Platform.getInt(base, offset)) << 32;
+ mask = (1L << (8 - numBytes) * 8) - 1;
+ } else {
+ p = 0;
+ }
+ }
+ // Clear everything that was fetched from beyond the end of the string.
+ p &= ~mask;
+ return p;
+ }
+
+  /**
+   * Returns the underlying bytes. The backing array itself is returned when this string spans
+   * exactly one whole `byte[]`; otherwise the bytes are copied into a new array.
+   */
+  public byte[] getBytes() {
+    if (offset == BYTE_ARRAY_OFFSET && base instanceof byte[]) {
+      final byte[] backing = (byte[]) base;
+      if (backing.length == numBytes) {
+        // The string covers the whole array, so no copy is needed.
+        return backing;
+      }
+    }
+    final byte[] copy = new byte[numBytes];
+    copyMemory(base, offset, copy, BYTE_ARRAY_OFFSET, numBytes);
+    return copy;
+  }
+
+  /**
+   * Returns a substring of this, addressed in code points.
+   * @param start the position of first code point
+   * @param until the position after last code point, exclusive.
+   * @return a copy of the selected code points, or the empty string if the range selects none
+   */
+  public UTF8String substring(final int start, final int until) {
+    if (until <= start || start >= numBytes) {
+      return EMPTY_UTF8;
+    }
+
+    int bytePos = 0;
+    int charPos = 0;
+    // Advance to the byte at which code point `start` begins.
+    for (; bytePos < numBytes && charPos < start; charPos++) {
+      bytePos += numBytesForFirstByte(getByte(bytePos));
+    }
+
+    final int from = bytePos;
+    // Keep advancing until code point `until` (or the end of the string) is reached.
+    for (; bytePos < numBytes && charPos < until; charPos++) {
+      bytePos += numBytesForFirstByte(getByte(bytePos));
+    }
+
+    final int size = bytePos - from;
+    if (size <= 0) {
+      return EMPTY_UTF8;
+    }
+    final byte[] bytes = new byte[size];
+    copyMemory(base, offset + from, bytes, BYTE_ARRAY_OFFSET, size);
+    return fromBytes(bytes);
+  }
+
+  /**
+   * Returns the substring selected by SQL-style SUBSTR arguments.
+   *
+   * @param pos one-based start position; zero means the first element and a negative value
+   *            counts back from the end of the string
+   * @param length the number of code points to take; Integer.MAX_VALUE means "to the end"
+   */
+  public UTF8String substringSQL(int pos, int length) {
+    // Information regarding the pos calculation:
+    // Hive and SQL use one-based indexing for SUBSTR arguments but also accept zero and
+    // negative indices for start positions. If a start index i is greater than 0, it
+    // refers to element i-1 in the sequence. If a start index i is less than 0, it refers
+    // to the -ith element before the end of the sequence. If a start index i is 0, it
+    // refers to the first element.
+    int len = numChars();
+    // `len + pos` cannot overflow: `len >= 0` and `pos < 0` in that branch.
+    int start = (pos > 0) ? pos - 1 : ((pos < 0) ? len + pos : 0);
+    int end;
+    if (length == Integer.MAX_VALUE) {
+      end = len;
+    } else {
+      // Add in 64 bits and clamp: a plain int `start + length` wraps around for large
+      // lengths and would turn a "rest of the string" request into an empty result.
+      long wideEnd = (long) start + length;
+      if (wideEnd > Integer.MAX_VALUE) {
+        end = Integer.MAX_VALUE;
+      } else if (wideEnd < Integer.MIN_VALUE) {
+        end = Integer.MIN_VALUE;
+      } else {
+        end = (int) wideEnd;
+      }
+    }
+    return substring(start, end);
+  }
+
+  /**
+   * Returns whether this contains `substring` or not. The empty string is contained in
+   * every string.
+   */
+  public boolean contains(final UTF8String substring) {
+    final int needleLen = substring.numBytes;
+    if (needleLen == 0) {
+      return true;
+    }
+
+    final byte lead = substring.getByte(0);
+    final int lastStart = numBytes - needleLen;
+    int pos = 0;
+    while (pos <= lastStart) {
+      // Compare the first byte before paying for the full comparison.
+      if (getByte(pos) == lead && matchAt(substring, pos)) {
+        return true;
+      }
+      pos++;
+    }
+    return false;
+  }
+
+ /**
+ * Returns the byte at position `i`, counted in bytes from the start of this string.
+ * No range check is performed here.
+ */
+ private byte getByte(int i) {
+ return Platform.getByte(base, offset + i);
+ }
+
+ private boolean matchAt(final UTF8String s, int pos) {
+ if (s.numBytes + pos > numBytes || pos < 0) {
+ return false;
+ }
+ return ByteArrayMethods.arrayEquals(base, offset + pos, s.base, s.offset, s.numBytes);
+ }
+
+ // Returns whether the bytes of `prefix` match the beginning of this string.
+ public boolean startsWith(final UTF8String prefix) {
+ return matchAt(prefix, 0);
+ }
+
+ // Returns whether the bytes of `suffix` match the end of this string. When `suffix` is longer
+ // than this string the position is negative and matchAt() reports false.
+ public boolean endsWith(final UTF8String suffix) {
+ return matchAt(suffix, numBytes - suffix.numBytes);
+ }
+
+  /**
+   * Returns the upper case of this string.
+   * <p>
+   * Strings made only of single-byte characters are converted byte by byte; as soon as a
+   * multi-byte character or a non-ASCII result is seen, the String-based slow path is used.
+   */
+  public UTF8String toUpperCase() {
+    if (numBytes == 0) {
+      return EMPTY_UTF8;
+    }
+
+    // Every slot is written by the loop below, so no element needs pre-initializing.
+    byte[] bytes = new byte[numBytes];
+    for (int i = 0; i < numBytes; i++) {
+      byte b = getByte(i);
+      if (numBytesForFirstByte(b) != 1) {
+        // fallback
+        return toUpperCaseSlow();
+      }
+      int upper = Character.toUpperCase((int) b);
+      if (upper > 127) {
+        // fallback
+        return toUpperCaseSlow();
+      }
+      bytes[i] = (byte) upper;
+    }
+    return fromBytes(bytes);
+  }
+
+ // Slow path: decode to a String, upper-case it with String.toUpperCase() (the no-argument
+ // overload), and encode the result again.
+ private UTF8String toUpperCaseSlow() {
+ return fromString(toString().toUpperCase());
+ }
+
+  /**
+   * Returns the lower case of this string.
+   * <p>
+   * Strings made only of single-byte characters are converted byte by byte; as soon as a
+   * multi-byte character or a non-ASCII result is seen, the String-based slow path is used.
+   */
+  public UTF8String toLowerCase() {
+    if (numBytes == 0) {
+      return EMPTY_UTF8;
+    }
+
+    // Every slot is written by the loop below, so no element needs pre-initializing.
+    byte[] bytes = new byte[numBytes];
+    for (int i = 0; i < numBytes; i++) {
+      byte b = getByte(i);
+      if (numBytesForFirstByte(b) != 1) {
+        // fallback
+        return toLowerCaseSlow();
+      }
+      int lower = Character.toLowerCase((int) b);
+      if (lower > 127) {
+        // fallback
+        return toLowerCaseSlow();
+      }
+      bytes[i] = (byte) lower;
+    }
+    return fromBytes(bytes);
+  }
+
+ // Slow path: decode to a String, lower-case it with String.toLowerCase() (the no-argument
+ // overload), and encode the result again.
+ private UTF8String toLowerCaseSlow() {
+ return fromString(toString().toLowerCase());
+ }
+
+  /**
+   * Returns the title case of this string, that could be used as title: the first byte and
+   * every byte following a space are title-cased, all other bytes are copied unchanged.
+   * Falls back to the String-based slow path when a word starts with a multi-byte character
+   * or title-casing yields a non-ASCII value.
+   */
+  public UTF8String toTitleCase() {
+    if (numBytes == 0) {
+      return EMPTY_UTF8;
+    }
+
+    final byte[] out = new byte[numBytes];
+    for (int i = 0; i < numBytes; i++) {
+      final byte cur = getByte(i);
+      final boolean wordStart = (i == 0) || getByte(i - 1) == ' ';
+      if (!wordStart) {
+        out[i] = cur;
+        continue;
+      }
+      if (numBytesForFirstByte(cur) != 1) {
+        // fallback
+        return toTitleCaseSlow();
+      }
+      final int titled = Character.toTitleCase(cur);
+      if (titled > 127) {
+        // fallback
+        return toTitleCaseSlow();
+      }
+      out[i] = (byte) titled;
+    }
+    return fromBytes(out);
+  }
+
+ private UTF8String toTitleCaseSlow() {
+ StringBuffer sb = new StringBuffer();
+ String s = toString();
+ sb.append(s);
+ sb.setCharAt(0, Character.toTitleCase(sb.charAt(0)));
+ for (int i = 1; i < s.length(); i++) {
+ if (sb.charAt(i - 1) == ' ') {
+ sb.setCharAt(i, Character.toTitleCase(sb.charAt(i)));
+ }
+ }
+ return fromString(sb.toString());
+ }
+
+  /**
+   * Returns the 1-based index of `match` among the comma separated items of this string.
+   * Returns 0 if `match` contains a comma or if no item is equal to `match`.
+   */
+  public int findInSet(UTF8String match) {
+    if (match.contains(COMMA_UTF8)) {
+      return 0;
+    }
+
+    int index = 1;      // 1-based number of the item currently being scanned
+    int itemStart = 0;  // byte position just after the previous comma
+    for (int i = 0; i < numBytes; i++) {
+      if (getByte(i) != (byte) ',') {
+        continue;
+      }
+      // The item [itemStart, i) just ended; compare it if the length fits.
+      if (i - itemStart == match.numBytes &&
+        ByteArrayMethods.arrayEquals(base, offset + itemStart, match.base, match.offset,
+          match.numBytes)) {
+        return index;
+      }
+      itemStart = i + 1;
+      index++;
+    }
+    // The last item is terminated by the end of the string instead of a comma.
+    final boolean lastMatches = numBytes - itemStart == match.numBytes &&
+      ByteArrayMethods.arrayEquals(base, offset + itemStart, match.base, match.offset,
+        match.numBytes);
+    return lastMatches ? index : 0;
+  }
+
+ /**
+ * Copy the bytes from the current UTF8String, and make a new UTF8String.
+ * @param start the start position of the current UTF8String in bytes.
+ * @param end the end position of the current UTF8String in bytes.
+ * @return a new UTF8String in the position of [start, end] of current UTF8String bytes.
+ */
+ private UTF8String copyUTF8String(int start, int end) {
+ int len = end - start + 1;
+ byte[] newBytes = new byte[len];
+ copyMemory(base, offset + start, newBytes, BYTE_ARRAY_OFFSET, len);
+ return UTF8String.fromBytes(newBytes);
+ }
+
+  /**
+   * Returns a copy of this string with leading and trailing bytes in the range 0x00..0x20
+   * (space and the control characters below it) removed.
+   */
+  public UTF8String trim() {
+    // first byte from the left that is outside 0x00..0x20
+    int first = 0;
+    while (first < this.numBytes) {
+      final byte b = getByte(first);
+      if (b > 0x20 || b < 0x00) {
+        break;
+      }
+      first++;
+    }
+    // first byte from the right that is outside 0x00..0x20
+    int last = this.numBytes - 1;
+    while (last >= 0) {
+      final byte b = getByte(last);
+      if (b > 0x20 || b < 0x00) {
+        break;
+      }
+      last--;
+    }
+    // first > last means nothing is left after trimming
+    return (first > last) ? UTF8String.fromBytes(new byte[0]) : copyUTF8String(first, last);
+  }
+
+  /**
+   * Returns a copy of this string with leading bytes in the range 0x00..0x20 removed.
+   */
+  public UTF8String trimLeft() {
+    int first = 0;
+    while (first < this.numBytes) {
+      final byte b = getByte(first);
+      if (b > 0x20 || b < 0x00) {
+        break;
+      }
+      first++;
+    }
+    if (first != this.numBytes) {
+      return copyUTF8String(first, this.numBytes - 1);
+    }
+    // every byte was trimmed away: empty string
+    return UTF8String.fromBytes(new byte[0]);
+  }
+
+  /**
+   * Returns a copy of this string with trailing bytes in the range 0x00..0x20 removed.
+   */
+  public UTF8String trimRight() {
+    int last = numBytes - 1;
+    while (last >= 0) {
+      final byte b = getByte(last);
+      if (b > 0x20 || b < 0x00) {
+        break;
+      }
+      last--;
+    }
+    if (last >= 0) {
+      return copyUTF8String(0, last);
+    }
+    // every byte was trimmed away: empty string
+    return UTF8String.fromBytes(new byte[0]);
+  }
+
+  /**
+   * Returns a new string with the code points of this string in reverse order. The bytes
+   * inside each code point keep their order.
+   */
+  public UTF8String reverse() {
+    final byte[] reversed = new byte[this.numBytes];
+
+    for (int pos = 0; pos < numBytes; ) {
+      final int width = numBytesForFirstByte(getByte(pos));
+      // A code point that starts `pos` bytes from the front ends `pos` bytes from the back.
+      final int dest = reversed.length - pos - width;
+      copyMemory(this.base, this.offset + pos, reversed, BYTE_ARRAY_OFFSET + dest, width);
+      pos += width;
+    }
+
+    return UTF8String.fromBytes(reversed);
+  }
+
+ /**
+ * Returns this string concatenated with itself `times` times, or the empty string when
+ * `times` is zero or negative.
+ */
+ public UTF8String repeat(int times) {
+ if (times <= 0) {
+ return EMPTY_UTF8;
+ }
+
+ byte[] newBytes = new byte[numBytes * times];
+ // Seed the result with one copy of this string.
+ copyMemory(this.base, this.offset, newBytes, BYTE_ARRAY_OFFSET, numBytes);
+
+ // Then copy the already-filled prefix onto the end of itself, which doubles the number of
+ // copies per step until fewer than `copied` copies remain to be written.
+ int copied = 1;
+ while (copied < times) {
+ int toCopy = Math.min(copied, times - copied);
+ System.arraycopy(newBytes, 0, newBytes, copied * numBytes, numBytes * toCopy);
+ copied += toCopy;
+ }
+
+ return UTF8String.fromBytes(newBytes);
+ }
+
+ /**
+ * Returns the position of the first occurrence of substr in
+ * current string from the specified position (0-based index).
+ * Both `start` and the returned position are counted in code points, not bytes.
+ *
+ * @param v the string to be searched
+ * @param start the start position of the current string for searching
+ * @return the position of the first occurrence of substr, if not found, -1 returned.
+ * An empty `v` yields 0 regardless of `start`.
+ */
+ public int indexOf(UTF8String v, int start) {
+ if (v.numBytes() == 0) {
+ return 0;
+ }
+
+ // locate to the start position.
+ int i = 0; // position in byte
+ int c = 0; // position in character
+ while (i < numBytes && c < start) {
+ i += numBytesForFirstByte(getByte(i));
+ c += 1;
+ }
+
+ do {
+ // Not enough bytes left for `v` to fit; this also covers a start at or past the end.
+ if (i + v.numBytes > numBytes) {
+ return -1;
+ }
+ if (ByteArrayMethods.arrayEquals(base, offset + i, v.base, v.offset, v.numBytes)) {
+ return c;
+ }
+ // Step to the next code point so that only code point boundaries are tried.
+ i += numBytesForFirstByte(getByte(i));
+ c += 1;
+ } while (i < numBytes);
+
+ return -1;
+ }
+
+ /**
+ * Find the `str` from left to right.
+ */
+ private int find(UTF8String str, int start) {
+ assert (str.numBytes > 0);
+ while (start <= numBytes - str.numBytes) {
+ if (ByteArrayMethods.arrayEquals(base, offset + start, str.base, str.offset, str.numBytes)) {
+ return start;
+ }
+ start += 1;
+ }
+ return -1;
+ }
+
+  /**
+   * Find the `str` from right to left, beginning at byte position `start`.
+   * @return the byte position of the first match at or before `start`, or -1 if there is none
+   */
+  private int rfind(UTF8String str, int start) {
+    assert (str.numBytes > 0);
+    for (int pos = start; pos >= 0; pos--) {
+      if (ByteArrayMethods.arrayEquals(base, offset + pos, str.base, str.offset, str.numBytes)) {
+        return pos;
+      }
+    }
+    return -1;
+  }
+
+ /**
+ * Returns the substring from string str before count occurrences of the delimiter delim.
+ * If count is positive, everything to the left of the final delimiter (counting from the left)
+ * is returned. If count is negative, everything to the right of the final delimiter (counting
+ * from the right) is returned. subStringIndex performs a case-sensitive match when searching
+ * for delim.
+ * <p>
+ * An empty delimiter or a count of zero yields the empty string; if the delimiter occurs fewer
+ * than |count| times, this string itself is returned.
+ */
+ public UTF8String subStringIndex(UTF8String delim, int count) {
+ if (delim.numBytes == 0 || count == 0) {
+ return EMPTY_UTF8;
+ }
+ if (count > 0) {
+ // Search left to right; after the loop `idx` is the byte position of the count-th delimiter.
+ int idx = -1;
+ while (count > 0) {
+ idx = find(delim, idx + 1);
+ if (idx >= 0) {
+ count --;
+ } else {
+ // can not find enough delim
+ return this;
+ }
+ }
+ if (idx == 0) {
+ return EMPTY_UTF8;
+ }
+ // Copy the bytes that precede the delimiter.
+ byte[] bytes = new byte[idx];
+ copyMemory(base, offset, bytes, BYTE_ARRAY_OFFSET, idx);
+ return fromBytes(bytes);
+
+ } else {
+ // Search right to left; the first probe is at the last position where `delim` still fits.
+ int idx = numBytes - delim.numBytes + 1;
+ count = -count;
+ while (count > 0) {
+ idx = rfind(delim, idx - 1);
+ if (idx >= 0) {
+ count --;
+ } else {
+ // can not find enough delim
+ return this;
+ }
+ }
+ // The delimiter sits at the very end, so nothing follows it.
+ if (idx + delim.numBytes == numBytes) {
+ return EMPTY_UTF8;
+ }
+ // Copy the bytes that follow the delimiter.
+ int size = numBytes - delim.numBytes - idx;
+ byte[] bytes = new byte[size];
+ copyMemory(base, offset + idx + delim.numBytes, bytes, BYTE_ARRAY_OFFSET, size);
+ return fromBytes(bytes);
+ }
+ }
+
+  /**
+   * Returns str, right-padded with pad to a length of len (counted in code points).
+   * When no padding is needed, or `pad` is empty, the first `len` code points are returned.
+   * For example:
+   * ('hi', 5, '??') => 'hi???'
+   * ('hi', 1, '??') => 'h'
+   */
+  public UTF8String rpad(int len, UTF8String pad) {
+    final int missing = len - this.numChars(); // number of char need to pad
+    if (missing <= 0 || pad.numBytes() == 0) {
+      // no padding at all, return the substring of the current string
+      return substring(0, len);
+    }
+
+    final int padChars = pad.numChars();
+    final int fullPads = missing / padChars; // how many whole padding strings are needed
+    // the partial string of the padding
+    final UTF8String tail = pad.substring(0, missing - padChars * fullPads);
+
+    final byte[] data = new byte[this.numBytes + pad.numBytes * fullPads + tail.numBytes];
+    copyMemory(this.base, this.offset, data, BYTE_ARRAY_OFFSET, this.numBytes);
+    int written = this.numBytes;
+    for (int k = 0; k < fullPads; k++) {
+      copyMemory(pad.base, pad.offset, data, BYTE_ARRAY_OFFSET + written, pad.numBytes);
+      written += pad.numBytes;
+    }
+    copyMemory(tail.base, tail.offset, data, BYTE_ARRAY_OFFSET + written, tail.numBytes);
+
+    return UTF8String.fromBytes(data);
+  }
+
+  /**
+   * Returns str, left-padded with pad to a length of len (counted in code points).
+   * When no padding is needed, or `pad` is empty, the first `len` code points are returned.
+   * For example:
+   * ('hi', 5, '??') => '???hi'
+   * ('hi', 1, '??') => 'h'
+   */
+  public UTF8String lpad(int len, UTF8String pad) {
+    final int missing = len - this.numChars(); // number of char need to pad
+    if (missing <= 0 || pad.numBytes() == 0) {
+      // no padding at all, return the substring of the current string
+      return substring(0, len);
+    }
+
+    final int padChars = pad.numChars();
+    final int fullPads = missing / padChars; // how many whole padding strings are needed
+    // the partial string of the padding
+    final UTF8String tail = pad.substring(0, missing - padChars * fullPads);
+
+    final byte[] data = new byte[this.numBytes + pad.numBytes * fullPads + tail.numBytes];
+
+    int written = 0;
+    for (int k = 0; k < fullPads; k++) {
+      copyMemory(pad.base, pad.offset, data, BYTE_ARRAY_OFFSET + written, pad.numBytes);
+      written += pad.numBytes;
+    }
+    copyMemory(tail.base, tail.offset, data, BYTE_ARRAY_OFFSET + written, tail.numBytes);
+    written += tail.numBytes;
+    copyMemory(this.base, this.offset, data, BYTE_ARRAY_OFFSET + written, numBytes());
+
+    return UTF8String.fromBytes(data);
+  }
+
+  /**
+   * Concatenates input strings together into a single string. Returns null if any input is null.
+   */
+  public static UTF8String concat(UTF8String... inputs) {
+    // First pass: bail out on a null input and add up the length of the result.
+    int totalLength = 0;
+    for (UTF8String input : inputs) {
+      if (input == null) {
+        return null;
+      }
+      totalLength += input.numBytes;
+    }
+
+    // Second pass: copy the inputs one after another into the new array.
+    final byte[] result = new byte[totalLength];
+    int written = 0;
+    for (UTF8String input : inputs) {
+      final int len = input.numBytes;
+      copyMemory(input.base, input.offset, result, BYTE_ARRAY_OFFSET + written, len);
+      written += len;
+    }
+    return fromBytes(result);
+  }
+
+  /**
+   * Concatenates input strings together into a single string using the separator.
+   * A null input is skipped. For example, concatWs(",", "a", null, "c") would yield "a,c".
+   * Returns null if the separator is null.
+   */
+  public static UTF8String concatWs(UTF8String separator, UTF8String... inputs) {
+    if (separator == null) {
+      return null;
+    }
+
+    int numInputBytes = 0; // total number of bytes from the inputs
+    int numInputs = 0; // number of non-null inputs
+    for (UTF8String input : inputs) {
+      if (input != null) {
+        numInputBytes += input.numBytes;
+        numInputs++;
+      }
+    }
+
+    if (numInputs == 0) {
+      // Return an empty string if there is no input, or all the inputs are null.
+      return fromBytes(new byte[0]);
+    }
+
+    // The result holds all non-null inputs plus one separator between each adjacent pair.
+    final byte[] result = new byte[numInputBytes + (numInputs - 1) * separator.numBytes];
+    int written = 0;
+    int emitted = 0; // number of non-null inputs copied so far
+    for (UTF8String input : inputs) {
+      if (input == null) {
+        continue;
+      }
+      final int len = input.numBytes;
+      copyMemory(input.base, input.offset, result, BYTE_ARRAY_OFFSET + written, len);
+      written += len;
+
+      emitted++;
+      // Add separator if this is not the last input.
+      if (emitted < numInputs) {
+        copyMemory(
+          separator.base, separator.offset,
+          result, BYTE_ARRAY_OFFSET + written,
+          separator.numBytes);
+        written += separator.numBytes;
+      }
+    }
+    return fromBytes(result);
+  }
+
+  /**
+   * Splits this string with String.split(), using `pattern` as the regular expression and
+   * `limit` as the limit, and returns the pieces as UTF8Strings.
+   */
+  public UTF8String[] split(UTF8String pattern, int limit) {
+    final String[] parts = toString().split(pattern.toString(), limit);
+    final UTF8String[] result = new UTF8String[parts.length];
+    int idx = 0;
+    for (String part : parts) {
+      result[idx++] = fromString(part);
+    }
+    return result;
+  }
+
+  // TODO: Need to use `Code Point` here instead of Char in case the character longer than 2 bytes
+  /**
+   * Replaces each char of this string according to `dict`: a char without a mapping is kept,
+   * a char mapped to '\0' is dropped, and any other char is replaced by its mapped value.
+   *
+   * @param dict the per-char replacement table
+   * @return the translated string
+   */
+  public UTF8String translate(Map<Character, Character> dict) {
+    String srcStr = this.toString();
+
+    StringBuilder sb = new StringBuilder();
+    for (int k = 0; k < srcStr.length(); k++) {
+      final char c = srcStr.charAt(k);
+      // Look the char up once instead of up to three times per iteration.
+      final Character mapped = dict.get(c);
+      if (mapped == null) {
+        sb.append(c);
+      } else if (mapped.charValue() != '\0') {
+        sb.append(mapped.charValue());
+      }
+    }
+    return fromString(sb.toString());
+  }
+
+  /**
+   * Decodes the bytes of this string as UTF-8 into a java.lang.String.
+   */
+  @Override
+  public String toString() {
+    // Using the Charset constant avoids the checked UnsupportedEncodingException and the
+    // unreachable placeholder return value that came with it.
+    return new String(getBytes(), StandardCharsets.UTF_8);
+  }
+
+ // Builds a new UTF8String around getBytes(). Note that getBytes() hands back the backing array
+ // itself when this string spans one whole byte[], so in that case the bytes are shared with
+ // the original rather than copied.
+ @Override
+ public UTF8String clone() {
+ return fromBytes(getBytes());
+ }
+
+  /**
+   * Compares two strings byte by byte, treating bytes as unsigned. If one string is a prefix
+   * of the other, the shorter one sorts first.
+   */
+  @Override
+  public int compareTo(@Nonnull final UTF8String other) {
+    final int common = Math.min(numBytes, other.numBytes);
+    // TODO: compare 8 bytes as unsigned long
+    int i = 0;
+    while (i < common) {
+      // In UTF-8, the byte should be unsigned, so we should compare them as unsigned int.
+      final int mine = getByte(i) & 0xFF;
+      final int theirs = other.getByte(i) & 0xFF;
+      if (mine != theirs) {
+        return mine - theirs;
+      }
+      i++;
+    }
+    return numBytes - other.numBytes;
+  }
+
+ // Same result as compareTo(other); this method only delegates.
+ public int compare(final UTF8String other) {
+ return compareTo(other);
+ }
+
+  /**
+   * Two UTF8Strings are equal when they have the same length and the same bytes; any other
+   * kind of object is never equal to an UTF8String.
+   */
+  @Override
+  public boolean equals(final Object other) {
+    if (!(other instanceof UTF8String)) {
+      return false;
+    }
+    final UTF8String that = (UTF8String) other;
+    return numBytes == that.numBytes
+      && ByteArrayMethods.arrayEquals(base, offset, that.base, that.offset, numBytes);
+  }
+
+ /**
+ * Levenshtein distance is a metric for measuring the distance of two strings. The distance is
+ * defined by the minimum number of single-character edits (i.e. insertions, deletions or
+ * substitutions) that are required to change one of the strings into the other.
+ * <p>
+ * The distance is computed over code points, using two rolling rows sized by the shorter
+ * of the two strings.
+ */
+ public int levenshteinDistance(UTF8String other) {
+ // Implementation adopted from org.apache.common.lang3.StringUtils.getLevenshteinDistance
+
+ int n = numChars();
+ int m = other.numChars();
+
+ // If either string is empty the distance is the length of the other one.
+ if (n == 0) {
+ return m;
+ } else if (m == 0) {
+ return n;
+ }
+
+ UTF8String s, t;
+
+ // Make `s` the shorter string (n code points) and `t` the longer one (m code points).
+ if (n <= m) {
+ s = this;
+ t = other;
+ } else {
+ s = other;
+ t = this;
+ int swap;
+ swap = n;
+ n = m;
+ m = swap;
+ }
+
+ // `p` is the previous row of costs and `d` the row being filled in.
+ int[] p = new int[n + 1];
+ int[] d = new int[n + 1];
+ int[] swap;
+
+ // i/j count code points in s/t; i_bytes/j_bytes are the matching byte positions.
+ int i, i_bytes, j, j_bytes, num_bytes_j, cost;
+
+ for (i = 0; i <= n; i++) {
+ p[i] = i;
+ }
+
+ for (j = 0, j_bytes = 0; j < m; j_bytes += num_bytes_j, j++) {
+ num_bytes_j = numBytesForFirstByte(t.getByte(j_bytes));
+ d[0] = j + 1;
+
+ for (i = 0, i_bytes = 0; i < n; i_bytes += numBytesForFirstByte(s.getByte(i_bytes)), i++) {
+ // Cheap rejection first: a different first byte or width means the code points differ.
+ if (s.getByte(i_bytes) != t.getByte(j_bytes) ||
+ num_bytes_j != numBytesForFirstByte(s.getByte(i_bytes))) {
+ cost = 1;
+ } else {
+ cost = (ByteArrayMethods.arrayEquals(t.base, t.offset + j_bytes, s.base,
+ s.offset + i_bytes, num_bytes_j)) ? 0 : 1;
+ }
+ // Minimum over the two neighbouring cells plus one and the diagonal cell plus `cost`.
+ d[i + 1] = Math.min(Math.min(d[i] + 1, p[i + 1] + 1), p[i] + cost);
+ }
+
+ // The row just computed becomes the previous row for the next code point of `t`.
+ swap = p;
+ p = d;
+ d = swap;
+ }
+
+ // After the final swap the last computed row is in `p`.
+ return p[n];
+ }
+
+ // Hashes the bytes of this string with Murmur3_x86_32, using 42 as the seed.
+ @Override
+ public int hashCode() {
+ return Murmur3_x86_32.hashUnsafeBytes(base, offset, numBytes, 42);
+ }
+
+ /**
+ * Soundex mapping table: one code per letter 'A'..'Z', indexed by `letter - 'A'`.
+ * In soundex() the code '0' is never emitted and the code '7' is skipped without updating
+ * the last seen code.
+ */
+ private static final byte[] US_ENGLISH_MAPPING = {'0', '1', '2', '3', '0', '1', '2', '7',
+ '0', '2', '2', '4', '5', '5', '0', '1', '2', '6', '2', '3', '0', '1', '7', '2', '0', '2'};
+
+ /**
+ * Encodes a string into a Soundex value. Soundex is an encoding used to relate similar names,
+ * but can also be used as a general purpose scheme to find word with similar phonemes.
+ * https://en.wikipedia.org/wiki/Soundex
+ * <p>
+ * The result is four bytes: the upper-cased first letter followed by up to three codes,
+ * padded with '0'. The empty string yields the empty string, and a string whose first byte
+ * is not an ASCII letter is returned unchanged.
+ */
+ public UTF8String soundex() {
+ if (numBytes == 0) {
+ return EMPTY_UTF8;
+ }
+
+ byte b = getByte(0);
+ if ('a' <= b && b <= 'z') {
+ // convert an ASCII lower case letter to upper case
+ b -= 32;
+ } else if (b < 'A' || 'Z' < b) {
+ // first character must be a letter
+ return this;
+ }
+ byte[] sx = {'0', '0', '0', '0'};
+ sx[0] = b;
+ int sxi = 1;
+ int idx = b - 'A';
+ // Remember the code of the previous letter so that a repeated code is emitted only once.
+ byte lastCode = US_ENGLISH_MAPPING[idx];
+
+ for (int i = 1; i < numBytes; i++) {
+ b = getByte(i);
+ if ('a' <= b && b <= 'z') {
+ b -= 32;
+ } else if (b < 'A' || 'Z' < b) {
+ // not a letter, skip it
+ lastCode = '0';
+ continue;
+ }
+ idx = b - 'A';
+ byte code = US_ENGLISH_MAPPING[idx];
+ if (code == '7') {
+ // ignore it
+ } else {
+ if (code != '0' && code != lastCode) {
+ sx[sxi++] = code;
+ // stop once all three code slots are filled
+ if (sxi > 3) break;
+ }
+ lastCode = code;
+ }
+ }
+ return UTF8String.fromBytes(sx);
+ }
+
+  /**
+   * Writes this string as a 4-byte length followed by its bytes.
+   *
+   * @throws IOException if the underlying stream fails
+   */
+  @Override
+  public void writeExternal(ObjectOutput out) throws IOException {
+    byte[] bytes = getBytes();
+    out.writeInt(bytes.length);
+    out.write(bytes);
+  }
+
+  /**
+   * Restores this string from the format produced by writeExternal(): a 4-byte length
+   * followed by that many bytes, which are read into a freshly allocated array.
+   *
+   * @throws IOException if the underlying stream fails or ends early
+   */
+  @Override
+  public void readExternal(ObjectInput in) throws IOException, ClassNotFoundException {
+    offset = BYTE_ARRAY_OFFSET;
+    numBytes = in.readInt();
+    base = new byte[numBytes];
+    in.readFully((byte[]) base);
+  }
+
+ // Kryo serialization: writes the byte length as an int, followed by the bytes themselves.
+ @Override
+ public void write(Kryo kryo, Output out) {
+ byte[] bytes = getBytes();
+ out.writeInt(bytes.length);
+ out.write(bytes);
+ }
+
+ // Kryo deserialization: reads the length written by write() and then the bytes into a freshly
+ // allocated array that becomes the new base.
+ // NOTE(review): unlike readExternal(), which uses readFully(), the return value of in.read()
+ // is ignored here - confirm that Input.read(byte[]) always fills the whole array.
+ @Override
+ public void read(Kryo kryo, Input in) {
+ this.offset = BYTE_ARRAY_OFFSET;
+ this.numBytes = in.readInt();
+ this.base = new byte[numBytes];
+ in.read((byte[]) base);
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/spark/blob/b0ee7d43/common/unsafe/src/test/java/org/apache/spark/unsafe/PlatformUtilSuite.java
----------------------------------------------------------------------
diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/PlatformUtilSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/PlatformUtilSuite.java
new file mode 100644
index 0000000..693ec6e
--- /dev/null
+++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/PlatformUtilSuite.java
@@ -0,0 +1,61 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.unsafe;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+/**
+ * Checks that Platform.copyMemory handles source and destination ranges that overlap.
+ */
+public class PlatformUtilSuite {
+
+  /** Sets every element of `data` to the low 8 bits of its own index. */
+  private static void fillWithIndex(byte[] data) {
+    for (int i = 0; i < data.length; ++i) {
+      data[i] = (byte)i;
+    }
+  }
+
+  @Test
+  public void overlappingCopyMemory() {
+    final int size = 2 * 1024 * 1024;
+    final byte[] data = new byte[3 * 1024 * 1024];
+    fillWithIndex(data);
+
+    // Source and destination are the same range: the contents must not change.
+    Platform.copyMemory(data, Platform.BYTE_ARRAY_OFFSET, data, Platform.BYTE_ARRAY_OFFSET, size);
+    for (int i = 0; i < data.length; ++i) {
+      Assert.assertEquals((byte)i, data[i]);
+    }
+
+    // Destination starts one byte before the source: every byte moves down by one.
+    Platform.copyMemory(
+      data, Platform.BYTE_ARRAY_OFFSET + 1, data, Platform.BYTE_ARRAY_OFFSET, size);
+    for (int i = 0; i < size; ++i) {
+      Assert.assertEquals((byte)(i + 1), data[i]);
+    }
+
+    // Destination starts one byte after the source: every byte moves up by one.
+    fillWithIndex(data);
+    Platform.copyMemory(
+      data, Platform.BYTE_ARRAY_OFFSET, data, Platform.BYTE_ARRAY_OFFSET + 1, size);
+    for (int i = 0; i < size; ++i) {
+      Assert.assertEquals((byte)i, data[i + 1]);
+    }
+  }
+}
http://git-wip-us.apache.org/repos/asf/spark/blob/b0ee7d43/common/unsafe/src/test/java/org/apache/spark/unsafe/array/LongArraySuite.java
----------------------------------------------------------------------
diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/array/LongArraySuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/array/LongArraySuite.java
new file mode 100644
index 0000000..fb8e53b
--- /dev/null
+++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/array/LongArraySuite.java
@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.unsafe.array;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+import org.apache.spark.unsafe.memory.MemoryBlock;
+
+public class LongArraySuite {
+ // Exercises set/get/size/zeroOut on a LongArray wrapping a two-element long[].
+ @Test
+ public void basicTest() {
+ long[] bytes = new long[2]; // despite the name, this array holds longs, not bytes
+ LongArray arr = new LongArray(MemoryBlock.fromLongArray(bytes));
+ arr.set(0, 1L);
+ arr.set(1, 2L);
+ arr.set(1, 3L); // second write to index 1; the asserts below expect this value to win
+ Assert.assertEquals(2, arr.size());
+ Assert.assertEquals(1L, arr.get(0));
+ Assert.assertEquals(3L, arr.get(1));
+ // After zeroOut() both slots are expected to read back as 0.
+ arr.zeroOut();
+ Assert.assertEquals(0L, arr.get(0));
+ Assert.assertEquals(0L, arr.get(1));
+ }
+}
http://git-wip-us.apache.org/repos/asf/spark/blob/b0ee7d43/common/unsafe/src/test/java/org/apache/spark/unsafe/hash/Murmur3_x86_32Suite.java
----------------------------------------------------------------------
diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/hash/Murmur3_x86_32Suite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/hash/Murmur3_x86_32Suite.java
new file mode 100644
index 0000000..e759cb3
--- /dev/null
+++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/hash/Murmur3_x86_32Suite.java
@@ -0,0 +1,120 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.unsafe.hash;
+
+import java.nio.charset.StandardCharsets;
+import java.util.HashSet;
+import java.util.Random;
+import java.util.Set;
+
+import org.apache.spark.unsafe.Platform;
+import org.junit.Assert;
+import org.junit.Test;
+
+/**
+ * Test file based on Guava's Murmur3Hash32Test.
+ */
+public class Murmur3_x86_32Suite {
+
+ private static final Murmur3_x86_32 hasher = new Murmur3_x86_32(0);
+
+ @Test
+ public void testKnownIntegerInputs() {
+ Assert.assertEquals(593689054, hasher.hashInt(0));
+ Assert.assertEquals(-189366624, hasher.hashInt(-42));
+ Assert.assertEquals(-1134849565, hasher.hashInt(42));
+ Assert.assertEquals(-1718298732, hasher.hashInt(Integer.MIN_VALUE));
+ Assert.assertEquals(-1653689534, hasher.hashInt(Integer.MAX_VALUE));
+ }
+
+ @Test
+ public void testKnownLongInputs() {
+ Assert.assertEquals(1669671676, hasher.hashLong(0L));
+ Assert.assertEquals(-846261623, hasher.hashLong(-42L));
+ Assert.assertEquals(1871679806, hasher.hashLong(42L));
+ Assert.assertEquals(1366273829, hasher.hashLong(Long.MIN_VALUE));
+ Assert.assertEquals(-2106506049, hasher.hashLong(Long.MAX_VALUE));
+ }
+
+ @Test
+ public void randomizedStressTest() {
+ int size = 65536;
+ Random rand = new Random(); // no fixed seed, so the inputs differ from run to run
+ // Each iteration checks that hashing the same value twice gives the same result.
+ // A set used to track collision rate.
+ Set<Integer> hashcodes = new HashSet<>();
+ for (int i = 0; i < size; i++) {
+ int vint = rand.nextInt();
+ long lint = rand.nextLong();
+ Assert.assertEquals(hasher.hashInt(vint), hasher.hashInt(vint));
+ Assert.assertEquals(hasher.hashLong(lint), hasher.hashLong(lint));
+ // Only the long hashes are collected for the distinctness check below.
+ hashcodes.add(hasher.hashLong(lint));
+ }
+ // The set must end up with more than 95% of `size` distinct hash codes.
+ // A very loose bound.
+ Assert.assertTrue(hashcodes.size() > size * 0.95);
+ }
+
+ @Test
+ public void randomizedStressTestBytes() {
+ int size = 65536;
+ Random rand = new Random();
+
+ // A set used to track collision rate.
+ Set<Integer> hashcodes = new HashSet<>();
+ for (int i = 0; i < size; i++) {
+ int byteArrSize = rand.nextInt(100) * 8;
+ byte[] bytes = new byte[byteArrSize];
+ rand.nextBytes(bytes);
+
+ Assert.assertEquals(
+ hasher.hashUnsafeWords(bytes, Platform.BYTE_ARRAY_OFFSET, byteArrSize),
+ hasher.hashUnsafeWords(bytes, Platform.BYTE_ARRAY_OFFSET, byteArrSize));
+
+ hashcodes.add(hasher.hashUnsafeWords(
+ bytes, Platform.BYTE_ARRAY_OFFSET, byteArrSize));
+ }
+
+ // A very loose bound.
+ Assert.assertTrue(hashcodes.size() > size * 0.95);
+ }
+
+ @Test
+ public void randomizedStressTestPaddedStrings() {
+ int size = 64000;
+ // A set used to track collision rate.
+ Set<Integer> hashcodes = new HashSet<>();
+ for (int i = 0; i < size; i++) {
+ int byteArrSize = 8;
+ byte[] strBytes = String.valueOf(i).getBytes(StandardCharsets.UTF_8);
+ byte[] paddedBytes = new byte[byteArrSize];
+ System.arraycopy(strBytes, 0, paddedBytes, 0, strBytes.length);
+
+ Assert.assertEquals(
+ hasher.hashUnsafeWords(paddedBytes, Platform.BYTE_ARRAY_OFFSET, byteArrSize),
+ hasher.hashUnsafeWords(paddedBytes, Platform.BYTE_ARRAY_OFFSET, byteArrSize));
+
+ hashcodes.add(hasher.hashUnsafeWords(
+ paddedBytes, Platform.BYTE_ARRAY_OFFSET, byteArrSize));
+ }
+
+ // A very loose bound.
+ Assert.assertTrue(hashcodes.size() > size * 0.95);
+ }
+}
http://git-wip-us.apache.org/repos/asf/spark/blob/b0ee7d43/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CalendarIntervalSuite.java
----------------------------------------------------------------------
diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CalendarIntervalSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CalendarIntervalSuite.java
new file mode 100644
index 0000000..9e69e26
--- /dev/null
+++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CalendarIntervalSuite.java
@@ -0,0 +1,240 @@
+/*
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements. See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+package org.apache.spark.unsafe.types;
+
+import org.junit.Test;
+
+import static org.junit.Assert.*;
+import static org.apache.spark.unsafe.types.CalendarInterval.*;
+
+public class CalendarIntervalSuite {
+
+ @Test
+ public void equalsTest() {
+ CalendarInterval i1 = new CalendarInterval(3, 123);
+ CalendarInterval i2 = new CalendarInterval(3, 321);
+ CalendarInterval i3 = new CalendarInterval(1, 123);
+ CalendarInterval i4 = new CalendarInterval(3, 123);
+ // Use assertNotEquals so equals() is exercised; a reference check passes for any two objects.
+ assertNotEquals(i1, i2);
+ assertNotEquals(i1, i3);
+ assertNotEquals(i2, i3);
+ assertEquals(i1, i4);
+ }
+
+ @Test
+ public void toStringTest() {
+ CalendarInterval i;
+
+ i = new CalendarInterval(34, 0);
+ assertEquals("interval 2 years 10 months", i.toString());
+
+ i = new CalendarInterval(-34, 0);
+ assertEquals("interval -2 years -10 months", i.toString());
+
+ i = new CalendarInterval(0, 3 * MICROS_PER_WEEK + 13 * MICROS_PER_HOUR + 123);
+ assertEquals("interval 3 weeks 13 hours 123 microseconds", i.toString());
+
+ i = new CalendarInterval(0, -3 * MICROS_PER_WEEK - 13 * MICROS_PER_HOUR - 123);
+ assertEquals("interval -3 weeks -13 hours -123 microseconds", i.toString());
+
+ i = new CalendarInterval(34, 3 * MICROS_PER_WEEK + 13 * MICROS_PER_HOUR + 123);
+ assertEquals("interval 2 years 10 months 3 weeks 13 hours 123 microseconds", i.toString());
+ }
+
+ @Test
+ public void fromStringTest() {
+ testSingleUnit("year", 3, 36, 0);
+ testSingleUnit("month", 3, 3, 0);
+ testSingleUnit("week", 3, 0, 3 * MICROS_PER_WEEK);
+ testSingleUnit("day", 3, 0, 3 * MICROS_PER_DAY);
+ testSingleUnit("hour", 3, 0, 3 * MICROS_PER_HOUR);
+ testSingleUnit("minute", 3, 0, 3 * MICROS_PER_MINUTE);
+ testSingleUnit("second", 3, 0, 3 * MICROS_PER_SECOND);
+ testSingleUnit("millisecond", 3, 0, 3 * MICROS_PER_MILLI);
+ testSingleUnit("microsecond", 3, 0, 3);
+
+ String input;
+
+ input = "interval -5 years 23 month";
+ CalendarInterval result = new CalendarInterval(-5 * 12 + 23, 0);
+ assertEquals(fromString(input), result);
+
+ input = "interval -5 years 23 month ";
+ assertEquals(fromString(input), result);
+
+ input = " interval -5 years 23 month ";
+ assertEquals(fromString(input), result);
+
+ // Error cases
+ input = "interval 3month 1 hour";
+ assertNull(fromString(input));
+
+ input = "interval 3 moth 1 hour";
+ assertNull(fromString(input));
+
+ input = "interval";
+ assertNull(fromString(input));
+
+ input = "int";
+ assertNull(fromString(input));
+
+ input = "";
+ assertNull(fromString(input));
+
+ input = null;
+ assertNull(fromString(input));
+ }
+
+ @Test
+ public void fromYearMonthStringTest() {
+ String input;
+ CalendarInterval i;
+
+ input = "99-10";
+ i = new CalendarInterval(99 * 12 + 10, 0L);
+ assertEquals(fromYearMonthString(input), i);
+
+ input = "-8-10";
+ i = new CalendarInterval(-8 * 12 - 10, 0L);
+ assertEquals(fromYearMonthString(input), i);
+
+ try {
+ input = "99-15";
+ fromYearMonthString(input);
+ fail("Expected to throw an exception for the invalid input");
+ } catch (IllegalArgumentException e) {
+ assertTrue(e.getMessage().contains("month 15 outside range"));
+ }
+ }
+
+ @Test
+ public void fromDayTimeStringTest() {
+ String input;
+ CalendarInterval i;
+
+ input = "5 12:40:30.999999999";
+ i = new CalendarInterval(0, 5 * MICROS_PER_DAY + 12 * MICROS_PER_HOUR +
+ 40 * MICROS_PER_MINUTE + 30 * MICROS_PER_SECOND + 999999L);
+ assertEquals(fromDayTimeString(input), i);
+ // NOTE(review): ".888" adds 0 micros below - fraction seems parsed as nanoseconds; confirm.
+ input = "10 0:12:0.888";
+ i = new CalendarInterval(0, 10 * MICROS_PER_DAY + 12 * MICROS_PER_MINUTE);
+ assertEquals(fromDayTimeString(input), i);
+
+ input = "-3 0:0:0";
+ i = new CalendarInterval(0, -3 * MICROS_PER_DAY);
+ assertEquals(fromDayTimeString(input), i);
+ // An hour value of 30 must be rejected with an IllegalArgumentException.
+ try {
+ input = "5 30:12:20";
+ fromDayTimeString(input);
+ fail("Expected to throw an exception for the invalid input");
+ } catch (IllegalArgumentException e) {
+ assertTrue(e.getMessage().contains("hour 30 outside range"));
+ }
+ // A string that is not in day-time form must be rejected as well.
+ try {
+ input = "5 30-12";
+ fromDayTimeString(input);
+ fail("Expected to throw an exception for the invalid input");
+ } catch (IllegalArgumentException e) {
+ assertTrue(e.getMessage().contains("not match day-time format"));
+ }
+ }
+
+ @Test
+ public void fromSingleUnitStringTest() {
+ String input;
+ CalendarInterval i;
+
+ input = "12";
+ i = new CalendarInterval(12 * 12, 0L);
+ assertEquals(fromSingleUnitString("year", input), i);
+
+ input = "100";
+ i = new CalendarInterval(0, 100 * MICROS_PER_DAY);
+ assertEquals(fromSingleUnitString("day", input), i);
+ // NOTE(review): ".38888" is expected to give 38 micros - fraction seems parsed as nanoseconds.
+ input = "1999.38888";
+ i = new CalendarInterval(0, 1999 * MICROS_PER_SECOND + 38);
+ assertEquals(fromSingleUnitString("second", input), i);
+ // Integer.MAX_VALUE years must be rejected with an "outside range" error.
+ try {
+ input = String.valueOf(Integer.MAX_VALUE);
+ fromSingleUnitString("year", input);
+ fail("Expected to throw an exception for the invalid input");
+ } catch (IllegalArgumentException e) {
+ assertTrue(e.getMessage().contains("outside range"));
+ }
+ // One hour more than Long.MAX_VALUE / MICROS_PER_HOUR must be rejected the same way.
+ try {
+ input = String.valueOf(Long.MAX_VALUE / MICROS_PER_HOUR + 1);
+ fromSingleUnitString("hour", input);
+ fail("Expected to throw an exception for the invalid input");
+ } catch (IllegalArgumentException e) {
+ assertTrue(e.getMessage().contains("outside range"));
+ }
+ }
+
+ @Test
+ public void addTest() {
+ String input = "interval 3 month 1 hour";
+ String input2 = "interval 2 month 100 hour";
+
+ CalendarInterval interval = fromString(input);
+ CalendarInterval interval2 = fromString(input2);
+
+ assertEquals(interval.add(interval2), new CalendarInterval(5, 101 * MICROS_PER_HOUR));
+
+ input = "interval -10 month -81 hour";
+ input2 = "interval 75 month 200 hour";
+
+ interval = fromString(input);
+ interval2 = fromString(input2);
+
+ assertEquals(interval.add(interval2), new CalendarInterval(65, 119 * MICROS_PER_HOUR));
+ }
+
+ @Test
+ public void subtractTest() {
+ String input = "interval 3 month 1 hour";
+ String input2 = "interval 2 month 100 hour";
+
+ CalendarInterval interval = fromString(input);
+ CalendarInterval interval2 = fromString(input2);
+
+ assertEquals(interval.subtract(interval2), new CalendarInterval(1, -99 * MICROS_PER_HOUR));
+
+ input = "interval -10 month -81 hour";
+ input2 = "interval 75 month 200 hour";
+
+ interval = fromString(input);
+ interval2 = fromString(input2);
+
+ assertEquals(interval.subtract(interval2), new CalendarInterval(-85, -281 * MICROS_PER_HOUR));
+ }
+
+ private static void testSingleUnit(String unit, int number, int months, long microseconds) {
+ String input1 = "interval " + number + " " + unit;
+ String input2 = "interval " + number + " " + unit + "s";
+ CalendarInterval result = new CalendarInterval(months, microseconds);
+ assertEquals(fromString(input1), result);
+ assertEquals(fromString(input2), result);
+ }
+}
http://git-wip-us.apache.org/repos/asf/spark/blob/b0ee7d43/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
----------------------------------------------------------------------
diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
new file mode 100644
index 0000000..bef5d71
--- /dev/null
+++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
@@ -0,0 +1,492 @@
+/*
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements. See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+package org.apache.spark.unsafe.types;
+
+import java.io.UnsupportedEncodingException;
+import java.util.Arrays;
+import java.util.HashMap;
+
+import com.google.common.collect.ImmutableMap;
+import org.junit.Test;
+
+import static org.junit.Assert.*;
+
+import static org.apache.spark.unsafe.types.UTF8String.*;
+
+public class UTF8StringSuite {
+
+ private static void checkBasic(String str, int len) throws UnsupportedEncodingException {
+ UTF8String s1 = fromString(str);
+ UTF8String s2 = fromBytes(str.getBytes(java.nio.charset.StandardCharsets.UTF_8));
+ assertEquals(len, s1.numChars());
+ assertEquals(len, s2.numChars());
+ // Both construction paths must round-trip to str and compare equal to each other.
+ assertEquals(str, s1.toString());
+ assertEquals(str, s2.toString());
+ assertEquals(s1, s2);
+ // Equal values must also agree on hashCode() and compare as 0.
+ assertEquals(s1.hashCode(), s2.hashCode());
+
+ assertEquals(0, s1.compareTo(s2));
+
+ assertTrue(s1.contains(s2));
+ assertTrue(s2.contains(s1));
+ assertTrue(s1.startsWith(s1));
+ assertTrue(s1.endsWith(s1));
+ }
+
+ @Test
+ public void basicTest() throws UnsupportedEncodingException {
+ checkBasic("", 0);
+ checkBasic("hello", 5);
+ checkBasic("大 千 世 界", 7);
+ }
+
+ @Test
+ public void emptyStringTest() {
+ assertEquals(EMPTY_UTF8, fromString(""));
+ assertEquals(EMPTY_UTF8, fromBytes(new byte[0]));
+ assertEquals(0, EMPTY_UTF8.numChars());
+ assertEquals(0, EMPTY_UTF8.numBytes());
+ }
+
+ @Test
+ public void prefix() {
+ assertTrue(fromString("a").getPrefix() - fromString("b").getPrefix() < 0);
+ assertTrue(fromString("ab").getPrefix() - fromString("b").getPrefix() < 0);
+ assertTrue(
+ fromString("abbbbbbbbbbbasdf").getPrefix() - fromString("bbbbbbbbbbbbasdf").getPrefix() < 0);
+ assertTrue(fromString("").getPrefix() - fromString("a").getPrefix() < 0);
+ assertTrue(fromString("你好").getPrefix() - fromString("世界").getPrefix() > 0);
+
+ byte[] buf1 = {1, 2, 3, 4, 5, 6, 7, 8, 9};
+ byte[] buf2 = {1, 2, 3};
+ UTF8String str1 = fromBytes(buf1, 0, 3);
+ UTF8String str2 = fromBytes(buf1, 0, 8);
+ UTF8String str3 = fromBytes(buf2);
+ assertTrue(str1.getPrefix() - str2.getPrefix() < 0);
+ assertEquals(str1.getPrefix(), str3.getPrefix());
+ }
+
+ @Test
+ public void compareTo() {
+ assertTrue(fromString("").compareTo(fromString("a")) < 0);
+ assertTrue(fromString("abc").compareTo(fromString("ABC")) > 0);
+ assertTrue(fromString("abc0").compareTo(fromString("abc")) > 0);
+ assertTrue(fromString("abcabcabc").compareTo(fromString("abcabcabc")) == 0);
+ assertTrue(fromString("aBcabcabc").compareTo(fromString("Abcabcabc")) > 0);
+ assertTrue(fromString("Abcabcabc").compareTo(fromString("abcabcabC")) < 0);
+ assertTrue(fromString("abcabcabc").compareTo(fromString("abcabcabC")) > 0);
+
+ assertTrue(fromString("abc").compareTo(fromString("世界")) < 0);
+ assertTrue(fromString("你好").compareTo(fromString("世界")) > 0);
+ assertTrue(fromString("你好123").compareTo(fromString("你好122")) > 0);
+ }
+
+ protected static void testUpperandLower(String upper, String lower) {
+ UTF8String us = fromString(upper);
+ UTF8String ls = fromString(lower);
+ assertEquals(ls, us.toLowerCase());
+ assertEquals(us, ls.toUpperCase());
+ assertEquals(us, us.toUpperCase());
+ assertEquals(ls, ls.toLowerCase());
+ }
+
+ @Test
+ public void upperAndLower() {
+ testUpperandLower("", "");
+ testUpperandLower("0123456", "0123456");
+ testUpperandLower("ABCXYZ", "abcxyz");
+ testUpperandLower("ЀЁЂѺΏỀ", "ѐёђѻώề");
+ testUpperandLower("大千世界 数据砖头", "大千世界 数据砖头");
+ }
+
+ @Test
+ public void titleCase() {
+ assertEquals(fromString(""), fromString("").toTitleCase());
+ assertEquals(fromString("Ab Bc Cd"), fromString("ab bc cd").toTitleCase());
+ assertEquals(fromString("Ѐ Ё Ђ Ѻ Ώ Ề"), fromString("ѐ ё ђ ѻ ώ ề").toTitleCase());
+ assertEquals(fromString("大千世界 数据砖头"), fromString("大千世界 数据砖头").toTitleCase());
+ }
+
+ @Test
+ public void concatTest() {
+ assertEquals(EMPTY_UTF8, concat());
+ assertNull(concat((UTF8String) null));
+ assertEquals(EMPTY_UTF8, concat(EMPTY_UTF8));
+ assertEquals(fromString("ab"), concat(fromString("ab")));
+ assertEquals(fromString("ab"), concat(fromString("a"), fromString("b")));
+ assertEquals(fromString("abc"), concat(fromString("a"), fromString("b"), fromString("c")));
+ assertNull(concat(fromString("a"), null, fromString("c")));
+ assertNull(concat(fromString("a"), null, null));
+ assertNull(concat(null, null, null));
+ assertEquals(fromString("数据砖头"), concat(fromString("数据"), fromString("砖头")));
+ }
+
+ @Test
+ public void concatWsTest() {
+ // Returns null if the separator is null
+ assertNull(concatWs(null, (UTF8String) null));
+ assertNull(concatWs(null, fromString("a")));
+
+ // If separator is not null, concatWs should skip all null inputs and never return null.
+ UTF8String sep = fromString("哈哈");
+ assertEquals(
+ EMPTY_UTF8,
+ concatWs(sep, EMPTY_UTF8));
+ assertEquals(
+ fromString("ab"),
+ concatWs(sep, fromString("ab")));
+ assertEquals(
+ fromString("a哈哈b"),
+ concatWs(sep, fromString("a"), fromString("b")));
+ assertEquals(
+ fromString("a哈哈b哈哈c"),
+ concatWs(sep, fromString("a"), fromString("b"), fromString("c")));
+ assertEquals(
+ fromString("a哈哈c"),
+ concatWs(sep, fromString("a"), null, fromString("c")));
+ assertEquals(
+ fromString("a"),
+ concatWs(sep, fromString("a"), null, null));
+ assertEquals(
+ EMPTY_UTF8,
+ concatWs(sep, null, null, null));
+ assertEquals(
+ fromString("数据哈哈砖头"),
+ concatWs(sep, fromString("数据"), fromString("砖头")));
+ }
+
+ @Test
+ public void contains() {
+ assertTrue(EMPTY_UTF8.contains(EMPTY_UTF8));
+ assertTrue(fromString("hello").contains(fromString("ello")));
+ assertFalse(fromString("hello").contains(fromString("vello")));
+ assertFalse(fromString("hello").contains(fromString("hellooo")));
+ assertTrue(fromString("大千世界").contains(fromString("千世界")));
+ assertFalse(fromString("大千世界").contains(fromString("世千")));
+ assertFalse(fromString("大千世界").contains(fromString("大千世界好")));
+ }
+
+ @Test
+ public void startsWith() {
+ assertTrue(EMPTY_UTF8.startsWith(EMPTY_UTF8));
+ assertTrue(fromString("hello").startsWith(fromString("hell")));
+ assertFalse(fromString("hello").startsWith(fromString("ell")));
+ assertFalse(fromString("hello").startsWith(fromString("hellooo")));
+ assertTrue(fromString("数据砖头").startsWith(fromString("数据")));
+ assertFalse(fromString("大千世界").startsWith(fromString("千")));
+ assertFalse(fromString("大千世界").startsWith(fromString("大千世界好")));
+ }
+
+ @Test
+ public void endsWith() {
+ assertTrue(EMPTY_UTF8.endsWith(EMPTY_UTF8));
+ assertTrue(fromString("hello").endsWith(fromString("ello")));
+ assertFalse(fromString("hello").endsWith(fromString("ellov")));
+ assertFalse(fromString("hello").endsWith(fromString("hhhello")));
+ assertTrue(fromString("大千世界").endsWith(fromString("世界")));
+ assertFalse(fromString("大千世界").endsWith(fromString("世")));
+ assertFalse(fromString("数据砖头").endsWith(fromString("我的数据砖头")));
+ }
+
+ @Test
+ public void substring() {
+ assertEquals(EMPTY_UTF8, fromString("hello").substring(0, 0));
+ assertEquals(fromString("el"), fromString("hello").substring(1, 3));
+ assertEquals(fromString("数"), fromString("数据砖头").substring(0, 1));
+ assertEquals(fromString("据砖"), fromString("数据砖头").substring(1, 3));
+ assertEquals(fromString("头"), fromString("数据砖头").substring(3, 5));
+ assertEquals(fromString("ߵ梷"), fromString("ߵ梷").substring(0, 2));
+ }
+
+ @Test
+ public void trims() {
+ assertEquals(fromString("hello"), fromString(" hello ").trim());
+ assertEquals(fromString("hello "), fromString(" hello ").trimLeft());
+ assertEquals(fromString(" hello"), fromString(" hello ").trimRight());
+
+ assertEquals(EMPTY_UTF8, fromString(" ").trim());
+ assertEquals(EMPTY_UTF8, fromString(" ").trimLeft());
+ assertEquals(EMPTY_UTF8, fromString(" ").trimRight());
+
+ assertEquals(fromString("数据砖头"), fromString(" 数据砖头 ").trim());
+ assertEquals(fromString("数据砖头 "), fromString(" 数据砖头 ").trimLeft());
+ assertEquals(fromString(" 数据砖头"), fromString(" 数据砖头 ").trimRight());
+
+ assertEquals(fromString("数据砖头"), fromString("数据砖头").trim());
+ assertEquals(fromString("数据砖头"), fromString("数据砖头").trimLeft());
+ assertEquals(fromString("数据砖头"), fromString("数据砖头").trimRight());
+ }
+
+ @Test
+ public void indexOf() {
+ assertEquals(0, EMPTY_UTF8.indexOf(EMPTY_UTF8, 0));
+ assertEquals(-1, EMPTY_UTF8.indexOf(fromString("l"), 0));
+ assertEquals(0, fromString("hello").indexOf(EMPTY_UTF8, 0));
+ assertEquals(2, fromString("hello").indexOf(fromString("l"), 0));
+ assertEquals(3, fromString("hello").indexOf(fromString("l"), 3));
+ assertEquals(-1, fromString("hello").indexOf(fromString("a"), 0));
+ assertEquals(2, fromString("hello").indexOf(fromString("ll"), 0));
+ assertEquals(-1, fromString("hello").indexOf(fromString("ll"), 4));
+ assertEquals(1, fromString("数据砖头").indexOf(fromString("据砖"), 0));
+ assertEquals(-1, fromString("数据砖头").indexOf(fromString("数"), 3));
+ assertEquals(0, fromString("数据砖头").indexOf(fromString("数"), 0));
+ assertEquals(3, fromString("数据砖头").indexOf(fromString("头"), 0));
+ }
+
+ @Test
+ public void substring_index() {
+ assertEquals(fromString("www.apache.org"),
+ fromString("www.apache.org").subStringIndex(fromString("."), 3));
+ assertEquals(fromString("www.apache"),
+ fromString("www.apache.org").subStringIndex(fromString("."), 2));
+ assertEquals(fromString("www"),
+ fromString("www.apache.org").subStringIndex(fromString("."), 1));
+ assertEquals(fromString(""),
+ fromString("www.apache.org").subStringIndex(fromString("."), 0));
+ assertEquals(fromString("org"),
+ fromString("www.apache.org").subStringIndex(fromString("."), -1));
+ assertEquals(fromString("apache.org"),
+ fromString("www.apache.org").subStringIndex(fromString("."), -2));
+ assertEquals(fromString("www.apache.org"),
+ fromString("www.apache.org").subStringIndex(fromString("."), -3));
+ // str is empty string
+ assertEquals(fromString(""),
+ fromString("").subStringIndex(fromString("."), 1));
+ // empty string delim
+ assertEquals(fromString(""),
+ fromString("www.apache.org").subStringIndex(fromString(""), 1));
+ // delim does not exist in str
+ assertEquals(fromString("www.apache.org"),
+ fromString("www.apache.org").subStringIndex(fromString("#"), 2));
+ // delim is 2 chars
+ assertEquals(fromString("www||apache"),
+ fromString("www||apache||org").subStringIndex(fromString("||"), 2));
+ assertEquals(fromString("apache||org"),
+ fromString("www||apache||org").subStringIndex(fromString("||"), -2));
+ // non ascii chars
+ assertEquals(fromString("大千世界大"),
+ fromString("大千世界大千世界").subStringIndex(fromString("千"), 2));
+ // overlapped delim
+ assertEquals(fromString("||"), fromString("||||||").subStringIndex(fromString("|||"), 3));
+ assertEquals(fromString("|||"), fromString("||||||").subStringIndex(fromString("|||"), -4));
+ }
+
+ @Test
+ public void reverse() {
+ assertEquals(fromString("olleh"), fromString("hello").reverse());
+ assertEquals(EMPTY_UTF8, EMPTY_UTF8.reverse());
+ assertEquals(fromString("者行孙"), fromString("孙行者").reverse());
+ assertEquals(fromString("者行孙 olleh"), fromString("hello 孙行者").reverse());
+ }
+
+ @Test
+ public void repeat() {
+ assertEquals(fromString("数d数d数d数d数d"), fromString("数d").repeat(5));
+ assertEquals(fromString("数d"), fromString("数d").repeat(1));
+ assertEquals(EMPTY_UTF8, fromString("数d").repeat(-1));
+ }
+
+ @Test
+ public void pad() {
+ assertEquals(fromString("hel"), fromString("hello").lpad(3, fromString("????")));
+ assertEquals(fromString("hello"), fromString("hello").lpad(5, fromString("????")));
+ assertEquals(fromString("?hello"), fromString("hello").lpad(6, fromString("????")));
+ assertEquals(fromString("???????hello"), fromString("hello").lpad(12, fromString("????")));
+ assertEquals(fromString("?????hello"), fromString("hello").lpad(10, fromString("?????")));
+ assertEquals(fromString("???????"), EMPTY_UTF8.lpad(7, fromString("?????")));
+
+ assertEquals(fromString("hel"), fromString("hello").rpad(3, fromString("????")));
+ assertEquals(fromString("hello"), fromString("hello").rpad(5, fromString("????")));
+ assertEquals(fromString("hello?"), fromString("hello").rpad(6, fromString("????")));
+ assertEquals(fromString("hello???????"), fromString("hello").rpad(12, fromString("????")));
+ assertEquals(fromString("hello?????"), fromString("hello").rpad(10, fromString("?????")));
+ assertEquals(fromString("???????"), EMPTY_UTF8.rpad(7, fromString("?????")));
+
+ assertEquals(fromString("数据砖"), fromString("数据砖头").lpad(3, fromString("????")));
+ assertEquals(fromString("?数据砖头"), fromString("数据砖头").lpad(5, fromString("????")));
+ assertEquals(fromString("??数据砖头"), fromString("数据砖头").lpad(6, fromString("????")));
+ assertEquals(fromString("孙行数据砖头"), fromString("数据砖头").lpad(6, fromString("孙行者")));
+ assertEquals(fromString("孙行者数据砖头"), fromString("数据砖头").lpad(7, fromString("孙行者")));
+ assertEquals(
+ fromString("孙行者孙行者孙行数据砖头"),
+ fromString("数据砖头").lpad(12, fromString("孙行者")));
+
+ assertEquals(fromString("数据砖"), fromString("数据砖头").rpad(3, fromString("????")));
+ assertEquals(fromString("数据砖头?"), fromString("数据砖头").rpad(5, fromString("????")));
+ assertEquals(fromString("数据砖头??"), fromString("数据砖头").rpad(6, fromString("????")));
+ assertEquals(fromString("数据砖头孙行"), fromString("数据砖头").rpad(6, fromString("孙行者")));
+ assertEquals(fromString("数据砖头孙行者"), fromString("数据砖头").rpad(7, fromString("孙行者")));
+ assertEquals(
+ fromString("数据砖头孙行者孙行者孙行"),
+ fromString("数据砖头").rpad(12, fromString("孙行者")));
+
+ assertEquals(EMPTY_UTF8, fromString("数据砖头").lpad(-10, fromString("孙行者")));
+ assertEquals(EMPTY_UTF8, fromString("数据砖头").lpad(-10, EMPTY_UTF8));
+ assertEquals(fromString("数据砖头"), fromString("数据砖头").lpad(5, EMPTY_UTF8));
+ assertEquals(fromString("数据砖"), fromString("数据砖头").lpad(3, EMPTY_UTF8));
+ assertEquals(EMPTY_UTF8, EMPTY_UTF8.lpad(3, EMPTY_UTF8));
+
+ assertEquals(EMPTY_UTF8, fromString("数据砖头").rpad(-10, fromString("孙行者")));
+ assertEquals(EMPTY_UTF8, fromString("数据砖头").rpad(-10, EMPTY_UTF8));
+ assertEquals(fromString("数据砖头"), fromString("数据砖头").rpad(5, EMPTY_UTF8));
+ assertEquals(fromString("数据砖"), fromString("数据砖头").rpad(3, EMPTY_UTF8));
+ assertEquals(EMPTY_UTF8, EMPTY_UTF8.rpad(3, EMPTY_UTF8));
+ }
+
+ @Test
+ public void substringSQL() {
+ UTF8String e = fromString("example");
+ assertEquals(e.substringSQL(0, 2), fromString("ex"));
+ assertEquals(e.substringSQL(1, 2), fromString("ex"));
+ assertEquals(e.substringSQL(0, 7), fromString("example"));
+ assertEquals(e.substringSQL(1, 2), fromString("ex")); // NOTE(review): same as two lines up
+ assertEquals(e.substringSQL(0, 100), fromString("example"));
+ assertEquals(e.substringSQL(1, 100), fromString("example"));
+ assertEquals(e.substringSQL(2, 2), fromString("xa"));
+ assertEquals(e.substringSQL(1, 6), fromString("exampl"));
+ assertEquals(e.substringSQL(2, 100), fromString("xample"));
+ assertEquals(e.substringSQL(0, 0), fromString(""));
+ assertEquals(e.substringSQL(100, 4), EMPTY_UTF8);
+ assertEquals(e.substringSQL(0, Integer.MAX_VALUE), fromString("example"));
+ assertEquals(e.substringSQL(1, Integer.MAX_VALUE), fromString("example"));
+ assertEquals(e.substringSQL(2, Integer.MAX_VALUE), fromString("xample"));
+ }
+
+ @Test
+ public void split() {
+ assertTrue(Arrays.equals(fromString("ab,def,ghi").split(fromString(","), -1),
+ new UTF8String[]{fromString("ab"), fromString("def"), fromString("ghi")}));
+ assertTrue(Arrays.equals(fromString("ab,def,ghi").split(fromString(","), 2),
+ new UTF8String[]{fromString("ab"), fromString("def,ghi")}));
+ assertTrue(Arrays.equals(fromString("ab,def,ghi").split(fromString(","), 2),
+ new UTF8String[]{fromString("ab"), fromString("def,ghi")})); // NOTE(review): repeats the case above
+ }
+
+ @Test
+ public void levenshteinDistance() {
+ assertEquals(0, EMPTY_UTF8.levenshteinDistance(EMPTY_UTF8));
+ assertEquals(1, EMPTY_UTF8.levenshteinDistance(fromString("a")));
+ assertEquals(7, fromString("aaapppp").levenshteinDistance(EMPTY_UTF8));
+ assertEquals(1, fromString("frog").levenshteinDistance(fromString("fog")));
+ assertEquals(3, fromString("fly").levenshteinDistance(fromString("ant")));
+ assertEquals(7, fromString("elephant").levenshteinDistance(fromString("hippo")));
+ assertEquals(7, fromString("hippo").levenshteinDistance(fromString("elephant")));
+ assertEquals(8, fromString("hippo").levenshteinDistance(fromString("zzzzzzzz")));
+ assertEquals(1, fromString("hello").levenshteinDistance(fromString("hallo")));
+ assertEquals(4, fromString("世界千世").levenshteinDistance(fromString("千a世b")));
+ }
+
+ @Test
+ public void translate() { // covers replacement, deletion, an empty dictionary and non-ASCII keys
+ assertEquals(
+ fromString("1a2s3ae"),
+ fromString("translate").translate(ImmutableMap.of(
+ 'r', '1',
+ 'n', '2',
+ 'l', '3',
+ 't', '\0' // mapped to '\0': the expected string above contains no 't'
+ )));
+ assertEquals(
+ fromString("translate"),
+ fromString("translate").translate(new HashMap<Character, Character>())); // empty dictionary
+ assertEquals(
+ fromString("asae"),
+ fromString("translate").translate(ImmutableMap.of(
+ 'r', '\0',
+ 'n', '\0',
+ 'l', '\0',
+ 't', '\0'
+ )));
+ assertEquals(
+ fromString("aa世b"),
+ fromString("花花世界").translate(ImmutableMap.of(
+ '花', 'a', // both occurrences are replaced
+ '界', 'b'
+ )));
+ }
+
+  @Test
+  public void createBlankString() { // blankString(n) is expected to be exactly n spaces
+    assertEquals(fromString(" "), blankString(1));
+    assertEquals(fromString("  "), blankString(2));
+    assertEquals(fromString("   "), blankString(3));
+    assertEquals(fromString(""), blankString(0));
+  }
+
+ @Test
+ public void findInSet() { // expected values are 1-based positions in the comma-separated list
+ assertEquals(1, fromString("ab").findInSet(fromString("ab")));
+ assertEquals(2, fromString("a,b").findInSet(fromString("b")));
+ assertEquals(3, fromString("abc,b,ab,c,def").findInSet(fromString("ab"))); // "abc" is not a match
+ assertEquals(1, fromString("ab,abc,b,ab,c,def").findInSet(fromString("ab"))); // first of two matches
+ assertEquals(4, fromString(",,,ab,abc,b,ab,c,def").findInSet(fromString("ab"))); // empty entries count
+ assertEquals(1, fromString(",ab,abc,b,ab,c,def").findInSet(fromString(""))); // "" matches empty entry
+ assertEquals(4, fromString("数据砖头,abc,b,ab,c,def").findInSet(fromString("ab")));
+ assertEquals(6, fromString("数据砖头,abc,b,ab,c,def").findInSet(fromString("def"))); // last entry
+ }
+
+  @Test
+  public void soundex() { // arguments follow JUnit's (expected, actual) order
+    assertEquals(fromString("R163"), fromString("Robert").soundex());
+    assertEquals(fromString("R163"), fromString("Rupert").soundex());
+    assertEquals(fromString("R150"), fromString("Rubin").soundex());
+    assertEquals(fromString("A261"), fromString("Ashcraft").soundex());
+    assertEquals(fromString("A261"), fromString("Ashcroft").soundex());
+    assertEquals(fromString("B620"), fromString("Burroughs").soundex());
+    assertEquals(fromString("B620"), fromString("Burrows").soundex());
+    assertEquals(fromString("E251"), fromString("Ekzampul").soundex());
+    assertEquals(fromString("E251"), fromString("Example").soundex());
+    assertEquals(fromString("E460"), fromString("Ellery").soundex());
+    assertEquals(fromString("E460"), fromString("Euler").soundex());
+    assertEquals(fromString("G200"), fromString("Ghosh").soundex());
+    assertEquals(fromString("G200"), fromString("Gauss").soundex());
+    assertEquals(fromString("G362"), fromString("Gutierrez").soundex());
+    assertEquals(fromString("H416"), fromString("Heilbronn").soundex());
+    assertEquals(fromString("H416"), fromString("Hilbert").soundex());
+    assertEquals(fromString("J250"), fromString("Jackson").soundex());
+    assertEquals(fromString("K530"), fromString("Kant").soundex());
+    assertEquals(fromString("K530"), fromString("Knuth").soundex());
+    assertEquals(fromString("L000"), fromString("Lee").soundex());
+    assertEquals(fromString("L222"), fromString("Lukasiewicz").soundex());
+    assertEquals(fromString("L222"), fromString("Lissajous").soundex());
+    assertEquals(fromString("L300"), fromString("Ladd").soundex());
+    assertEquals(fromString("L300"), fromString("Lloyd").soundex());
+    assertEquals(fromString("M220"), fromString("Moses").soundex());
+    assertEquals(fromString("O600"), fromString("O'Hara").soundex());
+    assertEquals(fromString("P236"), fromString("Pfister").soundex());
+    assertEquals(fromString("R150"), fromString("Rubin").soundex());
+    assertEquals(fromString("R163"), fromString("Robert").soundex());
+    assertEquals(fromString("R163"), fromString("Rupert").soundex());
+    assertEquals(fromString("S532"), fromString("Soundex").soundex());
+    assertEquals(fromString("S532"), fromString("Sownteks").soundex());
+    assertEquals(fromString("T522"), fromString("Tymczak").soundex());
+    assertEquals(fromString("V532"), fromString("VanDeusen").soundex());
+    assertEquals(fromString("W252"), fromString("Washington").soundex());
+    assertEquals(fromString("W350"), fromString("Wheaton").soundex());
+
+    assertEquals(fromString("A000"), fromString("a").soundex()); // lower-case input, upper-case code
+    assertEquals(fromString("A100"), fromString("ab").soundex());
+    assertEquals(fromString("A120"), fromString("abc").soundex());
+    assertEquals(fromString("A123"), fromString("abcd").soundex());
+    assertEquals(fromString(""), fromString("").soundex()); // below: inputs expected back unchanged
+    assertEquals(fromString("123"), fromString("123").soundex());
+    assertEquals(fromString("世界千世"), fromString("世界千世").soundex());
+  }
+}
http://git-wip-us.apache.org/repos/asf/spark/blob/b0ee7d43/common/unsafe/src/test/scala/org/apache/spark/unsafe/types/UTF8StringPropertyCheckSuite.scala
----------------------------------------------------------------------
diff --git a/common/unsafe/src/test/scala/org/apache/spark/unsafe/types/UTF8StringPropertyCheckSuite.scala b/common/unsafe/src/test/scala/org/apache/spark/unsafe/types/UTF8StringPropertyCheckSuite.scala
new file mode 100644
index 0000000..b3bbd68
--- /dev/null
+++ b/common/unsafe/src/test/scala/org/apache/spark/unsafe/types/UTF8StringPropertyCheckSuite.scala
@@ -0,0 +1,248 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.unsafe.types
+
+import org.apache.commons.lang3.StringUtils
+import org.scalacheck.{Arbitrary, Gen}
+import org.scalatest.prop.GeneratorDrivenPropertyChecks
+// scalastyle:off
+import org.scalatest.{FunSuite, Matchers}
+
+import org.apache.spark.unsafe.types.UTF8String.{fromString => toUTF8}
+
+/**
+ * This test suite uses ScalaCheck to generate randomized inputs for UTF8String testing.
+ */
+class UTF8StringPropertyCheckSuite extends FunSuite with GeneratorDrivenPropertyChecks with Matchers {
+// scalastyle:on
+
+  test("toString") { // round trip: String -> UTF8String -> String
+    forAll { (input: String) =>
+      assert(toUTF8(input).toString() === input)
+    }
+  }
+
+  test("numChars") { // character count is compared with String.length of the input
+    forAll { (input: String) =>
+      assert(toUTF8(input).numChars() === input.length)
+    }
+  }
+
+  test("startsWith") { // the string itself and each of its prefixes must be accepted
+    forAll { (input: String) =>
+      val encoded = toUTF8(input)
+      assert(encoded.startsWith(encoded))
+      for (dropped <- 1 to input.length) {
+        assert(encoded.startsWith(toUTF8(input.dropRight(dropped))))
+      }
+    }
+  }
+
+  test("endsWith") { // the string itself and each of its suffixes must be accepted
+    forAll { (input: String) =>
+      val encoded = toUTF8(input)
+      assert(encoded.endsWith(encoded))
+      for (dropped <- 1 to input.length) {
+        assert(encoded.endsWith(toUTF8(input.drop(dropped))))
+      }
+    }
+  }
+
+  test("toUpperCase") { // oracle: String.toUpperCase on the same input
+    forAll { (input: String) =>
+      assert(toUTF8(input).toUpperCase === toUTF8(input.toUpperCase))
+    }
+  }
+
+  test("toLowerCase") { // oracle: String.toLowerCase on the same input
+    forAll { (input: String) =>
+      assert(toUTF8(input).toLowerCase === toUTF8(input.toLowerCase))
+    }
+  }
+
+  test("compare") { // only the sign of compareTo is checked, not its magnitude
+    forAll { (lhs: String, rhs: String) =>
+      assert(Math.signum(toUTF8(lhs).compareTo(toUTF8(rhs))) === Math.signum(lhs.compareTo(rhs)))
+    }
+  }
+
+  test("substring") { // every (lo, hi) index pair with lo <= hi
+    forAll { (input: String) =>
+      for (lo <- 0 to input.length; hi <- lo to input.length) {
+        assert(toUTF8(input).substring(lo, hi).toString === input.substring(lo, hi))
+      }
+    }
+  }
+
+  test("contains") { // each substring of the input is looked up in the input itself
+    forAll { (input: String) =>
+      for (lo <- 0 to input.length; hi <- lo to input.length) {
+        val piece = input.substring(lo, hi)
+        assert(toUTF8(input).contains(toUTF8(piece)) === input.contains(piece))
+      }
+    }
+  }
+
+ val whitespaceChar: Gen[Char] = Gen.choose(0x00, 0x20).map(_.toChar) // code points 0x00 to 0x20
+ val whitespaceString: Gen[String] = Gen.listOf(whitespaceChar).map(_.mkString)
+ val randomString: Gen[String] = Arbitrary.arbString.arbitrary // ScalaCheck's stock String generator
+
+ test("trim, trimLeft, trimRight") { // oracles: String.trim plus the one-sided helpers below
+ // lTrim and rTrim are both modified from java.lang.String.trim
+ def lTrim(s: String): String = {
+ var st = 0
+ val array: Array[Char] = s.toCharArray
+ while ((st < s.length) && (array(st) <= ' ')) { // skip leading chars <= ' '
+ st += 1
+ }
+ if (st > 0) s.substring(st, s.length) else s
+ }
+ def rTrim(s: String): String = {
+ var len = s.length
+ val array: Array[Char] = s.toCharArray
+ while ((len > 0) && (array(len - 1) <= ' ')) { // skip trailing chars <= ' '
+ len -= 1
+ }
+ if (len < s.length) s.substring(0, len) else s
+ }
+
+ forAll(
+ whitespaceString,
+ randomString,
+ whitespaceString
+ ) { (start: String, middle: String, end: String) =>
+ val s = start + middle + end // generated whitespace on both sides of an arbitrary string
+ assert(toUTF8(s).trim() === toUTF8(s.trim()))
+ assert(toUTF8(s).trimLeft() === toUTF8(lTrim(s)))
+ assert(toUTF8(s).trimRight() === toUTF8(rTrim(s)))
+ }
+ }
+
+  test("reverse") { // oracle: Scala's String reverse on the same input
+    forAll { (input: String) =>
+      assert(toUTF8(input).reverse === toUTF8(input.reverse))
+    }
+  }
+
+  test("indexOf") { // the search always starts at offset 0; oracle is String.indexOf
+    forAll { (input: String) =>
+      for (lo <- 0 to input.length; hi <- lo to input.length) {
+        val piece = input.substring(lo, hi)
+        assert(toUTF8(input).indexOf(toUTF8(piece), 0) === input.indexOf(piece))
+      }
+    }
+  }
+
+  val randomInt = Gen.choose(-100, 100)
+
+  test("repeat") {
+    def repeat(str: String, times: Int): String = {
+      if (times <= 0) "" else str * times
+    }
+    // Use the bounded randomInt generator: arbitrary Ints give repeat counts that can hang the test.
+    forAll(randomString, randomInt) { (input: String, count: Int) =>
+      assert(toUTF8(input).repeat(count) === toUTF8(repeat(input, count)))
+    }
+  }
+
+  test("lpad, rpad") { // both directions are checked against the reference `padding` helper
+    def padding(origin: String, pad: String, length: Int, isLPad: Boolean): String = {
+      if (length <= 0) return "" // a non-positive target length yields the empty string
+      if (length <= origin.length) {
+        origin.substring(0, length) // truncate; length > 0 is guaranteed by the guard above
+      } else {
+        if (pad.length == 0) return origin // nothing to pad with
+        val toPad = length - origin.length
+        val partPad = pad.substring(0, toPad % pad.length) // empty when pad divides toPad evenly
+        if (isLPad) {
+          pad * (toPad / pad.length) + partPad + origin
+        } else {
+          origin + pad * (toPad / pad.length) + partPad
+        }
+      }
+    }
+
+    forAll (
+      randomString,
+      randomString,
+      randomInt
+    ) { (s: String, pad: String, length: Int) =>
+      assert(toUTF8(s).lpad(length, toUTF8(pad)) ===
+        toUTF8(padding(s, pad, length, true)))
+      assert(toUTF8(s).rpad(length, toUTF8(pad)) ===
+        toUTF8(padding(s, pad, length, false)))
+    }
+  }
+
+  val nullalbeSeq = Gen.listOf(Gen.oneOf[String](null: String, randomString))
+
+  test("concat") { // reference helper: the result is null as soon as any input is null
+    def concat(origin: Seq[String]): String =
+      if (origin.contains(null)) null else origin.mkString
+
+    forAll { (parts: Seq[String]) =>
+      assert(UTF8String.concat(parts.map(toUTF8): _*) === toUTF8(parts.mkString))
+    }
+    forAll (nullalbeSeq) { (parts: Seq[String]) =>
+      assert(UTF8String.concat(parts.map(toUTF8): _*) === toUTF8(concat(parts)))
+    }
+  }
+
+  test("concatWs") { // reference helper: null parts are skipped, a null separator gives null
+    def concatWs(sep: String, inputs: Seq[String]): String = {
+      if (sep == null) null
+      else inputs.filterNot(_ == null).mkString(sep)
+    }
+
+    forAll { (delim: String, parts: Seq[String]) =>
+      assert(UTF8String.concatWs(toUTF8(delim), parts.map(toUTF8): _*) ===
+        toUTF8(parts.mkString(delim)))
+    }
+    forAll(randomString, nullalbeSeq) {(delim: String, parts: Seq[String]) =>
+      assert(UTF8String.concatWs(toUTF8(delim), parts.map(toUTF8): _*) ===
+        toUTF8(concatWs(delim, parts)))
+    }
+  }
+
+  // TODO: enable this once there is a proper way to generate valid patterns
+  ignore("split") {
+    forAll { (input: String, regex: String, limit: Int) =>
+      assert(toUTF8(input).split(toUTF8(regex), limit) ===
+        input.split(regex, limit).map(toUTF8(_)))
+    }
+  }
+
+  test("levenshteinDistance") { // oracle: commons-lang3 StringUtils.getLevenshteinDistance
+    forAll { (source: String, target: String) =>
+      assert(toUTF8(source).levenshteinDistance(toUTF8(target)) ===
+        StringUtils.getLevenshteinDistance(source, target))
+    }
+  }
+
+  test("hashCode") { // two separately built instances of the same string must hash alike
+    forAll { (input: String) =>
+      assert(toUTF8(input).hashCode() === toUTF8(input).hashCode())
+    }
+  }
+
+  test("equals") { // equality must agree with String.equals on the original inputs
+    forAll { (lhs: String, rhs: String) =>
+      assert(toUTF8(lhs).equals(toUTF8(rhs)) === lhs.equals(rhs))
+    }
+  }
+}
http://git-wip-us.apache.org/repos/asf/spark/blob/b0ee7d43/pom.xml
----------------------------------------------------------------------
diff --git a/pom.xml b/pom.xml
index 2376e30..2148379 100644
--- a/pom.xml
+++ b/pom.xml
@@ -89,7 +89,8 @@
<module>common/sketch</module>
<module>common/network-common</module>
<module>common/network-shuffle</module>
- <module>tags</module>
+ <module>common/unsafe</module>
+ <module>common/tags</module>
<module>core</module>
<module>graphx</module>
<module>mllib</module>
@@ -99,7 +100,6 @@
<module>sql/core</module>
<module>sql/hive</module>
<module>docker-integration-tests</module>
- <module>unsafe</module>
<module>assembly</module>
<module>external/twitter</module>
<module>external/flume</module>
http://git-wip-us.apache.org/repos/asf/spark/blob/b0ee7d43/tags/README.md
----------------------------------------------------------------------
diff --git a/tags/README.md b/tags/README.md
deleted file mode 100644
index 01e5126..0000000
--- a/tags/README.md
+++ /dev/null
@@ -1 +0,0 @@
-This module includes annotations in Java that are used to annotate test suites.
http://git-wip-us.apache.org/repos/asf/spark/blob/b0ee7d43/tags/pom.xml
----------------------------------------------------------------------
diff --git a/tags/pom.xml b/tags/pom.xml
deleted file mode 100644
index 3e8e6f6..0000000
--- a/tags/pom.xml
+++ /dev/null
@@ -1,50 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- ~ Licensed to the Apache Software Foundation (ASF) under one or more
- ~ contributor license agreements. See the NOTICE file distributed with
- ~ this work for additional information regarding copyright ownership.
- ~ The ASF licenses this file to You under the Apache License, Version 2.0
- ~ (the "License"); you may not use this file except in compliance with
- ~ the License. You may obtain a copy of the License at
- ~
- ~ http://www.apache.org/licenses/LICENSE-2.0
- ~
- ~ Unless required by applicable law or agreed to in writing, software
- ~ distributed under the License is distributed on an "AS IS" BASIS,
- ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- ~ See the License for the specific language governing permissions and
- ~ limitations under the License.
- -->
-
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
- xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
- <modelVersion>4.0.0</modelVersion>
- <parent>
- <groupId>org.apache.spark</groupId>
- <artifactId>spark-parent_2.11</artifactId>
- <version>2.0.0-SNAPSHOT</version>
- <relativePath>../pom.xml</relativePath>
- </parent>
-
- <groupId>org.apache.spark</groupId>
- <artifactId>spark-test-tags_2.11</artifactId>
- <packaging>jar</packaging>
- <name>Spark Project Test Tags</name>
- <url>http://spark.apache.org/</url>
- <properties>
- <sbt.project.name>test-tags</sbt.project.name>
- </properties>
-
- <dependencies>
- <dependency>
- <groupId>org.scalatest</groupId>
- <artifactId>scalatest_${scala.binary.version}</artifactId>
- <scope>compile</scope>
- </dependency>
- </dependencies>
-
- <build>
- <outputDirectory>target/scala-${scala.binary.version}/classes</outputDirectory>
- <testOutputDirectory>target/scala-${scala.binary.version}/test-classes</testOutputDirectory>
- </build>
-</project>
http://git-wip-us.apache.org/repos/asf/spark/blob/b0ee7d43/tags/src/main/java/org/apache/spark/tags/DockerTest.java
----------------------------------------------------------------------
diff --git a/tags/src/main/java/org/apache/spark/tags/DockerTest.java b/tags/src/main/java/org/apache/spark/tags/DockerTest.java
deleted file mode 100644
index 0fecf3b..0000000
--- a/tags/src/main/java/org/apache/spark/tags/DockerTest.java
+++ /dev/null
@@ -1,26 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.tags;
-
-import java.lang.annotation.*;
-import org.scalatest.TagAnnotation;
-
-@TagAnnotation
-@Retention(RetentionPolicy.RUNTIME)
-@Target({ElementType.METHOD, ElementType.TYPE})
-public @interface DockerTest { }
http://git-wip-us.apache.org/repos/asf/spark/blob/b0ee7d43/tags/src/main/java/org/apache/spark/tags/ExtendedHiveTest.java
----------------------------------------------------------------------
diff --git a/tags/src/main/java/org/apache/spark/tags/ExtendedHiveTest.java b/tags/src/main/java/org/apache/spark/tags/ExtendedHiveTest.java
deleted file mode 100644
index 83279e5..0000000
--- a/tags/src/main/java/org/apache/spark/tags/ExtendedHiveTest.java
+++ /dev/null
@@ -1,27 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.tags;
-
-import java.lang.annotation.*;
-
-import org.scalatest.TagAnnotation;
-
-@TagAnnotation
-@Retention(RetentionPolicy.RUNTIME)
-@Target({ElementType.METHOD, ElementType.TYPE})
-public @interface ExtendedHiveTest { }
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org