You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by ha...@apache.org on 2013/04/25 09:26:23 UTC
svn commit: r1475653 - in /hive/branches/vectorization/ql/src:
java/org/apache/hadoop/hive/ql/exec/vector/expressions/
java/org/apache/hadoop/hive/ql/udf/
test/org/apache/hadoop/hive/ql/exec/vector/expressions/
Author: hashutosh
Date: Thu Apr 25 07:26:22 2013
New Revision: 1475653
URL: http://svn.apache.org/r1475653
Log:
HIVE-4160 : Implement vectorized string functions UPPER(), LOWER(), LENGTH() (Eric Hanson via Ashutosh Chauhan)
Added:
hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringExpr.java
hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringLength.java
hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringLower.java
hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringUnaryUDF.java
hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringUpper.java
hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/udf/IUDFUnaryString.java
hive/branches/vectorization/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorStringExpressions.java
Modified:
hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFLower.java
hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFUpper.java
Added: hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringExpr.java
URL: http://svn.apache.org/viewvc/hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringExpr.java?rev=1475653&view=auto
==============================================================================
--- hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringExpr.java (added)
+++ hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringExpr.java Thu Apr 25 07:26:22 2013
@@ -0,0 +1,43 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.exec.vector.expressions;
+
+/**
+ * String expression evaluation helper functions
+ */
+public class StringExpr {
+
+  /**
+   * Compare two strings stored as slices of byte arrays, each slice described
+   * by its own start position and length.
+   *
+   * Bytes are compared as unsigned values in lexicographic order, which is the
+   * order used for sorting UTF-8 data. If one slice is a prefix of the other,
+   * the shorter slice sorts first.
+   *
+   * @param arg1 byte array holding the first string
+   * @param start1 offset of the first string within arg1
+   * @param len1 length in bytes of the first string
+   * @param arg2 byte array holding the second string
+   * @param start2 offset of the second string within arg2
+   * @param len2 length in bytes of the second string
+   * @return a negative value if arg1 < arg2, 0 if arg1 = arg2,
+   *         a positive value if arg1 > arg2
+   */
+  public static int compare(byte[] arg1, int start1, int len1, byte[] arg2, int start2, int len2) {
+    // Only the bytes both slices have in common can be compared pairwise.
+    int common = Math.min(len1, len2);
+    int pos1 = start1;
+    int pos2 = start2;
+    for (int k = 0; k < common; k++, pos1++, pos2++) {
+      // Mask to 0..255 so bytes >= 0x80 sort after ASCII instead of before it.
+      int diff = (arg1[pos1] & 0xff) - (arg2[pos2] & 0xff);
+      if (diff != 0) {
+        return diff;
+      }
+    }
+    // All shared bytes are equal: the length difference decides.
+    return len1 - len2;
+  }
+}
Added: hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringLength.java
URL: http://svn.apache.org/viewvc/hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringLength.java?rev=1475653&view=auto
==============================================================================
--- hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringLength.java (added)
+++ hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringLength.java Thu Apr 25 07:26:22 2013
@@ -0,0 +1,134 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.exec.vector.expressions;
+
+import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+
+public class StringLength extends VectorExpression {
+ private int colNum;
+ private int outputColumn;
+
+ StringLength (int colNum, int outputColumn) {
+ this.colNum = colNum;
+ this.outputColumn = outputColumn;
+ }
+
+ // Calculate the length of the UTF-8 strings in input vector and place results in output vector.
+ @Override
+ public void evaluate(VectorizedRowBatch batch) {
+ BytesColumnVector inputColVector = (BytesColumnVector) batch.cols[colNum];
+ LongColumnVector outV = (LongColumnVector) batch.cols[outputColumn];
+ int[] sel = batch.selected;
+ int n = batch.size;
+ byte[][] vector = inputColVector.vector;
+ int start[] = inputColVector.start;
+ int length[] = inputColVector.length;
+ long[] resultLen = outV.vector;
+
+ if (n == 0) {
+
+ //Nothing to do
+ return;
+ }
+
+ if (inputColVector.noNulls) {
+ outV.noNulls = true;
+ if (inputColVector.isRepeating) {
+ outV.isRepeating = true;
+ resultLen[0] = UTF8StringLength(vector[0], start[0], length[0]);
+ } else if (batch.selectedInUse) {
+ for(int j = 0; j != n; j++) {
+ int i = sel[j];
+ resultLen[i] = UTF8StringLength(vector[i], start[i], length[i]);
+ }
+ outV.isRepeating = false;
+ } else {
+ for(int i = 0; i != n; i++) {
+ resultLen[i] = UTF8StringLength(vector[i], start[i], length[i]);
+ }
+ outV.isRepeating = false;
+ }
+ } else {
+
+ /*
+ * Handle case with nulls. Don't do function if the value is null, to save time,
+ * because calling the function can be expensive.
+ */
+ outV.noNulls = false;
+ if (inputColVector.isRepeating) {
+ outV.isRepeating = true;
+ outV.isNull[0] = inputColVector.isNull[0];
+ if (!inputColVector.isNull[0]) {
+ resultLen[0] = UTF8StringLength(vector[0], start[0], length[0]);
+ }
+ } else if (batch.selectedInUse) {
+ for(int j = 0; j != n; j++) {
+ int i = sel[j];
+ if (!inputColVector.isNull[i]) {
+ resultLen[i] = UTF8StringLength(vector[i], start[i], length[i]);
+ }
+ outV.isNull[i] = inputColVector.isNull[i];
+ }
+ outV.isRepeating = false;
+ } else {
+ for(int i = 0; i != n; i++) {
+ if (!inputColVector.isNull[i]) {
+ resultLen[i] = UTF8StringLength(vector[i], start[i], length[i]);
+ }
+ outV.isNull[i] = inputColVector.isNull[i];
+ }
+ outV.isRepeating = false;
+ }
+ }
+ }
+
+ /*
+ * Return length in characters of UTF8 string in byte array
+ * beginning at start that is len bytes long.
+ */
+ static long UTF8StringLength(byte[] s, int start, int len)
+ {
+ long resultLength = 0;
+ for (int i = start; i < start + len; i++) {
+
+ /* Byte bit patterns of the form 10xxxxxx are continuation
+ * bytes. All other bit patterns are the first byte of
+ * a character.
+ */
+ if ((s[i] & 0xc0) != 0x80) {
+ resultLength++;
+ }
+ }
+ return resultLength;
+ }
+
+ @Override
+ public int getOutputColumn() {
+ return outputColumn;
+ }
+
+ @Override
+ public String getOutputType() {
+ return "String";
+ }
+
+
+}
Added: hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringLower.java
URL: http://svn.apache.org/viewvc/hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringLower.java?rev=1475653&view=auto
==============================================================================
--- hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringLower.java (added)
+++ hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringLower.java Thu Apr 25 07:26:22 2013
@@ -0,0 +1,27 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.ql.exec.vector.expressions;
+
+import org.apache.hadoop.hive.ql.udf.UDFLower;
+import org.apache.hadoop.hive.ql.udf.IUDFUnaryString;
+
+/**
+ * Vectorized expression for LOWER(): applies UDFLower to every string of an
+ * input column through the generic StringUnaryUDF machinery.
+ */
+public class StringLower extends StringUnaryUDF {
+
+  /**
+   * @param colNum index of the input string column in the batch
+   * @param outputColumn index of the string column receiving the results
+   */
+  StringLower(int colNum, int outputColumn) {
+    // UDFLower implements IUDFUnaryString, so it can be passed directly.
+    super(colNum, outputColumn, new UDFLower());
+  }
+}
Added: hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringUnaryUDF.java
URL: http://svn.apache.org/viewvc/hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringUnaryUDF.java?rev=1475653&view=auto
==============================================================================
--- hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringUnaryUDF.java (added)
+++ hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringUnaryUDF.java Thu Apr 25 07:26:22 2013
@@ -0,0 +1,133 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.ql.exec.vector.expressions;
+
+import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.hive.ql.udf.IUDFUnaryString;
+
+public class StringUnaryUDF extends VectorExpression {
+
+ int colNum;
+ int outputColumn;
+ IUDFUnaryString func;
+ Text s;
+
+ StringUnaryUDF (int colNum, int outputColumn, IUDFUnaryString func) {
+ this.colNum = colNum;
+ this.outputColumn = outputColumn;
+ this.func = func;
+ s = new Text();
+ }
+
+ @Override
+ public void evaluate(VectorizedRowBatch batch) {
+ BytesColumnVector inputColVector = (BytesColumnVector) batch.cols[colNum];
+ int[] sel = batch.selected;
+ int n = batch.size;
+ byte[][] vector = inputColVector.vector;
+ int start[] = inputColVector.start;
+ int length[] = inputColVector.length;
+ BytesColumnVector outV = (BytesColumnVector) batch.cols[outputColumn];
+ byte[][] outputVector = outV.vector;
+ Text t;
+
+ if (n == 0) {
+ //Nothing to do
+ return;
+ }
+
+ // Design Note: In the future, if this function can be implemented
+ // directly to translate input to output without creating new
+ // objects, performance can probably be improved significantly.
+ // It's implemented in the simplest way now, just calling the
+ // existing built-in function.
+
+ if (inputColVector.noNulls) {
+ outV.noNulls = true;
+ if (inputColVector.isRepeating) {
+ outV.isRepeating = true;
+ s.set(vector[0], start[0], length[0]);
+ t = func.evaluate(s);
+ outV.setRef(0, t.getBytes(), 0, t.getLength());
+ } else if (batch.selectedInUse) {
+ for(int j=0; j != n; j++) {
+ int i = sel[j];
+ s.set(vector[i], start[i], length[i]);
+ t = func.evaluate(s);
+ outV.setRef(i, t.getBytes(), 0, t.getLength());
+ }
+ outV.isRepeating = false;
+ } else {
+ for(int i = 0; i != n; i++) {
+ s.set(vector[i], start[i], length[i]);
+ t = func.evaluate(s);
+ outV.setRef(i, t.getBytes(), 0, t.getLength());
+ }
+ outV.isRepeating = false;
+ }
+ } else {
+ // Handle case with nulls. Don't do function if the value is null, to save time,
+ // because calling the function can be expensive.
+ outV.noNulls = false;
+ if (inputColVector.isRepeating) {
+ outV.isRepeating = true;
+ outV.isNull[0] = inputColVector.isNull[0];
+ if (!inputColVector.isNull[0]) {
+ s.set(vector[0], start[0], length[0]);
+ t = func.evaluate(s);
+ outV.setRef(0, t.getBytes(), 0, t.getLength());
+ }
+ } else if (batch.selectedInUse) {
+ for(int j=0; j != n; j++) {
+ int i = sel[j];
+ if (!inputColVector.isNull[i]) {
+ s.set(vector[i], start[i], length[i]);
+ t = func.evaluate(s);
+ outV.setRef(i, t.getBytes(), 0, t.getLength());
+ }
+ outV.isNull[i] = inputColVector.isNull[i];
+ }
+ outV.isRepeating = false;
+ } else {
+ for(int i = 0; i != n; i++) {
+ if (!inputColVector.isNull[i]) {
+ s.set(vector[i], start[i], length[i]);
+ t = func.evaluate(s);
+ outV.setRef(i, t.getBytes(), 0, t.getLength());
+ }
+ outV.isNull[i] = inputColVector.isNull[i];
+ }
+ outV.isRepeating = false;
+ }
+ }
+ }
+
+ @Override
+ public int getOutputColumn() {
+ return outputColumn;
+ }
+
+ @Override
+ public String getOutputType() {
+ return "String";
+ }
+
+
+}
Added: hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringUpper.java
URL: http://svn.apache.org/viewvc/hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringUpper.java?rev=1475653&view=auto
==============================================================================
--- hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringUpper.java (added)
+++ hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringUpper.java Thu Apr 25 07:26:22 2013
@@ -0,0 +1,28 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.exec.vector.expressions;
+
+import org.apache.hadoop.hive.ql.udf.UDFUpper;
+import org.apache.hadoop.hive.ql.udf.IUDFUnaryString;
+
+/**
+ * Vectorized expression for UPPER(): applies UDFUpper to every string of an
+ * input column through the generic StringUnaryUDF machinery.
+ */
+public class StringUpper extends StringUnaryUDF {
+
+  /**
+   * @param colNum index of the input string column in the batch
+   * @param outputColumn index of the string column receiving the results
+   */
+  StringUpper(int colNum, int outputColumn) {
+    // UDFUpper implements IUDFUnaryString, so it can be passed directly.
+    super(colNum, outputColumn, new UDFUpper());
+  }
+}
Added: hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/udf/IUDFUnaryString.java
URL: http://svn.apache.org/viewvc/hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/udf/IUDFUnaryString.java?rev=1475653&view=auto
==============================================================================
--- hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/udf/IUDFUnaryString.java (added)
+++ hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/udf/IUDFUnaryString.java Thu Apr 25 07:26:22 2013
@@ -0,0 +1,10 @@
+package org.apache.hadoop.hive.ql.udf;
+
+import org.apache.hadoop.io.Text;
+
+/**
+ * Interface to support use of standard UDFs inside the vectorized execution code path.
+ *
+ * A UDF that maps one string to one string implements this interface so that
+ * vectorized expressions can call it without knowing the concrete UDF class.
+ */
+public interface IUDFUnaryString {
+
+  /**
+   * Evaluate the function on a single string value.
+   *
+   * @param s the input string
+   * @return the result of applying the function to s
+   */
+  Text evaluate(Text s);
+}
Modified: hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFLower.java
URL: http://svn.apache.org/viewvc/hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFLower.java?rev=1475653&r1=1475652&r2=1475653&view=diff
==============================================================================
--- hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFLower.java (original)
+++ hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFLower.java Thu Apr 25 07:26:22 2013
@@ -30,7 +30,7 @@ import org.apache.hadoop.io.Text;
value = "_FUNC_(str) - Returns str with all characters changed to lowercase",
extended = "Example:\n"
+ " > SELECT _FUNC_('Facebook') FROM src LIMIT 1;\n" + " 'facebook'")
-public class UDFLower extends UDF {
+public class UDFLower extends UDF implements IUDFUnaryString {
private Text t = new Text();
public UDFLower() {
Modified: hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFUpper.java
URL: http://svn.apache.org/viewvc/hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFUpper.java?rev=1475653&r1=1475652&r2=1475653&view=diff
==============================================================================
--- hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFUpper.java (original)
+++ hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFUpper.java Thu Apr 25 07:26:22 2013
@@ -30,7 +30,7 @@ import org.apache.hadoop.io.Text;
value = "_FUNC_(str) - Returns str with all characters changed to uppercase",
extended = "Example:\n"
+ " > SELECT _FUNC_('Facebook') FROM src LIMIT 1;\n" + " 'FACEBOOK'")
-public class UDFUpper extends UDF {
+public class UDFUpper extends UDF implements IUDFUnaryString {
Text t = new Text();
Added: hive/branches/vectorization/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorStringExpressions.java
URL: http://svn.apache.org/viewvc/hive/branches/vectorization/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorStringExpressions.java?rev=1475653&view=auto
==============================================================================
--- hive/branches/vectorization/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorStringExpressions.java (added)
+++ hive/branches/vectorization/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorStringExpressions.java Thu Apr 25 07:26:22 2013
@@ -0,0 +1,286 @@
+package org.apache.hadoop.hive.ql.exec.vector.expressions;
+
+import junit.framework.Assert;
+
+import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.junit.Test;
+
+import java.io.UnsupportedEncodingException;
+import java.util.Arrays;
+import org.apache.hadoop.io.Text;
+
+/**
+ * Tests for loading BytesColumnVector and for the vectorized string
+ * expressions StringLower, StringUpper and StringLength.
+ */
+public class TestVectorStringExpressions {
+
+  static byte[] red;
+  static byte[] red2; // second copy of red, different object
+  static byte[] green;
+  static byte[] emptyString;
+  static byte[] mixedUp;
+  static byte[] mixedUpLower;
+  static byte[] mixedUpUpper;
+  static byte[] multiByte;
+  static byte[] mixPercentPattern;
+
+  static {
+    red = utf8("red");
+    green = utf8("green");
+    emptyString = utf8("");
+    mixedUp = utf8("mixedUp");
+    mixedUpLower = utf8("mixedup");
+    mixedUpUpper = utf8("MIXEDUP");
+    mixPercentPattern = utf8("mix%"); // for use as wildcard pattern to test LIKE
+    multiByte = new byte[100];
+    addMultiByteChars(multiByte);
+    red2 = new byte[red.length];
+    System.arraycopy(red, 0, red2, 0, red.length);
+  }
+
+  /**
+   * Encode a string as UTF-8. A failure is rethrown with its cause instead of
+   * being printed and ignored, so that it is not masked by a later
+   * NullPointerException on the missing byte array.
+   */
+  private static byte[] utf8(String str) {
+    try {
+      return str.getBytes("UTF-8");
+    } catch (UnsupportedEncodingException e) {
+      throw new RuntimeException("UTF-8 encoding is not supported", e);
+    }
+  }
+
+  // add some multi-byte characters to test length routine later.
+  // total characters = 4; byte length = 10
+  static void addMultiByteChars(byte[] b) {
+    int i = 0;
+    b[i++] = (byte) 0x41; // letter "A" (1 byte)
+    b[i++] = (byte) 0xC3; // Latin capital A with grave (2 bytes)
+    b[i++] = (byte) 0x80;
+    b[i++] = (byte) 0xE2; // Euro sign (3 bytes)
+    b[i++] = (byte) 0x82;
+    b[i++] = (byte) 0xAC;
+    b[i++] = (byte) 0xF0; // Asian character U+24B62 (4 bytes)
+    b[i++] = (byte) 0xA4;
+    b[i++] = (byte) 0xAD;
+    b[i++] = (byte) 0xA2;
+  }
+
+  // Load a BytesColumnVector by copying in large data, enough to force
+  // the buffer to expand.
+  @Test
+  public void testLoadBytesColumnVectorByValueLargeData() {
+    BytesColumnVector bcv = new BytesColumnVector(VectorizedRowBatch.DEFAULT_SIZE);
+    bcv.initBuffer(10); // initialize with estimated element size 10
+    String s = "0123456789";
+    while (s.length() < 500) {
+      s += s;
+    }
+    byte[] b = utf8(s);
+    for (int i = 0; i != VectorizedRowBatch.DEFAULT_SIZE; i++) {
+      bcv.setVal(i, b, 0, b.length);
+    }
+    Assert.assertTrue(bcv.bufferSize() >= b.length * VectorizedRowBatch.DEFAULT_SIZE);
+  }
+
+  // set values by reference, copy the data out, and verify equality
+  @Test
+  public void testLoadBytesColumnVectorByRef() {
+    BytesColumnVector bcv = new BytesColumnVector(VectorizedRowBatch.DEFAULT_SIZE);
+    byte[] b = utf8("red");
+    for (int i = 0; i != VectorizedRowBatch.DEFAULT_SIZE; i++) {
+      bcv.setRef(i, b, 0, b.length);
+    }
+    // verify
+    byte[] v = new byte[b.length];
+    for (int i = 0; i != VectorizedRowBatch.DEFAULT_SIZE; i++) {
+      Assert.assertTrue(bcv.length[i] == b.length);
+      System.arraycopy(bcv.vector[i], bcv.start[i], v, 0, b.length);
+      Assert.assertTrue(Arrays.equals(b, v));
+    }
+  }
+
+  VectorizedRowBatch makeStringBatch() {
+    // create a batch with one string ("Bytes") column
+    VectorizedRowBatch batch = new VectorizedRowBatch(1, VectorizedRowBatch.DEFAULT_SIZE);
+    BytesColumnVector v = new BytesColumnVector(VectorizedRowBatch.DEFAULT_SIZE);
+    batch.cols[0] = v;
+    /*
+     * Add these 3 values:
+     *
+     * red
+     * green
+     * NULL
+     */
+    v.setRef(0, red, 0, red.length);
+    v.isNull[0] = false;
+    v.setRef(1, green, 0, green.length);
+    v.isNull[1] = false;
+    v.setRef(2, emptyString, 0, emptyString.length);
+    v.isNull[2] = true;
+
+    v.noNulls = false;
+
+    batch.size = 3;
+    return batch;
+  }
+
+  VectorizedRowBatch makeStringBatchMixedCase() {
+    // create a batch with two string ("Bytes") columns:
+    // column 0 is the input, column 1 receives the output
+    VectorizedRowBatch batch = new VectorizedRowBatch(2, VectorizedRowBatch.DEFAULT_SIZE);
+    BytesColumnVector v = new BytesColumnVector(VectorizedRowBatch.DEFAULT_SIZE);
+    batch.cols[0] = v;
+    BytesColumnVector outV = new BytesColumnVector(VectorizedRowBatch.DEFAULT_SIZE);
+    batch.cols[1] = outV;
+    /*
+     * Add these 3 values:
+     *
+     * mixedUp
+     * green
+     * NULL
+     */
+    v.setRef(0, mixedUp, 0, mixedUp.length);
+    v.isNull[0] = false;
+    v.setRef(1, green, 0, green.length);
+    v.isNull[1] = false;
+    v.setRef(2, emptyString, 0, emptyString.length);
+    v.isNull[2] = true;
+    v.noNulls = false;
+
+    batch.size = 3;
+    return batch;
+  }
+
+  VectorizedRowBatch makeStringBatchMixedCharSize() {
+    // create a new batch with one char column (for input)
+    // and one long column (for output)
+    VectorizedRowBatch batch = new VectorizedRowBatch(2, VectorizedRowBatch.DEFAULT_SIZE);
+    BytesColumnVector v = new BytesColumnVector(VectorizedRowBatch.DEFAULT_SIZE);
+    batch.cols[0] = v;
+    LongColumnVector outV = new LongColumnVector(VectorizedRowBatch.DEFAULT_SIZE);
+    batch.cols[1] = outV;
+
+    /*
+     * Add these 4 values:
+     *
+     * mixedUp
+     * green
+     * NULL
+     * <4 char string with multi-byte chars>
+     */
+    v.setRef(0, mixedUp, 0, mixedUp.length);
+    v.isNull[0] = false;
+    v.setRef(1, green, 0, green.length);
+    v.isNull[1] = false;
+    v.setRef(2, emptyString, 0, emptyString.length);
+    v.isNull[2] = true;
+    v.noNulls = false;
+    v.setRef(3, multiByte, 0, 10); // only the first 10 bytes of multiByte are filled in
+    v.isNull[3] = false;
+
+    batch.size = 4;
+    return batch;
+  }
+
+  @Test
+  public void testColLower() {
+    // has nulls, not repeating
+    VectorizedRowBatch batch = makeStringBatchMixedCase();
+    StringLower expr = new StringLower(0, 1);
+    expr.evaluate(batch);
+    BytesColumnVector outCol = (BytesColumnVector) batch.cols[1];
+    int cmp = StringExpr.compare(mixedUpLower, 0, mixedUpLower.length, outCol.vector[0],
+        outCol.start[0], outCol.length[0]);
+    Assert.assertEquals(0, cmp);
+    Assert.assertTrue(outCol.isNull[2]);
+    int cmp2 = StringExpr.compare(green, 0, green.length, outCol.vector[1],
+        outCol.start[1], outCol.length[1]);
+    Assert.assertEquals(0, cmp2);
+
+    // no nulls, not repeating
+    batch = makeStringBatchMixedCase();
+    batch.cols[0].noNulls = true;
+    expr.evaluate(batch);
+    outCol = (BytesColumnVector) batch.cols[1];
+    cmp = StringExpr.compare(mixedUpLower, 0, mixedUpLower.length, outCol.vector[0],
+        outCol.start[0], outCol.length[0]);
+    Assert.assertEquals(0, cmp);
+    Assert.assertTrue(outCol.noNulls);
+
+    // has nulls, is repeating
+    batch = makeStringBatchMixedCase();
+    batch.cols[0].isRepeating = true;
+    expr.evaluate(batch);
+    outCol = (BytesColumnVector) batch.cols[1];
+    cmp = StringExpr.compare(mixedUpLower, 0, mixedUpLower.length, outCol.vector[0],
+        outCol.start[0], outCol.length[0]);
+    Assert.assertEquals(0, cmp);
+    Assert.assertTrue(outCol.isRepeating);
+    Assert.assertFalse(outCol.noNulls);
+
+    // no nulls, is repeating
+    batch = makeStringBatchMixedCase();
+    batch.cols[0].isRepeating = true;
+    batch.cols[0].noNulls = true;
+    expr.evaluate(batch);
+    outCol = (BytesColumnVector) batch.cols[1];
+    cmp = StringExpr.compare(mixedUpLower, 0, mixedUpLower.length, outCol.vector[0],
+        outCol.start[0], outCol.length[0]);
+    Assert.assertEquals(0, cmp);
+    Assert.assertTrue(outCol.isRepeating);
+    Assert.assertTrue(outCol.noNulls);
+  }
+
+  @Test
+  public void testColUpper() {
+    // no nulls, not repeating
+
+    // We don't test all the combinations because (at least currently)
+    // the logic is inherited to be the same as testColLower, which checks all the cases).
+    VectorizedRowBatch batch = makeStringBatchMixedCase();
+    StringUpper expr = new StringUpper(0, 1);
+    batch.cols[0].noNulls = true;
+    expr.evaluate(batch);
+    BytesColumnVector outCol = (BytesColumnVector) batch.cols[1];
+    int cmp = StringExpr.compare(mixedUpUpper, 0, mixedUpUpper.length, outCol.vector[0],
+        outCol.start[0], outCol.length[0]);
+    Assert.assertEquals(0, cmp);
+    Assert.assertTrue(outCol.noNulls);
+  }
+
+  @Test
+  public void testStringLength() {
+
+    // has nulls, not repeating
+    VectorizedRowBatch batch = makeStringBatchMixedCharSize();
+    StringLength expr = new StringLength(0, 1);
+    expr.evaluate(batch);
+    LongColumnVector outCol = (LongColumnVector) batch.cols[1];
+    Assert.assertEquals(7, outCol.vector[0]); // length of "mixedUp"
+    Assert.assertEquals(5, outCol.vector[1]); // length of green is 5
+    Assert.assertTrue(outCol.isNull[2]);
+    Assert.assertEquals(4, outCol.vector[3]); // this one has the mixed-size chars
+
+    // no nulls, not repeating
+    batch = makeStringBatchMixedCharSize();
+    batch.cols[0].noNulls = true;
+    expr.evaluate(batch);
+    outCol = (LongColumnVector) batch.cols[1];
+    Assert.assertTrue(outCol.noNulls);
+    Assert.assertEquals(7, outCol.vector[0]); // length of "mixedUp"
+    Assert.assertEquals(5, outCol.vector[1]); // length of green is 5
+    Assert.assertEquals(4, outCol.vector[3]); // this one has the mixed-size chars
+
+    // has nulls, is repeating
+    batch = makeStringBatchMixedCharSize();
+    batch.cols[0].isRepeating = true;
+    expr.evaluate(batch);
+    outCol = (LongColumnVector) batch.cols[1];
+    Assert.assertTrue(outCol.isRepeating);
+    Assert.assertFalse(outCol.noNulls);
+    Assert.assertEquals(7, outCol.vector[0]); // length of "mixedUp"
+
+    // no nulls, is repeating
+    batch = makeStringBatchMixedCharSize();
+    batch.cols[0].isRepeating = true;
+    batch.cols[0].noNulls = true;
+    expr.evaluate(batch);
+    outCol = (LongColumnVector) batch.cols[1];
+    Assert.assertEquals(7, outCol.vector[0]); // length of "mixedUp"
+    Assert.assertTrue(outCol.isRepeating);
+    Assert.assertTrue(outCol.noNulls);
+  }
+}