You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by ha...@apache.org on 2013/04/25 09:26:23 UTC
svn commit: r1475653 - in /hive/branches/vectorization/ql/src: java/org/apache/hadoop/hive/ql/exec/vector/expressions/ java/org/apache/hadoop/hive/ql/udf/ test/org/apache/hadoop/hive/ql/exec/vector/expressions/

Author: hashutosh
Date: Thu Apr 25 07:26:22 2013
New Revision: 1475653

URL: http://svn.apache.org/r1475653
Log:
HIVE-4160 : Implement vectorized string functions UPPER(), LOWER(), LENGTH() (Eric Hanson via Ashutosh Chauhan)

Added:
    hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringExpr.java
    hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringLength.java
    hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringLower.java
    hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringUnaryUDF.java
    hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringUpper.java
    hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/udf/IUDFUnaryString.java
    hive/branches/vectorization/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorStringExpressions.java
Modified:
    hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFLower.java
    hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFUpper.java

Added: hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringExpr.java
URL: http://svn.apache.org/viewvc/hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringExpr.java?rev=1475653&view=auto
==============================================================================
--- hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringExpr.java (added)
+++ hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringExpr.java Thu Apr 25 07:26:22 2013
@@ -0,0 +1,43 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.exec.vector.expressions;
+
+/** 
+ * Static helper functions for evaluating string expressions on raw byte ranges.
+ */
+public class StringExpr {
+  
+  /* Compare two strings held in byte arrays, each with its own start position
+   * and length (no bounds checking is done here).
+   * Use lexicographic unsigned byte value order.
+   * This is what's used for UTF-8 sort order.
+   * Return negative value if arg1 < arg2, 0 if arg1 = arg2, positive if
+   * arg1 > arg2; only the sign is meaningful. A proper prefix sorts first.
+   */
+  public static int compare(byte[] arg1, int start1, int len1, byte[] arg2, int start2, int len2) {
+    for (int i = 0; i < len1 && i < len2; i++) {
+      int b1 = arg1[i + start1] & 0xff; // mask so the byte is read as unsigned (0..255)
+      int b2 = arg2[i + start2] & 0xff;
+      if (b1 != b2) {
+        return b1 - b2;
+      }
+    }
+    return len1 - len2; // common prefix is equal, so the shorter string sorts first
+  }
+}

Added: hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringLength.java
URL: http://svn.apache.org/viewvc/hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringLength.java?rev=1475653&view=auto
==============================================================================
--- hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringLength.java (added)
+++ hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringLength.java Thu Apr 25 07:26:22 2013
@@ -0,0 +1,134 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.exec.vector.expressions;
+
+import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+
+public class StringLength extends VectorExpression { // vectorized LENGTH() of a string (bytes) column
+  private int colNum; // index of the input column in batch.cols
+  private int outputColumn; // index of the output column in batch.cols
+  
+  StringLength (int colNum, int outputColumn) {
+    this.colNum = colNum;
+    this.outputColumn = outputColumn;
+  }
+
+  // Store the character length of each UTF-8 string of the input column in the output column.
+  @Override
+  public void evaluate(VectorizedRowBatch batch) {
+    BytesColumnVector inputColVector = (BytesColumnVector) batch.cols[colNum];
+    LongColumnVector outV = (LongColumnVector) batch.cols[outputColumn];  
+    int[] sel = batch.selected;
+    int n = batch.size;
+    byte[][] vector = inputColVector.vector;
+    int start[] = inputColVector.start;
+    int length[] = inputColVector.length;
+    long[] resultLen = outV.vector; 
+    
+    if (n == 0) {
+      
+      // Nothing to do for an empty batch
+      return;
+    }
+    
+    if (inputColVector.noNulls) {
+      outV.noNulls = true;
+      if (inputColVector.isRepeating) {
+        outV.isRepeating = true; // only entry 0 is computed for a repeating input
+        resultLen[0] = UTF8StringLength(vector[0], start[0], length[0]);
+      } else if (batch.selectedInUse) {
+        for(int j = 0; j != n; j++) {
+          int i = sel[j]; // row index taken from the selection vector
+          resultLen[i] = UTF8StringLength(vector[i], start[i], length[i]);
+        }
+        outV.isRepeating = false;
+      } else {
+        for(int i = 0; i != n; i++) {
+          resultLen[i] = UTF8StringLength(vector[i], start[i], length[i]);
+        }
+        outV.isRepeating = false;
+      }
+    } else {
+      
+      /*
+       * Handle case with nulls: propagate the input null flags to the output. Don't call
+       * the function if the value is null, because calling it can be expensive.
+       */
+      outV.noNulls = false;
+      if (inputColVector.isRepeating) {
+        outV.isRepeating = true;
+        outV.isNull[0] = inputColVector.isNull[0];
+        if (!inputColVector.isNull[0]) {
+          resultLen[0] = UTF8StringLength(vector[0], start[0], length[0]);
+        }
+      } else if (batch.selectedInUse) {
+        for(int j = 0; j != n; j++) {
+          int i = sel[j]; // row index taken from the selection vector
+          if (!inputColVector.isNull[i]) {
+            resultLen[i] = UTF8StringLength(vector[i], start[i], length[i]);
+          }
+          outV.isNull[i] = inputColVector.isNull[i];
+        }
+        outV.isRepeating = false;
+      } else {
+        for(int i = 0; i != n; i++) {
+          if (!inputColVector.isNull[i]) {
+            resultLen[i] = UTF8StringLength(vector[i], start[i], length[i]);
+          }
+          outV.isNull[i] = inputColVector.isNull[i];
+        }
+        outV.isRepeating = false;
+      }
+    }
+  }
+  
+  /* Return the length in characters of the UTF-8 string stored in byte array s,
+   * beginning at offset start and len bytes long. Every byte that is not a
+   * continuation byte counts as one character; the encoding is not validated.
+   */
+  static long UTF8StringLength(byte[] s, int start, int len)
+  {
+    long resultLength = 0;
+    for (int i = start; i < start + len; i++) {
+      
+      /* Byte bit patterns of the form 10xxxxxx are continuation 
+       * bytes. All other bit patterns are the first byte of 
+       * a character.
+       */
+      if ((s[i] & 0xc0) != 0x80) {
+        resultLength++;  
+      }
+    }
+    return resultLength;
+  }
+
+  @Override
+  public int getOutputColumn() {
+    return outputColumn;
+  }
+
+  @Override
+  public String getOutputType() {
+    return "String"; // NOTE(review): evaluate() writes a LongColumnVector; "String" looks wrong - confirm
+  }
+
+
+}

Added: hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringLower.java
URL: http://svn.apache.org/viewvc/hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringLower.java?rev=1475653&view=auto
==============================================================================
--- hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringLower.java (added)
+++ hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringLower.java Thu Apr 25 07:26:22 2013
@@ -0,0 +1,27 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.ql.exec.vector.expressions;
+
+import org.apache.hadoop.hive.ql.udf.UDFLower;
+import org.apache.hadoop.hive.ql.udf.IUDFUnaryString;
+
+public class StringLower extends StringUnaryUDF { // vectorized LOWER(), delegating to UDFLower
+  StringLower(int colNum, int outputColumn) {
+    super(colNum, outputColumn, (IUDFUnaryString) new UDFLower()); // input column, output column, per-row function
+  }
+}

Added: hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringUnaryUDF.java
URL: http://svn.apache.org/viewvc/hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringUnaryUDF.java?rev=1475653&view=auto
==============================================================================
--- hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringUnaryUDF.java (added)
+++ hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringUnaryUDF.java Thu Apr 25 07:26:22 2013
@@ -0,0 +1,133 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.ql.exec.vector.expressions;
+
+import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.hive.ql.udf.IUDFUnaryString;
+
+public class StringUnaryUDF extends VectorExpression { // applies a unary string UDF to each row of a string column
+  
+  int colNum; // index of the input column in batch.cols
+  int outputColumn; // index of the output column in batch.cols
+  IUDFUnaryString func; // function applied to every non-null input value
+  Text s; // reusable holder for the current input value passed to func
+  
+  StringUnaryUDF (int colNum, int outputColumn, IUDFUnaryString func) {
+    this.colNum = colNum;
+    this.outputColumn = outputColumn;
+    this.func = func;
+    s = new Text();
+  }
+
+  @Override
+  public void evaluate(VectorizedRowBatch batch) {
+    BytesColumnVector inputColVector = (BytesColumnVector) batch.cols[colNum];
+    int[] sel = batch.selected;
+    int n = batch.size;
+    byte[][] vector = inputColVector.vector;
+    int start[] = inputColVector.start;
+    int length[] = inputColVector.length;
+    BytesColumnVector outV = (BytesColumnVector) batch.cols[outputColumn];
+    byte[][] outputVector = outV.vector;
+    Text t; // result returned by func for the current row
+    
+    if (n == 0) {
+      // Nothing to do for an empty batch
+      return;
+    }
+    
+    // Design Note: this simply calls the existing built-in function per row.
+    // Translating input to output directly, without creating new objects,
+    // could probably improve performance significantly in the future.
+    // NOTE(review): results are stored with setRef (by reference, presumably no
+    // copy) - confirm func.evaluate never reuses the returned Text's byte array.
+
+    if (inputColVector.noNulls) {
+      outV.noNulls = true;
+      if (inputColVector.isRepeating) {
+        outV.isRepeating = true; // only entry 0 is computed for a repeating input
+        s.set(vector[0], start[0], length[0]); 
+        t = func.evaluate(s);
+        outV.setRef(0, t.getBytes(), 0, t.getLength());
+      } else if (batch.selectedInUse) {
+        for(int j=0; j != n; j++) {
+          int i = sel[j]; // row index taken from the selection vector
+          s.set(vector[i], start[i], length[i]); 
+          t = func.evaluate(s);
+          outV.setRef(i, t.getBytes(), 0, t.getLength());
+        }
+        outV.isRepeating = false;
+      } else {
+        for(int i = 0; i != n; i++) {
+          s.set(vector[i], start[i], length[i]); 
+          t = func.evaluate(s);
+          outV.setRef(i, t.getBytes(), 0, t.getLength());
+        }
+        outV.isRepeating = false;
+      }
+    } else { 
+      // Handle case with nulls: propagate the input null flags to the output. Don't call
+      // the function if the value is null, because calling it can be expensive.
+      outV.noNulls = false;
+      if (inputColVector.isRepeating) {
+        outV.isRepeating = true;
+        outV.isNull[0] = inputColVector.isNull[0];
+        if (!inputColVector.isNull[0]) {
+          s.set(vector[0], start[0], length[0]); 
+          t = func.evaluate(s);
+          outV.setRef(0, t.getBytes(), 0, t.getLength());
+        }
+      } else if (batch.selectedInUse) {
+        for(int j=0; j != n; j++) {
+          int i = sel[j]; // row index taken from the selection vector
+          if (!inputColVector.isNull[i]) {
+            s.set(vector[i], start[i], length[i]); 
+            t = func.evaluate(s);
+            outV.setRef(i, t.getBytes(), 0, t.getLength());
+          }
+          outV.isNull[i] = inputColVector.isNull[i];
+        }
+        outV.isRepeating = false;
+      } else {
+        for(int i = 0; i != n; i++) {
+          if (!inputColVector.isNull[i]) {
+            s.set(vector[i], start[i], length[i]); 
+            t = func.evaluate(s);
+            outV.setRef(i, t.getBytes(), 0, t.getLength());
+          }
+          outV.isNull[i] = inputColVector.isNull[i];
+        }
+        outV.isRepeating = false;
+      }
+    }
+  }
+
+  @Override
+  public int getOutputColumn() {
+    return outputColumn;
+  }
+
+  @Override
+  public String getOutputType() {
+    return "String";
+  }
+
+
+}

Added: hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringUpper.java
URL: http://svn.apache.org/viewvc/hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringUpper.java?rev=1475653&view=auto
==============================================================================
--- hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringUpper.java (added)
+++ hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringUpper.java Thu Apr 25 07:26:22 2013
@@ -0,0 +1,28 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.exec.vector.expressions;
+
+import org.apache.hadoop.hive.ql.udf.UDFUpper;
+import org.apache.hadoop.hive.ql.udf.IUDFUnaryString;
+
+public class StringUpper extends StringUnaryUDF { // vectorized UPPER(), delegating to UDFUpper
+  StringUpper(int colNum, int outputColumn) {
+    super(colNum, outputColumn, (IUDFUnaryString) new UDFUpper()); // input column, output column, per-row function
+  }
+}

Added: hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/udf/IUDFUnaryString.java
URL: http://svn.apache.org/viewvc/hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/udf/IUDFUnaryString.java?rev=1475653&view=auto
==============================================================================
--- hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/udf/IUDFUnaryString.java (added)
+++ hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/udf/IUDFUnaryString.java Thu Apr 25 07:26:22 2013
@@ -0,0 +1,10 @@
+package org.apache.hadoop.hive.ql.udf;
+
+import org.apache.hadoop.io.Text;
+
+/**
+ * Interface implemented by standard string UDFs so they can be called from the vectorized execution code path.
+ */
+public interface IUDFUnaryString {
+  Text evaluate(Text s); // takes one input string value and returns the resulting string value
+}

Modified: hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFLower.java
URL: http://svn.apache.org/viewvc/hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFLower.java?rev=1475653&r1=1475652&r2=1475653&view=diff
==============================================================================
--- hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFLower.java (original)
+++ hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFLower.java Thu Apr 25 07:26:22 2013
@@ -30,7 +30,7 @@ import org.apache.hadoop.io.Text;
     value = "_FUNC_(str) - Returns str with all characters changed to lowercase",
     extended = "Example:\n"
     + "  > SELECT _FUNC_('Facebook') FROM src LIMIT 1;\n" + "  'facebook'")
-public class UDFLower extends UDF {
+public class UDFLower extends UDF implements IUDFUnaryString {
   private Text t = new Text();
 
   public UDFLower() {

Modified: hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFUpper.java
URL: http://svn.apache.org/viewvc/hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFUpper.java?rev=1475653&r1=1475652&r2=1475653&view=diff
==============================================================================
--- hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFUpper.java (original)
+++ hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFUpper.java Thu Apr 25 07:26:22 2013
@@ -30,7 +30,7 @@ import org.apache.hadoop.io.Text;
     value = "_FUNC_(str) - Returns str with all characters changed to uppercase",
     extended = "Example:\n"
     + "  > SELECT _FUNC_('Facebook') FROM src LIMIT 1;\n" + "  'FACEBOOK'")
-public class UDFUpper extends UDF {
+public class UDFUpper extends UDF implements IUDFUnaryString {
 
   Text t = new Text();
 

Added: hive/branches/vectorization/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorStringExpressions.java
URL: http://svn.apache.org/viewvc/hive/branches/vectorization/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorStringExpressions.java?rev=1475653&view=auto
==============================================================================
--- hive/branches/vectorization/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorStringExpressions.java (added)
+++ hive/branches/vectorization/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorStringExpressions.java Thu Apr 25 07:26:22 2013
@@ -0,0 +1,286 @@
+package org.apache.hadoop.hive.ql.exec.vector.expressions;
+
+import junit.framework.Assert;
+
+import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.junit.Test;
+
+import java.io.UnsupportedEncodingException;
+import java.util.Arrays;
+import org.apache.hadoop.io.Text;
+
+public class TestVectorStringExpressions {
+  
+  static byte[] red; 
+  static byte[] red2; // second copy of red, different object
+  static byte[] green;
+  static byte[] emptyString;
+  static byte[] mixedUp;
+  static byte[] mixedUpLower;
+  static byte[] mixedUpUpper;
+  static byte[] multiByte;
+  static byte[] mixPercentPattern;
+  
+  static {
+    try {
+      red = "red".getBytes("UTF-8");
+      green = "green".getBytes("UTF-8");
+      emptyString = "".getBytes("UTF-8");
+      mixedUp = "mixedUp".getBytes("UTF-8");
+      mixedUpLower = "mixedup".getBytes("UTF-8");
+      mixedUpUpper = "MIXEDUP".getBytes("UTF-8");
+      mixPercentPattern = "mix%".getBytes("UTF-8"); // for use as wildcard pattern to test LIKE
+      multiByte = new byte[100]; // oversized buffer; only the first 10 bytes are filled in
+      addMultiByteChars(multiByte);
+    } catch (UnsupportedEncodingException e) {
+      e.printStackTrace(); // not expected: every Java platform must support UTF-8
+    }
+    red2 = new byte[red.length];
+    System.arraycopy(red, 0, red2, 0, red.length);
+  }
+  
+  // Add some multi-byte characters to test the length routine later.
+  // total characters = 4; byte length = 10
+  static void addMultiByteChars(byte[] b) {
+    int i = 0;
+    b[i++] = (byte) 0x41; // letter "A" (1 byte)
+    b[i++] = (byte) 0xC3; // Latin capital A with grave (2 bytes)
+    b[i++] = (byte) 0x80; 
+    b[i++] = (byte) 0xE2; // Euro sign (3 bytes)
+    b[i++] = (byte) 0x82;
+    b[i++] = (byte) 0xAC;
+    b[i++] = (byte) 0xF0; // Asian character U+24B62 (4 bytes)
+    b[i++] = (byte) 0xA4;
+    b[i++] = (byte) 0xAD;
+    b[i++] = (byte) 0xA2;
+  }
+  
+  @Test
+  // Load a BytesColumnVector by copying in large data, enough to force 
+  // the buffer to expand.
+  public void testLoadBytesColumnVectorByValueLargeData()  {
+    BytesColumnVector bcv = new BytesColumnVector(VectorizedRowBatch.DEFAULT_SIZE);
+    bcv.initBuffer(10); // initialize with estimated element size 10
+    String s = "0123456789";
+    while (s.length() < 500) {
+      s += s; // doubles each pass: 10, 20, ... until the length reaches at least 500
+    }
+    byte[] b = null;
+    try {
+      b = s.getBytes("UTF-8");
+    } catch (UnsupportedEncodingException e) {
+      e.printStackTrace();
+    }
+    for (int i = 0; i != VectorizedRowBatch.DEFAULT_SIZE; i++) {
+      bcv.setVal(i, b, 0, b.length);
+    }
+    Assert.assertTrue(bcv.bufferSize() >= b.length * VectorizedRowBatch.DEFAULT_SIZE);
+  }
+  
+  @Test
+  // Set values by reference, copy the data out, and verify equality.
+  public void testLoadBytesColumnVectorByRef() {
+    BytesColumnVector bcv = new BytesColumnVector(VectorizedRowBatch.DEFAULT_SIZE);
+    String s = "red";
+    byte[] b = null;
+    try {
+      b = s.getBytes("UTF-8");
+    } catch (UnsupportedEncodingException e) {
+      e.printStackTrace();
+    }
+    for (int i = 0; i != VectorizedRowBatch.DEFAULT_SIZE; i++) {
+      bcv.setRef(i, b, 0, b.length);
+    }
+    // verify that every entry reads back as the original bytes
+    byte[] v = new byte[b.length];
+    for (int i = 0; i != VectorizedRowBatch.DEFAULT_SIZE; i++) {
+      Assert.assertTrue(bcv.length[i] == b.length);
+      System.arraycopy(bcv.vector[i], bcv.start[i], v, 0, b.length);
+      Assert.assertTrue(Arrays.equals(b, v));
+    }
+  }
+
+  VectorizedRowBatch makeStringBatch() {
+    // create a batch with one string ("Bytes") column
+    VectorizedRowBatch batch = new VectorizedRowBatch(1,VectorizedRowBatch.DEFAULT_SIZE);
+    BytesColumnVector v = new BytesColumnVector(VectorizedRowBatch.DEFAULT_SIZE);
+    batch.cols[0] = v;
+    /*
+     * Add these 3 values:
+     * 
+     * red
+     * green
+     * NULL
+     */
+    v.setRef(0, red, 0, red.length);
+    v.isNull[0] = false;
+    v.setRef(1, green, 0, green.length);
+    v.isNull[1] = false;
+    v.setRef(2,  emptyString,  0,  emptyString.length);
+    v.isNull[2] = true;
+    
+    v.noNulls = false;
+    
+    batch.size = 3;
+    return batch;
+  }
+  
+  VectorizedRowBatch makeStringBatchMixedCase() {
+    // create a batch with two string ("Bytes") columns: input (0) and output (1)
+    VectorizedRowBatch batch = new VectorizedRowBatch(2,VectorizedRowBatch.DEFAULT_SIZE);
+    BytesColumnVector v = new BytesColumnVector(VectorizedRowBatch.DEFAULT_SIZE);
+    batch.cols[0] = v;
+    BytesColumnVector outV = new BytesColumnVector(VectorizedRowBatch.DEFAULT_SIZE);
+    batch.cols[1] = outV;
+    /*
+     * Add these 3 values:
+     * 
+     * mixedUp
+     * green
+     * NULL
+     */
+    v.setRef(0, mixedUp, 0, mixedUp.length);
+    v.isNull[0] = false;
+    v.setRef(1, green, 0, green.length);
+    v.isNull[1] = false;
+    v.setRef(2,  emptyString,  0,  emptyString.length);
+    v.isNull[2] = true;
+    v.noNulls = false;
+    
+    batch.size = 3;
+    return batch;
+  }
+  
+  VectorizedRowBatch makeStringBatchMixedCharSize() {
+    // create a new batch with one string ("Bytes") column (for input) 
+    // and one long column (for output)
+    VectorizedRowBatch batch = new VectorizedRowBatch(2,VectorizedRowBatch.DEFAULT_SIZE);
+    BytesColumnVector v = new BytesColumnVector(VectorizedRowBatch.DEFAULT_SIZE);
+    batch.cols[0] = v;
+    LongColumnVector outV = new LongColumnVector(VectorizedRowBatch.DEFAULT_SIZE);
+    batch.cols[1] = outV;
+    
+    /*
+     * Add these 4 values:
+     * 
+     * mixedUp
+     * green
+     * NULL
+     * <4 char string with multi-byte chars>
+     */
+    v.setRef(0, mixedUp, 0, mixedUp.length);
+    v.isNull[0] = false;
+    v.setRef(1, green, 0, green.length);
+    v.isNull[1] = false;
+    v.setRef(2,  emptyString,  0,  emptyString.length);
+    v.isNull[2] = true;
+    v.noNulls = false;
+    v.setRef(3, multiByte, 0, 10); // only the 10 bytes written by addMultiByteChars
+    v.isNull[3] = false;
+    
+    batch.size = 4;
+    return batch;
+  }
+  
+  @Test
+  public void testColLower() {
+    // has nulls, not repeating
+    VectorizedRowBatch batch = makeStringBatchMixedCase();
+    StringLower expr = new StringLower(0,1);
+    expr.evaluate(batch);
+    BytesColumnVector outCol = (BytesColumnVector) batch.cols[1];
+    int cmp = StringExpr.compare(mixedUpLower, 0, mixedUpLower.length, outCol.vector[0], outCol.start[0], outCol.length[0]);
+    Assert.assertEquals(0,cmp);
+    Assert.assertTrue(outCol.isNull[2]);
+    int cmp2 = StringExpr.compare(green, 0, green.length, outCol.vector[1], outCol.start[1], outCol.length[1]);
+    Assert.assertEquals(0,cmp2);
+    
+    // no nulls, not repeating
+    batch = makeStringBatchMixedCase();
+    batch.cols[0].noNulls = true;
+    expr.evaluate(batch);
+    outCol = (BytesColumnVector) batch.cols[1];
+    cmp = StringExpr.compare(mixedUpLower, 0, mixedUpLower.length, outCol.vector[0], outCol.start[0], outCol.length[0]);
+    Assert.assertEquals(0,cmp);
+    Assert.assertTrue(outCol.noNulls);
+    
+    // has nulls, is repeating
+    batch = makeStringBatchMixedCase();
+    batch.cols[0].isRepeating = true;
+    expr.evaluate(batch);
+    outCol = (BytesColumnVector) batch.cols[1];
+    cmp = StringExpr.compare(mixedUpLower, 0, mixedUpLower.length, outCol.vector[0], outCol.start[0], outCol.length[0]);
+    Assert.assertEquals(0,cmp);
+    Assert.assertTrue(outCol.isRepeating);
+    Assert.assertFalse(outCol.noNulls);
+    
+    // no nulls, is repeating
+    batch = makeStringBatchMixedCase();
+    batch.cols[0].isRepeating = true;
+    batch.cols[0].noNulls = true;
+    expr.evaluate(batch);
+    outCol = (BytesColumnVector) batch.cols[1];
+    cmp = StringExpr.compare(mixedUpLower, 0, mixedUpLower.length, outCol.vector[0], outCol.start[0], outCol.length[0]);
+    Assert.assertEquals(0,cmp);
+    Assert.assertTrue(outCol.isRepeating);
+    Assert.assertTrue(outCol.noNulls);   
+  }
+  
+  @Test
+  public void testColUpper() {
+    // no nulls, not repeating
+    
+    // We don't test all the combinations because (at least currently)
+    // the logic is inherited, so it is the same as for testColLower, which checks all the cases.
+    VectorizedRowBatch batch = makeStringBatchMixedCase();
+    StringUpper expr = new StringUpper(0,1);
+    batch.cols[0].noNulls = true;
+    expr.evaluate(batch);
+    BytesColumnVector outCol = (BytesColumnVector) batch.cols[1];
+    int cmp = StringExpr.compare(mixedUpUpper, 0, mixedUpUpper.length, outCol.vector[0], outCol.start[0], outCol.length[0]);
+    Assert.assertEquals(0,cmp);
+    Assert.assertTrue(outCol.noNulls);
+  }
+  
+  @Test
+  public void testStringLength() {
+    
+    // has nulls, not repeating
+    VectorizedRowBatch batch = makeStringBatchMixedCharSize();
+    StringLength expr = new StringLength(0,1);
+    expr.evaluate(batch);
+    LongColumnVector outCol = (LongColumnVector) batch.cols[1];
+    Assert.assertEquals(5,outCol.vector[1]); // length of green is 5
+    Assert.assertTrue(outCol.isNull[2]);
+    Assert.assertEquals(4,outCol.vector[3]); // this one has the mixed-size chars
+
+    // no nulls, not repeating
+    batch = makeStringBatchMixedCharSize();
+    batch.cols[0].noNulls = true;
+    expr.evaluate(batch);
+    outCol = (LongColumnVector) batch.cols[1];
+    Assert.assertTrue(outCol.noNulls);
+    Assert.assertEquals(4,outCol.vector[3]); // this one has the mixed-size chars
+    
+    // has nulls, is repeating
+    batch = makeStringBatchMixedCharSize();
+    batch.cols[0].isRepeating = true;
+    expr.evaluate(batch);
+    outCol = (LongColumnVector) batch.cols[1];
+    Assert.assertTrue(outCol.isRepeating);
+    Assert.assertFalse(outCol.noNulls);
+    Assert.assertEquals(7, outCol.vector[0]); // length of "mixedUp"
+    
+    // no nulls, is repeating
+    batch = makeStringBatchMixedCharSize();
+    batch.cols[0].isRepeating = true;
+    batch.cols[0].noNulls = true;
+    expr.evaluate(batch);
+    outCol = (LongColumnVector) batch.cols[1];
+    Assert.assertEquals(7, outCol.vector[0]); // length of "mixedUp"
+    Assert.assertTrue(outCol.isRepeating);
+    Assert.assertTrue(outCol.noNulls);   
+  }
+}