You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by ha...@apache.org on 2013/04/19 00:30:37 UTC

svn commit: r1469604 - in /hive/branches/vectorization/ql/src: java/org/apache/hadoop/hive/ql/exec/vector/ test/org/apache/hadoop/hive/ql/exec/vector/

Author: hashutosh
Date: Thu Apr 18 22:30:36 2013
New Revision: 1469604

URL: http://svn.apache.org/r1469604
Log:
HIVE-4284 : Implement class for vectorized row batch (Eric Hanson via Ashutosh Chauhan)

Added:
    hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/
    hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/BytesColumnVector.java
    hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/ColumnVector.java
    hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/DoubleColumnVector.java
    hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/LongColumnVector.java
    hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizedRowBatch.java
    hive/branches/vectorization/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/
    hive/branches/vectorization/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestVectorizedRowBatch.java

Added: hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/BytesColumnVector.java
URL: http://svn.apache.org/viewvc/hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/BytesColumnVector.java?rev=1469604&view=auto
==============================================================================
--- hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/BytesColumnVector.java (added)
+++ hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/BytesColumnVector.java Thu Apr 18 22:30:36 2013
@@ -0,0 +1,181 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.exec.vector;
+
+import org.apache.hadoop.io.Writable;
+
+/**
+ * Column vector for string and binary data stored by value reference -- i.e. each
+ * field is explicitly present, as opposed to provided by a dictionary reference.
+ * <p>
+ * Every entry is described by three parallel arrays: the byte array that holds it
+ * ({@code vector[i]}), the offset of its first byte ({@code start[i]}) and its
+ * size in bytes ({@code length[i]}). The entries may all live in one shared byte
+ * array, or each in a separate array.
+ * <p>
+ * Data can be assigned in two ways, which may be mixed within one column vector
+ * (though that use is probably not typical):
+ * <ul>
+ * <li>by reference, with {@code setRef()}; the caller is responsible for allocating
+ *     the byte arrays used to hold the data;</li>
+ * <li>by value, with {@code setVal()}, which copies the bytes into an internal buffer
+ *     that must have been set up by calling {@code initBuffer()} first.</li>
+ * </ul>
+ */
+public class BytesColumnVector extends ColumnVector {
+
+  /*
+   * Backing byte array of each field. If the value repeats for every entry, then it
+   * is stored in vector[0] and isRepeating from the superclass is set to true.
+   */
+  public byte[][] vector;
+  public int[] start;          // start offset of each field within its backing array
+  public int[] length;         // length in bytes of each field
+
+  private byte[] buffer;   // optional buffer to use when actually copying in data
+  private int nextFree;    // next free position in buffer
+
+  // Estimate that there will be 16 bytes per entry
+  static final int DEFAULT_BUFFER_SIZE = 16 * VectorizedRowBatch.DEFAULT_SIZE;
+
+  // Proportion of extra space to provide when allocating more buffer space.
+  static final float EXTRA_SPACE_FACTOR = (float) 1.2;
+
+  /*
+   * Upper limit for the copy buffer. Kept slightly below Integer.MAX_VALUE because
+   * some JVMs cannot allocate arrays of exactly that size.
+   */
+  private static final int MAX_BUFFER_SIZE = Integer.MAX_VALUE - 8;
+
+  /**
+   * Use this constructor for normal operation.
+   * All column vectors should be the default size normally.
+   */
+  public BytesColumnVector() {
+    this(VectorizedRowBatch.DEFAULT_SIZE);
+  }
+
+  /**
+   * Don't call this constructor except for testing purposes.
+   *
+   * @param size  number of elements in the column vector
+   */
+  public BytesColumnVector(int size) {
+    super(size);
+    vector = new byte[size][];
+    start = new int[size];
+    length = new int[size];
+  }
+
+  /**
+   * Set a field by reference. No bytes are copied; the entry simply points
+   * into {@code sourceBuf}.
+   *
+   * @param elementNum index within column vector to set
+   * @param sourceBuf container of source data
+   * @param start start byte position within source
+   * @param length  length of source byte sequence
+   */
+  public void setRef(int elementNum, byte[] sourceBuf, int start, int length) {
+    vector[elementNum] = sourceBuf;
+    this.start[elementNum] = start;
+    this.length[elementNum] = length;
+  }
+
+  /**
+   * You must call initBuffer first before using setVal().
+   * Provide the estimated number of bytes needed to hold
+   * a full column vector worth of byte string data.
+   * <p>
+   * Calling this again rewinds the write position to the start of the buffer;
+   * an already allocated buffer is kept and re-used rather than re-allocated.
+   *
+   * @param estimatedValueSize  Estimated size of buffer space needed
+   */
+  public void initBuffer(int estimatedValueSize) {
+    nextFree = 0;
+
+    // if buffer is already allocated, keep using it, don't re-allocate
+    if (buffer != null) {
+      return;
+    }
+
+    // allocate a little extra space to limit need to re-allocate
+    // NOTE(review): this int product can wrap around for very large estimates.
+    int bufferSize = this.vector.length * (int)(estimatedValueSize * EXTRA_SPACE_FACTOR);
+    if (bufferSize < DEFAULT_BUFFER_SIZE) {
+      bufferSize = DEFAULT_BUFFER_SIZE;
+    }
+    buffer = new byte[bufferSize];
+  }
+
+  /**
+   * Initialize buffer to default size.
+   */
+  public void initBuffer() {
+    initBuffer(0);
+  }
+
+  /**
+   * @return amount of buffer space currently allocated, or 0 if no buffer
+   *         has been allocated yet
+   */
+  public int bufferSize() {
+    if (buffer == null) {
+      return 0;
+    }
+    return buffer.length;
+  }
+
+  /**
+   * Set a field by actually copying in to a local buffer.
+   * If you must actually copy data in to the array, use this method.
+   * DO NOT USE this method unless it's not practical to set data by reference with setRef().
+   * Setting data by reference tends to run a lot faster than copying data in.
+   * <p>
+   * initBuffer() must have been called beforehand; otherwise there is no buffer
+   * to copy into and this method fails with a NullPointerException.
+   *
+   * @param elementNum index within column vector to set
+   * @param sourceBuf container of source data
+   * @param start start byte position within source
+   * @param length  length of source byte sequence
+   */
+  public void setVal(int elementNum, byte[] sourceBuf, int start, int length) {
+
+    // Compare as long so that the sum cannot wrap around and skip the growth step.
+    if (((long) nextFree + length) > buffer.length) {
+      increaseBufferSpace(length);
+    }
+    System.arraycopy(sourceBuf, start, buffer, nextFree, length);
+    vector[elementNum] = buffer;
+    this.start[elementNum] = nextFree;
+    this.length[elementNum] = length;
+    nextFree += length;
+  }
+
+  /**
+   * Increase buffer space enough to accommodate next element.
+   * This uses an exponential increase mechanism to rapidly
+   * increase buffer size to enough to hold all data.
+   * As batches get re-loaded, buffer space allocated will quickly
+   * stabilize.
+   * <p>
+   * Entries written before the growth keep referencing the previous buffer array,
+   * which still holds their bytes; only later entries point into the new one.
+   *
+   * @param nextElemLength size of next element to be added
+   * @throws IllegalStateException if the required space exceeds the largest
+   *         buffer this class will allocate
+   */
+  public void increaseBufferSpace(int nextElemLength) {
+
+    // All sizes are computed as long: doubling an int length of 2^30 or more would
+    // wrap to a non-positive value and the loop below would never terminate.
+    long required = (long) nextFree + nextElemLength;
+    if (required > MAX_BUFFER_SIZE) {
+      throw new IllegalStateException("Cannot grow buffer to " + required
+          + " bytes; the limit is " + MAX_BUFFER_SIZE);
+    }
+
+    // Keep doubling buffer size until there will be enough space for next element.
+    long newLength = 2L * buffer.length;
+    while (required > newLength) {
+      newLength *= 2;
+    }
+    if (newLength > MAX_BUFFER_SIZE) {
+      newLength = MAX_BUFFER_SIZE;
+    }
+
+    // Allocate new buffer, copy data to it, and set buffer to new buffer.
+    byte[] newBuffer = new byte[(int) newLength];
+    System.arraycopy(buffer, 0, newBuffer, 0, nextFree);
+    buffer = newBuffer;
+  }
+
+  /**
+   * Not implemented yet for byte data.
+   *
+   * @param index position of the entry within the column vector
+   * @throws UnsupportedOperationException always
+   */
+  @Override
+  public Writable getWritableObject(int index) {
+
+    // TODO finish this
+    throw new UnsupportedOperationException("unfinished");
+  }
+
+}

Added: hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/ColumnVector.java
URL: http://svn.apache.org/viewvc/hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/ColumnVector.java?rev=1469604&view=auto
==============================================================================
--- hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/ColumnVector.java (added)
+++ hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/ColumnVector.java Thu Apr 18 22:30:36 2013
@@ -0,0 +1,64 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.exec.vector;
+
+import org.apache.hadoop.io.Writable;
+
+/**
+ * Common base of all column vector types. It carries the state that every
+ * sub-type shares: the per-entry NULL flags, a flag telling whether the vector
+ * contains any NULL at all, and a flag telling whether the vector repeats,
+ * i.e. all of its values are the same so only the first one is set.
+ * Repetition lets query execution handle a whole vector in O(1) time
+ * when applicable.
+ *
+ * The fields are public by design since this is a performance-critical
+ * structure that is used in the inner loop of query execution.
+ */
+public abstract class ColumnVector {
+
+  /*
+   * One flag per entry. When noNulls is false, an element is true if the
+   * corresponding value is null, otherwise false. The array is always
+   * allocated, so a batch can be re-used later and nulls added.
+   */
+  public boolean[] isNull;
+
+  // True if the whole column vector has no nulls, otherwise false.
+  public boolean noNulls;
+
+  /*
+   * True if the same value repeats for the whole column vector.
+   * If so, vector[0] of the sub-type holds the repeating value.
+   */
+  public boolean isRepeating;
+
+  /**
+   * Constructor for super-class ColumnVector. This is not called directly,
+   * but used to initialize inherited fields: the null flags are allocated
+   * (all false), and the vector starts out with no nulls and not repeating.
+   *
+   * @param len Vector length
+   */
+  public ColumnVector(int len) {
+    this.isRepeating = false;
+    this.noNulls = true;
+    this.isNull = new boolean[len];
+  }
+
+  /**
+   * Return the value at the given position wrapped as a Writable.
+   *
+   * @param index position of the entry within the column vector
+   * @return the value as a Writable; what is returned for null entries is
+   *         decided by each sub-type
+   */
+  public abstract Writable getWritableObject(int index);
+}
+

Added: hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/DoubleColumnVector.java
URL: http://svn.apache.org/viewvc/hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/DoubleColumnVector.java?rev=1469604&view=auto
==============================================================================
--- hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/DoubleColumnVector.java (added)
+++ hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/DoubleColumnVector.java Thu Apr 18 22:30:36 2013
@@ -0,0 +1,65 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.ql.exec.vector;
+
+import org.apache.hadoop.hive.serde2.io.DoubleWritable;
+import org.apache.hadoop.io.Writable;
+
+/**
+ * This class represents a nullable double precision floating point column vector.
+ * This class will be used for operations on all floating point types (float, double)
+ * and as such will use a 64-bit double value to hold the biggest possible value.
+ * During copy-in/copy-out, smaller types (i.e. float) will be converted as needed. This will
+ * reduce the amount of code that needs to be generated and also will run fast since the
+ * machine operates with 64-bit words.
+ *
+ * The vector[] field is public by design for high-performance access in the inner
+ * loop of query execution.
+ */
+public class DoubleColumnVector extends ColumnVector {
+  public double[] vector;
+
+  // Single holder object handed out by getWritableObject(); re-used on every call.
+  private final DoubleWritable writableObj = new DoubleWritable();
+
+  /**
+   * Use this constructor by default. All column vectors
+   * should normally be the default size.
+   */
+  public DoubleColumnVector() {
+    this(VectorizedRowBatch.DEFAULT_SIZE);
+  }
+
+  /**
+   * Don't use this except for testing purposes.
+   *
+   * @param len number of elements in the column vector
+   */
+  public DoubleColumnVector(int len) {
+    super(len);
+    vector = new double[len];
+  }
+
+  /**
+   * Return the value at the given position wrapped as a Writable.
+   * <p>
+   * The same DoubleWritable instance is returned by every call, so its content
+   * is overwritten by the next call; copy the value if it must be retained.
+   *
+   * @param index position of the entry within the column vector
+   * @return the shared DoubleWritable set to the entry's value, or null
+   *         if the entry is null
+   */
+  @Override
+  public Writable getWritableObject(int index) {
+
+    // When the vector repeats, only entry 0 is set, so every index maps to it.
+    // NOTE(review): assumes isNull[0] describes the repeated entry -- confirm.
+    if (isRepeating) {
+      index = 0;
+    }
+    if (!noNulls && isNull[index]) {
+      return null;
+    }
+    writableObj.set(vector[index]);
+    return writableObj;
+  }
+}

Added: hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/LongColumnVector.java
URL: http://svn.apache.org/viewvc/hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/LongColumnVector.java?rev=1469604&view=auto
==============================================================================
--- hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/LongColumnVector.java (added)
+++ hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/LongColumnVector.java Thu Apr 18 22:30:36 2013
@@ -0,0 +1,65 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.ql.exec.vector;
+
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.Writable;
+
+/**
+ * This class represents a nullable integer column vector.
+ * This class will be used for operations on all integer types (tinyint, smallint, int, bigint)
+ * and as such will use a 64-bit long value to hold the biggest possible value.
+ * During copy-in/copy-out, smaller int types will be converted as needed. This will
+ * reduce the amount of code that needs to be generated and also will run fast since the
+ * machine operates with 64-bit words.
+ *
+ * The vector[] field is public by design for high-performance access in the inner
+ * loop of query execution.
+ */
+public class LongColumnVector extends ColumnVector {
+  public long[] vector;
+
+  // Single holder object handed out by getWritableObject(); re-used on every call.
+  private final LongWritable writableObj = new LongWritable();
+
+  /**
+   * Use this constructor by default. All column vectors
+   * should normally be the default size.
+   */
+  public LongColumnVector() {
+    this(VectorizedRowBatch.DEFAULT_SIZE);
+  }
+
+  /**
+   * Don't use this except for testing purposes.
+   *
+   * @param len number of elements in the column vector
+   */
+  public LongColumnVector(int len) {
+    super(len);
+    vector = new long[len];
+  }
+
+  /**
+   * Return the value at the given position wrapped as a Writable.
+   * <p>
+   * The same LongWritable instance is returned by every call, so its content
+   * is overwritten by the next call; copy the value if it must be retained.
+   *
+   * @param index position of the entry within the column vector
+   * @return the shared LongWritable set to the entry's value, or null
+   *         if the entry is null
+   */
+  @Override
+  public Writable getWritableObject(int index) {
+
+    // When the vector repeats, only entry 0 is set, so every index maps to it.
+    // NOTE(review): assumes isNull[0] describes the repeated entry -- confirm.
+    if (isRepeating) {
+      index = 0;
+    }
+    if (!noNulls && isNull[index]) {
+      return null;
+    }
+    writableObj.set(vector[index]);
+    return writableObj;
+  }
+}

Added: hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizedRowBatch.java
URL: http://svn.apache.org/viewvc/hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizedRowBatch.java?rev=1469604&view=auto
==============================================================================
--- hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizedRowBatch.java (added)
+++ hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizedRowBatch.java Thu Apr 18 22:30:36 2013
@@ -0,0 +1,165 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.ql.exec.vector;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+
+import org.apache.hadoop.io.Writable;
+
+/**
+ * A VectorizedRowBatch is a set of rows, organized with each column
+ * as a vector. It is the unit of query execution, organized to minimize
+ * the cost per row and achieve high instructions-per-cycle.
+ * The major fields are public by design to allow fast and convenient
+ * access by the vectorized query execution code.
+ */
+public class VectorizedRowBatch implements Writable {
+  public int numCols;           // number of columns
+  public ColumnVector[] cols;   // a vector for each column
+  public int size;              // number of rows that qualify (i.e. haven't been filtered out)
+  public int[] selected;        // array of positions of selected values
+
+  /*
+   * If no filtering has been applied yet, selectedInUse is false,
+   * meaning that all rows qualify. If it is true, then the selected[] array
+   * records the offsets of qualifying rows.
+   */
+  public boolean selectedInUse;
+
+  // If this is true, then there is no data in the batch -- we have hit the end of input.
+  public boolean endOfFile;
+
+  /*
+   * This number is carefully chosen to minimize overhead and typically allows
+   * one VectorizedRowBatch to fit in cache.
+   */
+  public static final int DEFAULT_SIZE = 1024;
+
+  // Row holder handed out by getNextRow(); one slot per column, re-used on every call.
+  private Writable[] writableRow;
+
+  // Position of the next qualifying row that getNextRow() will return.
+  private int rowIteratorIndex = 0;
+
+  /**
+   * Return a batch with the specified number of columns.
+   * This is the standard constructor -- all batches should be the same size
+   *
+   * @param numCols the number of columns to include in the batch
+   */
+  public VectorizedRowBatch(int numCols) {
+    this(numCols, DEFAULT_SIZE);
+  }
+
+  /**
+   * Return a batch with the specified number of columns and rows.
+   * Only call this constructor directly for testing purposes.
+   * Batch size should normally always be DEFAULT_SIZE.
+   * <p>
+   * The column slots are allocated but left empty; the caller has to
+   * assign a ColumnVector to each element of cols.
+   *
+   * @param numCols the number of columns to include in the batch
+   * @param size  the number of rows to include in the batch
+   */
+  public VectorizedRowBatch(int numCols, int size) {
+    this.numCols = numCols;
+    this.size = size;
+    selected = new int[size];
+    selectedInUse = false;
+    this.cols = new ColumnVector[numCols];
+    writableRow = new Writable[numCols];
+  }
+
+  /**
+   * Rewind the row iterator so that the next call to getNextRow()
+   * returns the first qualifying row of the batch.
+   */
+  public void initRowIterator(){
+    this.rowIteratorIndex = 0;
+  }
+
+  /**
+   * Return the next qualifying row and advance the row iterator.
+   * <p>
+   * The returned array is re-used by every call, so its content is
+   * overwritten by the next call.
+   *
+   * @return one Writable per column for the next qualifying row (an element
+   *         is whatever the column's getWritableObject() returns for that row),
+   *         or null once all qualifying rows have been returned
+   */
+  public Writable [] getNextRow() {
+    if (rowIteratorIndex >= size) {
+      return null;
+    }
+
+    // With a selection vector in use, the iterator walks selected[] and
+    // reads the row position from it; otherwise it is the row position itself.
+    int row = selectedInUse ? selected[rowIteratorIndex] : rowIteratorIndex;
+    for (int c = 0; c < numCols; c++) {
+      writableRow[c] = cols[c].getWritableObject(row);
+    }
+
+    // Move on, so that repeated calls walk the batch and eventually return null.
+    rowIteratorIndex++;
+    return writableRow;
+  }
+
+  /**
+   * Return count of qualifying rows.
+   *
+   * @return number of rows that have not been filtered out
+   */
+  public long count() {
+    return size;
+  }
+
+  /**
+   * Render the qualifying rows as text: one line per row, with the column values
+   * separated by the character '\u0001'. Null values are rendered as "null".
+   *
+   * @return the text form of the batch, or an empty string if no row qualifies
+   */
+  @Override
+  public String toString() {
+    if (size == 0) {
+      return "";
+    }
+    StringBuilder b = new StringBuilder();
+    for (int j = 0; j < size; j++) {
+      int row = selectedInUse ? selected[j] : j;
+      for (int colIndex = 0; colIndex < cols.length; colIndex++) {
+        if (colIndex > 0) {
+          b.append('\u0001');
+        }
+
+        // A column vector may return null for a null entry; don't dereference it.
+        Writable value = cols[colIndex].getWritableObject(row);
+        b.append(value == null ? "null" : value.toString());
+      }
+      if (j < size-1) {
+        b.append('\n');
+      }
+    }
+    return b.toString();
+  }
+
+  /**
+   * Not supported.
+   *
+   * @throws UnsupportedOperationException always
+   */
+  @Override
+  public void readFields(DataInput arg0) throws IOException {
+    throw new UnsupportedOperationException("Do you really need me?");
+  }
+
+  /**
+   * Not supported.
+   *
+   * @throws UnsupportedOperationException always
+   */
+  @Override
+  public void write(DataOutput arg0) throws IOException {
+    throw new UnsupportedOperationException("Don't call me");
+  }
+}
+

Added: hive/branches/vectorization/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestVectorizedRowBatch.java
URL: http://svn.apache.org/viewvc/hive/branches/vectorization/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestVectorizedRowBatch.java?rev=1469604&view=auto
==============================================================================
--- hive/branches/vectorization/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestVectorizedRowBatch.java (added)
+++ hive/branches/vectorization/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestVectorizedRowBatch.java Thu Apr 18 22:30:36 2013
@@ -0,0 +1,221 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.exec.vector;
+
+import java.util.Random;
+
+import junit.framework.Assert;
+import org.junit.Test;
+
+/**
+ * Test creation and basic manipulation of VectorizedRowBatch.
+ * The class also provides helpers that fill batches and column vectors
+ * with sample, random, repeating and null data.
+ */
+public class TestVectorizedRowBatch {
+
+  // test fields
+  static final String[] COLORS = {"red", "yellow", "green", "blue", "violet", "orange"};
+  private static byte[][] colorsBytes;   // byte form of COLORS, built lazily by initColors()
+
+  /*
+   * Build a default-size batch holding one long, one double and one bytes column.
+   * Only the bytes column is filled with sample data; random nulls are then
+   * sprinkled over all three columns.
+   */
+  private VectorizedRowBatch makeBatch() {
+    VectorizedRowBatch batch = new VectorizedRowBatch(3);
+    LongColumnVector lv = new LongColumnVector();
+    DoubleColumnVector dv = new DoubleColumnVector();
+    BytesColumnVector bv = new BytesColumnVector();
+    setSampleStringCol(bv);
+    batch.cols[0] = lv;
+    batch.cols[1] = dv;
+    batch.cols[2] = bv;
+    addRandomNulls(batch);
+    return batch;
+  }
+
+  /**
+   * Make sure you can create a batch and that all columns are the
+   * default size.
+   */
+  @Test
+  public void testVectorizedRowBatchCreate() {
+    VectorizedRowBatch batch = makeBatch();
+    Assert.assertEquals(3, batch.numCols);
+    Assert.assertEquals(VectorizedRowBatch.DEFAULT_SIZE, batch.size);
+
+    // assertEquals takes the expected value first, then the actual one.
+    Assert.assertEquals(VectorizedRowBatch.DEFAULT_SIZE,
+        ((LongColumnVector) batch.cols[0]).vector.length);
+    Assert.assertEquals(VectorizedRowBatch.DEFAULT_SIZE,
+        ((DoubleColumnVector) batch.cols[1]).vector.length);
+    Assert.assertEquals(VectorizedRowBatch.DEFAULT_SIZE,
+        ((BytesColumnVector) batch.cols[2]).vector.length);
+  }
+
+  /*
+   * Test routines to exercise VectorizedRowBatch
+   * by filling column vectors with data and null values.
+   */
+
+  /**
+   * Replace every column of the batch with a new default-size long column
+   * holding random data, and set the batch size to the default size.
+   *
+   * @param batch batch to fill
+   */
+  public static void setRandom(VectorizedRowBatch batch) {
+    batch.size = VectorizedRowBatch.DEFAULT_SIZE;
+    for (int i = 0; i != batch.numCols; i++) {
+      batch.cols[i] = new LongColumnVector(VectorizedRowBatch.DEFAULT_SIZE);
+      setRandomLongCol((LongColumnVector) batch.cols[i]);
+    }
+  }
+
+  /**
+   * Fill the existing columns of the batch, which must all be long columns,
+   * with sample data, and set the batch size to the default size.
+   *
+   * @param batch batch to fill
+   */
+  public static void setSample(VectorizedRowBatch batch) {
+    batch.size = VectorizedRowBatch.DEFAULT_SIZE;
+    for (int i = 0; i != batch.numCols; i++) {
+      setSampleLongCol((LongColumnVector) batch.cols[i]);
+    }
+  }
+
+  /**
+   * Set to sample data, re-using existing columns in batch.
+   * The columns must all be long columns.
+   *
+   * @param batch batch to fill
+   */
+  public static void setSampleOverwrite(VectorizedRowBatch batch) {
+
+    // Put sample data in the columns.
+    for (int i = 0; i != batch.numCols; i++) {
+      setSampleLongCol((LongColumnVector) batch.cols[i]);
+    }
+
+    // Reset the selection vector.
+    batch.selectedInUse = false;
+    batch.size = VectorizedRowBatch.DEFAULT_SIZE;
+  }
+
+  /**
+   * Sprinkle null values in this column vector.
+   * Each entry is marked null with a probability of 1 in 11.
+   *
+   * @param col column vector to update
+   */
+  public static void addRandomNulls(ColumnVector col) {
+    col.noNulls = false;
+    Random rand = new Random();
+    for(int i = 0; i != col.isNull.length; i++) {
+      col.isNull[i] = rand.nextInt(11) == 0;
+    }
+  }
+
+  /**
+   * Add null values, but do it faster, by avoiding use of Random().
+   * Every 11th entry, starting with entry 0, is marked null.
+   *
+   * @param col column vector to update
+   */
+  public void addSampleNulls(ColumnVector col) {
+    col.noNulls = false;
+    assert col.isNull != null;
+    for(int i = 0; i != col.isNull.length; i++) {
+      col.isNull[i] = i % 11 == 0;
+    }
+  }
+
+  /**
+   * Sprinkle random null values in every column of the batch.
+   *
+   * @param batch batch to update
+   */
+  public static void addRandomNulls(VectorizedRowBatch batch) {
+    for (int i = 0; i != batch.numCols; i++) {
+      addRandomNulls(batch.cols[i]);
+    }
+  }
+
+  /**
+   * Add sample null values to every column of the batch.
+   *
+   * @param batch batch to update
+   */
+  public void addSampleNulls(VectorizedRowBatch batch) {
+    for (int i = 0; i != batch.numCols; i++) {
+      addSampleNulls(batch.cols[i]);
+    }
+  }
+
+  /**
+   * Set vector elements to sample string data from colorsBytes string table.
+   * The entries are set by reference and cycle through the table.
+   *
+   * @param col column vector to fill
+   */
+  public static void setSampleStringCol(BytesColumnVector col) {
+    initColors();
+    int size = col.vector.length;
+    for(int i = 0; i != size; i++) {
+      int pos = i % colorsBytes.length;
+      col.setRef(i, colorsBytes[pos], 0, colorsBytes[pos].length);
+    }
+  }
+
+  /*
+   * Initialize string table in a lazy fashion.
+   */
+  private static void initColors() {
+    if (colorsBytes == null) {
+      colorsBytes = new byte[COLORS.length][];
+      for (int i = 0; i != COLORS.length; i++) {
+        colorsBytes[i] = COLORS[i].getBytes();
+      }
+    }
+  }
+
+
+  /**
+   * Set the vector to sample data that repeats an iteration from 0 to 99.
+   *
+   * @param col column vector to fill
+   */
+  public static void setSampleLongCol(LongColumnVector col) {
+    int size = col.vector.length;
+    for(int i = 0; i != size; i++) {
+      col.vector[i] = i % 100;
+    }
+  }
+
+  /**
+   * Set the vector to random data in the range 0 to 99.
+   * This has significant overhead for random number generation. Use setSample() to reduce overhead.
+   *
+   * @param col column vector to fill
+   */
+  public static void setRandomLongCol(LongColumnVector col) {
+    int size = col.vector.length;
+
+    // Don't seed with the clock: vectors filled within the same millisecond
+    // would all receive exactly the same "random" data.
+    Random rand = new Random();
+    for(int i = 0; i != size; i++) {
+      col.vector[i] = rand.nextInt(100);
+    }
+  }
+
+  /**
+   * Mark the vector as repeating, with 50 as the repeated value.
+   *
+   * @param col column vector to update
+   */
+  public static void setRepeatingLongCol(LongColumnVector col) {
+    col.isRepeating = true;
+    col.vector[0] = 50;
+  }
+
+  /**
+   * Set the vector to sample data that repeats an iteration from 0 to 99.
+   *
+   * @param col column vector to fill
+   */
+  public static void setSampleDoubleCol(DoubleColumnVector col) {
+    int size = col.vector.length;
+    for(int i = 0; i != size; i++) {
+      col.vector[i] = i % 100;
+    }
+  }
+
+  /**
+   * Set the vector to random data in the range 0 to 99.
+   * The values are whole numbers stored as doubles.
+   * This has significant overhead for random number generation. Use setSample() to reduce overhead.
+   *
+   * @param col column vector to fill
+   */
+  public static void setRandomDoubleCol(DoubleColumnVector col) {
+    int size = col.vector.length;
+    Random rand = new Random();
+    for(int i = 0; i != size; i++) {
+      col.vector[i] = rand.nextInt(100);
+    }
+  }
+
+  /**
+   * Mark the vector as repeating, with 50.0 as the repeated value.
+   *
+   * @param col column vector to update
+   */
+  public static void setRepeatingDoubleCol(DoubleColumnVector col) {
+    col.isRepeating = true;
+    col.vector[0] = 50.0;
+  }
+}