You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by ha...@apache.org on 2013/04/19 00:30:37 UTC
svn commit: r1469604 - in /hive/branches/vectorization/ql/src:
java/org/apache/hadoop/hive/ql/exec/vector/
test/org/apache/hadoop/hive/ql/exec/vector/
Author: hashutosh
Date: Thu Apr 18 22:30:36 2013
New Revision: 1469604
URL: http://svn.apache.org/r1469604
Log:
HIVE-4284 : Implement class for vectorized row batch (Eric Hanson via Ashutosh Chauhan)
Added:
hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/
hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/BytesColumnVector.java
hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/ColumnVector.java
hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/DoubleColumnVector.java
hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/LongColumnVector.java
hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizedRowBatch.java
hive/branches/vectorization/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/
hive/branches/vectorization/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestVectorizedRowBatch.java
Added: hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/BytesColumnVector.java
URL: http://svn.apache.org/viewvc/hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/BytesColumnVector.java?rev=1469604&view=auto
==============================================================================
--- hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/BytesColumnVector.java (added)
+++ hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/BytesColumnVector.java Thu Apr 18 22:30:36 2013
@@ -0,0 +1,181 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.exec.vector;
+
+import org.apache.hadoop.io.Writable;
+
+/**
+ * This class supports string and binary data by value reference -- i.e. each field is
+ * explicitly present, as opposed to provided by a dictionary reference.
+ * In some cases, all the values will be in the same byte array to begin with,
+ * but this need not be the case. If each value is in a separate byte
+ * array to start with, or not all of the values are in the same original
+ * byte array, you can still assign data by reference into this column vector.
+ * This gives flexibility to use this in multiple situations.
+ * <p>
+ * When setting data by reference, the caller
+ * is responsible for allocating the byte arrays used to hold the data.
+ * You can also set data by value, as long as you call the initBuffer() method first.
+ * You can mix "by value" and "by reference" in the same column vector,
+ * though that use is probably not typical.
+ */
+public class BytesColumnVector extends ColumnVector {
+
+  /** Backing byte array of each field; several entries may point at the same array. */
+  public byte[][] vector;
+
+  /** Start offset of each field within its backing array. */
+  public int[] start;
+
+  /*
+   * The length of each field. If the value repeats for every entry, then it is stored
+   * in vector[0] and isRepeating from the superclass is set to true.
+   */
+  public int[] length;
+
+  // Optional buffer to use when actually copying in data; null until initBuffer() is called.
+  private byte[] buffer;
+
+  // Next free position in buffer.
+  private int nextFree;
+
+  // Estimate that there will be 16 bytes per entry
+  static final int DEFAULT_BUFFER_SIZE = 16 * VectorizedRowBatch.DEFAULT_SIZE;
+
+  // Proportion of extra space to provide when allocating more buffer space.
+  static final float EXTRA_SPACE_FACTOR = (float) 1.2;
+
+  // Upper bound on the buffer size this class will request. Kept slightly below
+  // Integer.MAX_VALUE because some VMs are reported to reject arrays of that exact size.
+  private static final int MAX_BUFFER_SIZE = Integer.MAX_VALUE - 8;
+
+  /**
+   * Use this constructor for normal operation.
+   * All column vectors should be the default size normally.
+   */
+  public BytesColumnVector() {
+    this(VectorizedRowBatch.DEFAULT_SIZE);
+  }
+
+  /**
+   * Don't call this constructor except for testing purposes.
+   *
+   * @param size number of elements in the column vector
+   */
+  public BytesColumnVector(int size) {
+    super(size);
+    vector = new byte[size][];
+    start = new int[size];
+    length = new int[size];
+  }
+
+  /**
+   * Set a field by reference. No bytes are copied: the entry simply records
+   * the caller's array together with the offset and length of the value in it.
+   *
+   * @param elementNum index within column vector to set
+   * @param sourceBuf container of source data
+   * @param start start byte position within source
+   * @param length length of source byte sequence
+   */
+  public void setRef(int elementNum, byte[] sourceBuf, int start, int length) {
+    vector[elementNum] = sourceBuf;
+    this.start[elementNum] = start;
+    this.length[elementNum] = length;
+  }
+
+  /**
+   * You must call initBuffer first before using setVal().
+   * Provide the estimated number of bytes needed to hold
+   * a full column vector worth of byte string data.
+   * <p>
+   * Every call rewinds the write position to the start of the buffer, so values
+   * previously copied in with setVal() will be overwritten by later setVal() calls.
+   *
+   * @param estimatedValueSize Estimated size of buffer space needed
+   */
+  public void initBuffer(int estimatedValueSize) {
+    nextFree = 0;
+
+    // if buffer is already allocated, keep using it, don't re-allocate
+    if (buffer != null) {
+      return;
+    }
+
+    // allocate a little extra space to limit need to re-allocate
+    int bufferSize = this.vector.length * (int)(estimatedValueSize * EXTRA_SPACE_FACTOR);
+    if (bufferSize < DEFAULT_BUFFER_SIZE) {
+      bufferSize = DEFAULT_BUFFER_SIZE;
+    }
+    buffer = new byte[bufferSize];
+  }
+
+  /**
+   * Initialize buffer to default size.
+   */
+  public void initBuffer() {
+    initBuffer(0);
+  }
+
+  /**
+   * @return amount of buffer space currently allocated, or 0 if no buffer exists yet
+   */
+  public int bufferSize() {
+    if (buffer == null) {
+      return 0;
+    }
+    return buffer.length;
+  }
+
+  /**
+   * Set a field by actually copying in to a local buffer.
+   * If you must actually copy data in to the array, use this method.
+   * DO NOT USE this method unless it's not practical to set data by reference with setRef().
+   * Setting data by reference tends to run a lot faster than copying data in.
+   *
+   * @param elementNum index within column vector to set
+   * @param sourceBuf container of source data
+   * @param start start byte position within source
+   * @param length length of source byte sequence
+   * @throws IllegalStateException if initBuffer() has not been called, or if the
+   *         buffer would have to grow beyond the maximum supported size
+   */
+  public void setVal(int elementNum, byte[] sourceBuf, int start, int length) {
+    requireBuffer();
+
+    // Compare as long so that nextFree + length cannot wrap around to a negative int.
+    if ((long) nextFree + length > buffer.length) {
+      increaseBufferSpace(length);
+    }
+    System.arraycopy(sourceBuf, start, buffer, nextFree, length);
+    vector[elementNum] = buffer;
+    this.start[elementNum] = nextFree;
+    this.length[elementNum] = length;
+    nextFree += length;
+  }
+
+  /**
+   * Increase buffer space enough to accommodate next element.
+   * This uses an exponential increase mechanism to rapidly
+   * increase buffer size to enough to hold all data.
+   * As batches get re-loaded, buffer space allocated will quickly
+   * stabilize.
+   * <p>
+   * Entries set earlier keep pointing at the previous buffer array, which
+   * still holds their bytes, so they remain readable after the buffer grows.
+   *
+   * @param nextElemLength size of next element to be added
+   * @throws IllegalStateException if initBuffer() has not been called, or if the
+   *         space required exceeds the maximum supported buffer size
+   */
+  public void increaseBufferSpace(int nextElemLength) {
+    requireBuffer();
+
+    // All size arithmetic is done in long: doubling an int length of 1 GiB or more
+    // would overflow, and the doubling loop below would then never terminate.
+    long required = (long) nextFree + nextElemLength;
+    if (required > MAX_BUFFER_SIZE) {
+      throw new IllegalStateException("Cannot grow byte buffer to " + required
+          + " bytes; maximum supported size is " + MAX_BUFFER_SIZE);
+    }
+
+    // Keep doubling buffer size until there will be enough space for next element.
+    long newLength = 2L * buffer.length;
+    while (required > newLength) {
+      newLength *= 2;
+    }
+    if (newLength > MAX_BUFFER_SIZE) {
+      newLength = MAX_BUFFER_SIZE;
+    }
+
+    // Allocate new buffer, copy data to it, and set buffer to new buffer.
+    byte[] newBuffer = new byte[(int) newLength];
+    System.arraycopy(buffer, 0, newBuffer, 0, nextFree);
+    buffer = newBuffer;
+  }
+
+  // Fail with a descriptive message instead of a bare NullPointerException
+  // when the copy-in buffer was never allocated.
+  private void requireBuffer() {
+    if (buffer == null) {
+      throw new IllegalStateException(
+          "initBuffer() must be called before copying data into a BytesColumnVector");
+    }
+  }
+
+  /**
+   * Not implemented yet.
+   *
+   * @throws UnsupportedOperationException always
+   */
+  @Override
+  public Writable getWritableObject(int index) {
+
+    // TODO finish this
+    throw new UnsupportedOperationException("unfinished");
+  }
+
+}
Added: hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/ColumnVector.java
URL: http://svn.apache.org/viewvc/hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/ColumnVector.java?rev=1469604&view=auto
==============================================================================
--- hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/ColumnVector.java (added)
+++ hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/ColumnVector.java Thu Apr 18 22:30:36 2013
@@ -0,0 +1,64 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.exec.vector;
+
+import org.apache.hadoop.io.Writable;
+
+/**
+ * ColumnVector contains the shared structure for the sub-types,
+ * including NULL information, and whether this vector
+ * repeats, i.e. has all values the same, so only the first
+ * one is set. This is used to accelerate query performance
+ * by handling a whole vector in O(1) time when applicable.
+ *
+ * The fields are public by design since this is a performance-critical
+ * structure that is used in the inner loop of query execution.
+ */
+public abstract class ColumnVector {
+
+  /*
+   * Per-entry null flags. When noNulls is false, isNull[i] is true if entry i
+   * is null and false otherwise. The array is always allocated, so a batch can
+   * be re-used later and nulls added.
+   */
+  public boolean[] isNull;
+
+  // True while the whole column vector contains no nulls; starts out true.
+  public boolean noNulls = true;
+
+  /*
+   * True if the same value repeats for the whole column vector.
+   * If so, vector[0] holds the repeating value. Starts out false.
+   */
+  public boolean isRepeating = false;
+
+  /**
+   * Constructor for super-class ColumnVector. This is not called directly,
+   * but used to initialize inherited fields: the null-flag array is allocated
+   * with one slot per entry, and the vector starts out with no nulls and not repeating.
+   *
+   * @param len Vector length
+   */
+  public ColumnVector(int len) {
+    this.isNull = new boolean[len];
+  }
+
+  /**
+   * Return the value at the given position as a Writable.
+   *
+   * @param index position within the column vector
+   * @return the value at that position, as defined by the concrete sub-type
+   */
+  public abstract Writable getWritableObject(int index);
+}
+
Added: hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/DoubleColumnVector.java
URL: http://svn.apache.org/viewvc/hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/DoubleColumnVector.java?rev=1469604&view=auto
==============================================================================
--- hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/DoubleColumnVector.java (added)
+++ hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/DoubleColumnVector.java Thu Apr 18 22:30:36 2013
@@ -0,0 +1,65 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.ql.exec.vector;
+
+import org.apache.hadoop.hive.serde2.io.DoubleWritable;
+import org.apache.hadoop.io.Writable;
+
+/**
+ * This class represents a nullable double precision floating point column vector.
+ * This class will be used for operations on all floating point types (float, double)
+ * and as such will use a 64-bit double value to hold the biggest possible value.
+ * During copy-in/copy-out, smaller types (i.e. float) will be converted as needed. This will
+ * reduce the amount of code that needs to be generated and also will run fast since the
+ * machine operates with 64-bit words.
+ *
+ * The vector[] field is public by design for high-performance access in the inner
+ * loop of query execution.
+ */
+public class DoubleColumnVector extends ColumnVector {
+
+  /** The values of this column, one slot per row. */
+  public double[] vector;
+
+  // Single Writable instance that is re-filled and handed out by getWritableObject().
+  private final DoubleWritable writableObj = new DoubleWritable();
+
+  /**
+   * Use this constructor by default. All column vectors
+   * should normally be the default size.
+   */
+  public DoubleColumnVector() {
+    this(VectorizedRowBatch.DEFAULT_SIZE);
+  }
+
+  /**
+   * Don't use this except for testing purposes.
+   *
+   * @param len number of elements in the column vector
+   */
+  public DoubleColumnVector(int len) {
+    super(len);
+    vector = new double[len];
+  }
+
+  /**
+   * Return the value at the given position wrapped in a Writable.
+   * <p>
+   * The same Writable instance is returned on every call and overwritten each
+   * time, so callers must consume or copy the value before calling again.
+   *
+   * @param index position within the column vector
+   * @return the shared Writable holding the value, or null if the entry is null
+   */
+  @Override
+  public Writable getWritableObject(int index) {
+    // When the vector is repeating only entry 0 is guaranteed to be set, so the
+    // value (and its null flag) must be read from position 0 whatever index is asked for.
+    final int pos = isRepeating ? 0 : index;
+    if (!noNulls && isNull[pos]) {
+      return null;
+    }
+    writableObj.set(vector[pos]);
+    return writableObj;
+  }
+}
Added: hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/LongColumnVector.java
URL: http://svn.apache.org/viewvc/hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/LongColumnVector.java?rev=1469604&view=auto
==============================================================================
--- hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/LongColumnVector.java (added)
+++ hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/LongColumnVector.java Thu Apr 18 22:30:36 2013
@@ -0,0 +1,65 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.ql.exec.vector;
+
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.Writable;
+
+/**
+ * This class represents a nullable int column vector.
+ * This class will be used for operations on all integer types (tinyint, smallint, int, bigint)
+ * and as such will use a 64-bit long value to hold the biggest possible value.
+ * During copy-in/copy-out, smaller int types will be converted as needed. This will
+ * reduce the amount of code that needs to be generated and also will run fast since the
+ * machine operates with 64-bit words.
+ *
+ * The vector[] field is public by design for high-performance access in the inner
+ * loop of query execution.
+ */
+public class LongColumnVector extends ColumnVector {
+
+  /** The values of this column, one slot per row. */
+  public long[] vector;
+
+  // Single Writable instance that is re-filled and handed out by getWritableObject().
+  private final LongWritable writableObj = new LongWritable();
+
+  /**
+   * Use this constructor by default. All column vectors
+   * should normally be the default size.
+   */
+  public LongColumnVector() {
+    this(VectorizedRowBatch.DEFAULT_SIZE);
+  }
+
+  /**
+   * Don't use this except for testing purposes.
+   *
+   * @param len number of elements in the column vector
+   */
+  public LongColumnVector(int len) {
+    super(len);
+    vector = new long[len];
+  }
+
+  /**
+   * Return the value at the given position wrapped in a Writable.
+   * <p>
+   * The same Writable instance is returned on every call and overwritten each
+   * time, so callers must consume or copy the value before calling again.
+   *
+   * @param index position within the column vector
+   * @return the shared Writable holding the value, or null if the entry is null
+   */
+  @Override
+  public Writable getWritableObject(int index) {
+    // When the vector is repeating only entry 0 is guaranteed to be set, so the
+    // value (and its null flag) must be read from position 0 whatever index is asked for.
+    final int pos = isRepeating ? 0 : index;
+    if (!noNulls && isNull[pos]) {
+      return null;
+    }
+    writableObj.set(vector[pos]);
+    return writableObj;
+  }
+}
Added: hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizedRowBatch.java
URL: http://svn.apache.org/viewvc/hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizedRowBatch.java?rev=1469604&view=auto
==============================================================================
--- hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizedRowBatch.java (added)
+++ hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/VectorizedRowBatch.java Thu Apr 18 22:30:36 2013
@@ -0,0 +1,165 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.ql.exec.vector;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+
+import org.apache.hadoop.io.Writable;
+
+/**
+ * A VectorizedRowBatch is a set of rows, organized with each column
+ * as a vector. It is the unit of query execution, organized to minimize
+ * the cost per row and achieve low cycles-per-instruction.
+ * The major fields are public by design to allow fast and convenient
+ * access by the vectorized query execution code.
+ */
+public class VectorizedRowBatch implements Writable {
+  public int numCols;           // number of columns
+  public ColumnVector[] cols;   // a vector for each column
+  public int size;              // number of rows that qualify (i.e. haven't been filtered out)
+  public int[] selected;        // array of positions of selected values
+
+  /*
+   * If no filtering has been applied yet, selectedInUse is false,
+   * meaning that all rows qualify. If it is true, then the selected[] array
+   * records the offsets of qualifying rows.
+   */
+  public boolean selectedInUse;
+
+  // If this is true, then there is no data in the batch -- we have hit the end of input.
+  public boolean endOfFile;
+
+  /*
+   * This number is carefully chosen to minimize overhead and typically allows
+   * one VectorizedRowBatch to fit in cache.
+   */
+  public static final int DEFAULT_SIZE = 1024;
+
+  // Scratch row handed out by getNextRow(); the same array is re-filled on every call.
+  private final Writable[] writableRow;
+
+  // Position of the next qualifying row that getNextRow() will return.
+  private int rowIteratorIndex = 0;
+
+  /**
+   * Return a batch with the specified number of columns.
+   * This is the standard constructor -- all batches should be the same size
+   *
+   * @param numCols the number of columns to include in the batch
+   */
+  public VectorizedRowBatch(int numCols) {
+    this(numCols, DEFAULT_SIZE);
+  }
+
+  /**
+   * Return a batch with the specified number of columns and rows.
+   * Only call this constructor directly for testing purposes.
+   * Batch size should normally always be defaultSize.
+   * <p>
+   * The column array is allocated but its entries are left null; the caller
+   * is expected to assign a ColumnVector to each slot of cols.
+   *
+   * @param numCols the number of columns to include in the batch
+   * @param size the number of rows to include in the batch
+   */
+  public VectorizedRowBatch(int numCols, int size) {
+    this.numCols = numCols;
+    this.size = size;
+    selected = new int[size];
+    selectedInUse = false;
+    this.cols = new ColumnVector[numCols];
+    writableRow = new Writable[numCols];
+  }
+
+  /**
+   * Rewind the row iterator so that the next getNextRow() call returns the
+   * first qualifying row of the batch.
+   */
+  public void initRowIterator() {
+    this.rowIteratorIndex = 0;
+  }
+
+  /**
+   * Return the next qualifying row and advance the row iterator.
+   * <p>
+   * The returned array is owned by this batch and is overwritten by the next
+   * call, so callers must consume or copy the row before calling again.
+   *
+   * @return one Writable per column for the next qualifying row, or null once
+   *         all qualifying rows have been returned
+   */
+  public Writable [] getNextRow() {
+    if (rowIteratorIndex >= size) {
+      return null;
+    }
+    final int row = rowIndexAt(rowIteratorIndex);
+    for (int c = 0; c < numCols; c++) {
+      writableRow[c] = cols[c].getWritableObject(row);
+    }
+
+    // Advance the iterator; without this every call would return the same row
+    // and the end-of-batch null would never be reached.
+    rowIteratorIndex++;
+    return writableRow;
+  }
+
+  /**
+   * Return count of qualifying rows.
+   *
+   * @return number of rows that have not been filtered out
+   */
+  public long count() {
+    return size;
+  }
+
+  /**
+   * Render the qualifying rows as text: one row per line, with column values
+   * separated by the \u0001 control character. A null column value is
+   * rendered as the string "null". An empty batch yields the empty string.
+   */
+  @Override
+  public String toString() {
+    if (size == 0) {
+      return "";
+    }
+    StringBuilder b = new StringBuilder();
+    for (int j = 0; j < size; j++) {
+      final int row = rowIndexAt(j);
+      for (int c = 0; c < cols.length; c++) {
+        if (c > 0) {
+          b.append('\u0001');
+        }
+
+        // append(Object) prints "null" for a null reference, so null column
+        // values no longer raise a NullPointerException here.
+        b.append(cols[c].getWritableObject(row));
+      }
+      if (j < size - 1) {
+        b.append('\n');
+      }
+    }
+    return b.toString();
+  }
+
+  // Map the position of a qualifying row to its physical offset in the column vectors.
+  private int rowIndexAt(int position) {
+    return selectedInUse ? selected[position] : position;
+  }
+
+  /**
+   * Not supported.
+   *
+   * @throws UnsupportedOperationException always
+   */
+  @Override
+  public void readFields(DataInput arg0) throws IOException {
+    throw new UnsupportedOperationException("Do you really need me?");
+  }
+
+  /**
+   * Not supported.
+   *
+   * @throws UnsupportedOperationException always
+   */
+  @Override
+  public void write(DataOutput arg0) throws IOException {
+    throw new UnsupportedOperationException("Don't call me");
+  }
+}
+
Added: hive/branches/vectorization/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestVectorizedRowBatch.java
URL: http://svn.apache.org/viewvc/hive/branches/vectorization/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestVectorizedRowBatch.java?rev=1469604&view=auto
==============================================================================
--- hive/branches/vectorization/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestVectorizedRowBatch.java (added)
+++ hive/branches/vectorization/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/TestVectorizedRowBatch.java Thu Apr 18 22:30:36 2013
@@ -0,0 +1,221 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.exec.vector;
+
+import java.util.Random;
+
+import junit.framework.Assert;
+import org.junit.Test;
+
+/**
+ * Test creation and basic manipulation of VectorizedRowBatch.
+ */
+public class TestVectorizedRowBatch {
+
+  // test fields
+  static final String[] COLORS = {"red", "yellow", "green", "blue", "violet", "orange"};
+
+  // Byte form of COLORS, built on first use by initColors().
+  private static byte[][] colorsBytes;
+
+  /**
+   * Build a default-size batch with one long, one double and one bytes column.
+   * The bytes column is filled with the sample color strings; the numeric
+   * columns keep their zero-initialized values. Random nulls are then
+   * sprinkled over every column.
+   */
+  private VectorizedRowBatch makeBatch() {
+    VectorizedRowBatch batch = new VectorizedRowBatch(3);
+    LongColumnVector lv = new LongColumnVector();
+    DoubleColumnVector dv = new DoubleColumnVector();
+    BytesColumnVector bv = new BytesColumnVector();
+    setSampleStringCol(bv);
+    batch.cols[0] = lv;
+    batch.cols[1] = dv;
+    batch.cols[2] = bv;
+    addRandomNulls(batch);
+    return batch;
+  }
+
+  /**
+   * Make sure you can create a batch and that all columns are the
+   * default size.
+   */
+  @Test
+  public void testVectorizedRowBatchCreate() {
+    VectorizedRowBatch batch = makeBatch();
+    Assert.assertEquals(3, batch.numCols);
+    Assert.assertEquals(VectorizedRowBatch.DEFAULT_SIZE, batch.size);
+
+    // assertEquals takes (expected, actual); keeping that order makes failure messages read correctly.
+    Assert.assertEquals(VectorizedRowBatch.DEFAULT_SIZE,
+        ((LongColumnVector) batch.cols[0]).vector.length);
+    Assert.assertEquals(VectorizedRowBatch.DEFAULT_SIZE,
+        ((DoubleColumnVector) batch.cols[1]).vector.length);
+    Assert.assertEquals(VectorizedRowBatch.DEFAULT_SIZE,
+        ((BytesColumnVector) batch.cols[2]).vector.length);
+  }
+
+  /*
+   * Test routines to exercise VectorizedRowBatch
+   * by filling column vectors with data and null values.
+   */
+
+  /**
+   * Replace every column of the batch with a new long column of random data
+   * and mark the batch as full.
+   *
+   * @param batch batch whose columns are replaced
+   */
+  public static void setRandom(VectorizedRowBatch batch) {
+    batch.size = VectorizedRowBatch.DEFAULT_SIZE;
+    for (int i = 0; i != batch.numCols; i++) {
+      batch.cols[i] = new LongColumnVector(VectorizedRowBatch.DEFAULT_SIZE);
+      setRandomLongCol((LongColumnVector) batch.cols[i]);
+    }
+  }
+
+  /**
+   * Fill the existing columns of the batch with sample data and mark the batch
+   * as full. Every column must already be a LongColumnVector.
+   *
+   * @param batch batch whose columns are filled
+   */
+  public static void setSample(VectorizedRowBatch batch) {
+    batch.size = VectorizedRowBatch.DEFAULT_SIZE;
+    for (int i = 0; i != batch.numCols; i++) {
+      setSampleLongCol((LongColumnVector) batch.cols[i]);
+    }
+  }
+
+  /**
+   * Set to sample data, re-using existing columns in batch.
+   * Every column must already be a LongColumnVector.
+   *
+   * @param batch batch whose columns are overwritten
+   */
+  public static void setSampleOverwrite(VectorizedRowBatch batch) {
+
+    // Put sample data in the columns.
+    for (int i = 0; i != batch.numCols; i++) {
+      setSampleLongCol((LongColumnVector) batch.cols[i]);
+    }
+
+    // Reset the selection vector.
+    batch.selectedInUse = false;
+    batch.size = VectorizedRowBatch.DEFAULT_SIZE;
+  }
+
+  /**
+   * Sprinkle null values in this column vector; each entry is marked null
+   * with probability 1/11.
+   *
+   * @param col column vector to receive nulls
+   */
+  public static void addRandomNulls(ColumnVector col) {
+    col.noNulls = false;
+    Random rand = new Random();
+    for (int i = 0; i != col.isNull.length; i++) {
+      col.isNull[i] = rand.nextInt(11) == 0;
+    }
+  }
+
+  /**
+   * Add null values, but do it faster, by avoiding use of Random().
+   * Every 11th entry, starting with entry 0, is marked null.
+   *
+   * @param col column vector to receive nulls
+   */
+  public void addSampleNulls(ColumnVector col) {
+    col.noNulls = false;
+    assert col.isNull != null;
+    for (int i = 0; i != col.isNull.length; i++) {
+      col.isNull[i] = i % 11 == 0;
+    }
+  }
+
+  /**
+   * Sprinkle random null values in every column of the batch.
+   *
+   * @param batch batch whose columns receive nulls
+   */
+  public static void addRandomNulls(VectorizedRowBatch batch) {
+    for (int i = 0; i != batch.numCols; i++) {
+      addRandomNulls(batch.cols[i]);
+    }
+  }
+
+  /**
+   * Add the deterministic sample null pattern to every column of the batch.
+   *
+   * @param batch batch whose columns receive nulls
+   */
+  public void addSampleNulls(VectorizedRowBatch batch) {
+    for (int i = 0; i != batch.numCols; i++) {
+      addSampleNulls(batch.cols[i]);
+    }
+  }
+
+  /**
+   * Set vector elements to sample string data from colorsBytes string table.
+   * Values are set by reference, cycling through the table.
+   *
+   * @param col column vector to fill
+   */
+  public static void setSampleStringCol(BytesColumnVector col) {
+    initColors();
+    int size = col.vector.length;
+    for (int i = 0; i != size; i++) {
+      byte[] color = colorsBytes[i % colorsBytes.length];
+      col.setRef(i, color, 0, color.length);
+    }
+  }
+
+  /*
+   * Initialize string table in a lazy fashion.
+   * NOTE(review): getBytes() uses the platform default charset; the color names
+   * are plain ASCII, so this is presumably harmless -- confirm if names change.
+   */
+  private static void initColors() {
+    if (colorsBytes == null) {
+      colorsBytes = new byte[COLORS.length][];
+      for (int i = 0; i != COLORS.length; i++) {
+        colorsBytes[i] = COLORS[i].getBytes();
+      }
+    }
+  }
+
+  /**
+   * Set the vector to sample data that repeats an iteration from 0 to 99.
+   *
+   * @param col column vector to fill
+   */
+  public static void setSampleLongCol(LongColumnVector col) {
+    int size = col.vector.length;
+    for (int i = 0; i != size; i++) {
+      col.vector[i] = i % 100;
+    }
+  }
+
+  /**
+   * Set the vector to random data in the range 0 to 99.
+   * This has significant overhead for random number generation. Use setSample() to reduce overhead.
+   *
+   * @param col column vector to fill
+   */
+  public static void setRandomLongCol(LongColumnVector col) {
+    int size = col.vector.length;
+    Random rand = new Random();
+    for (int i = 0; i != size; i++) {
+      // nextInt(bound) is uniform over 0..99; abs(nextInt() % 100) under-represents 0.
+      col.vector[i] = rand.nextInt(100);
+    }
+  }
+
+  /**
+   * Mark the vector as repeating, with 50 as the repeated value.
+   *
+   * @param col column vector to set
+   */
+  public static void setRepeatingLongCol(LongColumnVector col) {
+    col.isRepeating = true;
+    col.vector[0] = 50;
+  }
+
+  /**
+   * Set the vector to sample data that repeats an iteration from 0 to 99.
+   *
+   * @param col column vector to fill
+   */
+  public static void setSampleDoubleCol(DoubleColumnVector col) {
+    int size = col.vector.length;
+    for (int i = 0; i != size; i++) {
+      col.vector[i] = i % 100;
+    }
+  }
+
+  /**
+   * Set the vector to random whole-number data in the range 0 to 99.
+   * This has significant overhead for random number generation. Use setSample() to reduce overhead.
+   *
+   * @param col column vector to fill
+   */
+  public static void setRandomDoubleCol(DoubleColumnVector col) {
+    int size = col.vector.length;
+    Random rand = new Random();
+    for (int i = 0; i != size; i++) {
+      // nextInt(bound) is uniform over 0..99; abs(nextInt() % 100) under-represents 0.
+      col.vector[i] = rand.nextInt(100);
+    }
+  }
+
+  /**
+   * Mark the vector as repeating, with 50.0 as the repeated value.
+   *
+   * @param col column vector to set
+   */
+  public static void setRepeatingDoubleCol(DoubleColumnVector col) {
+    col.isRepeating = true;
+    col.vector[0] = 50.0;
+  }
+}