You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by ha...@apache.org on 2013/07/02 00:26:52 UTC
svn commit: r1498726 - in /hive/branches/vectorization/ql/src:
java/org/apache/hadoop/hive/ql/exec/vector/expressions/
test/org/apache/hadoop/hive/ql/exec/vector/expressions/
Author: hashutosh
Date: Mon Jul 1 22:26:51 2013
New Revision: 1498726
URL: http://svn.apache.org/r1498726
Log:
HIVE-4495 : Implement vectorized string substr (Timothy Chen via Ashutosh Chauhan)
Added:
hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringSubstrColStart.java
hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringSubstrColStartLen.java
Modified:
hive/branches/vectorization/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorStringExpressions.java
Added: hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringSubstrColStart.java
URL: http://svn.apache.org/viewvc/hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringSubstrColStart.java?rev=1498726&view=auto
==============================================================================
--- hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringSubstrColStart.java (added)
+++ hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringSubstrColStart.java Mon Jul 1 22:26:51 2013
@@ -0,0 +1,177 @@
+package org.apache.hadoop.hive.ql.exec.vector.expressions;
+
+import java.io.UnsupportedEncodingException;
+
+import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+
+/**
+ * This class provides the implementation of vectorized substring, with a single start index parameter.
+ * If the start index is invalid (outside of the string boundaries) then an empty string will be in the output.
+ */
+public class StringSubstrColStart extends VectorExpression {
+  // Character index at which the substring starts; handed to getSubstrStartOffset for every row.
+  private final int startIdx;
+  // Index into batch.cols of the input string column.
+  private final int colNum;
+  // Index into batch.cols of the column that receives the result.
+  private final int outputColumn;
+
+  // Shared zero-length byte array used for every empty-string result. It is identical to
+  // "".getBytes("UTF-8") but allocating it directly lets the field be final and removes the
+  // checked-exception path that previously only printed a stack trace and left the field null
+  // (which would surface later as a NullPointerException inside evaluate).
+  private static final byte[] EMPTY_STRING = new byte[0];
+
+  /**
+   * Creates a substring expression that reads one string column and writes another.
+   *
+   * @param colNum index into the batch's column array of the input string column
+   * @param startIdx character index at which each substring starts (negative values are
+   *                 resolved relative to the end of the string by getSubstrStartOffset)
+   * @param outputColumn index into the batch's column array of the output string column
+   */
+  public StringSubstrColStart(int colNum, int startIdx, int outputColumn) {
+    this.outputColumn = outputColumn;
+    this.startIdx = startIdx;
+    this.colNum = colNum;
+  }
+
+  /**
+   * Finds the byte offset, inside the backing array, of the character at index
+   * {@code substrStart} of a UTF-8 string.
+   *
+   * <p>Characters are counted by their lead bytes: every byte whose top two bits are not
+   * {@code 10} starts a new character. Indexes are zero based. A negative {@code substrStart}
+   * counts from the end of the string, so -1 addresses the last character.
+   *
+   * @param utf8String byte array that holds the utf8 string
+   * @param start start offset of the byte array the string starts at
+   * @param len length of the bytes the string holds in the byte array
+   * @param substrStart the Start index for the substring operation
+   * @return the absolute offset in {@code utf8String} of the first byte of the requested
+   *         character, or -1 if the index lies outside the string
+   */
+  static int getSubstrStartOffset(byte[] utf8String, int start, int len, int substrStart) {
+    // Exclusive end of the string inside the backing array; both loops must stop here.
+    int end = start + len;
+
+    if (substrStart < 0) {
+      // Count the characters of the string so the negative index can be made absolute.
+      // The bound has to be start + len (not len), otherwise strings that do not begin at
+      // offset 0 are miscounted or the scan runs off the end of the array.
+      int length = 0;
+      for (int i = start; i < end; ++i) {
+        if ((utf8String[i] & 0xc0) != 0x80) {
+          ++length;
+        }
+      }
+
+      if (-length > substrStart) {
+        // The index points before the first character.
+        return -1;
+      }
+
+      substrStart = length + substrStart;
+    }
+
+    int curIdx = -1;
+    for (int i = start; i < end; ++i) {
+      if ((utf8String[i] & 0xc0) != 0x80) {
+        ++curIdx;
+        if (curIdx == substrStart) {
+          return i;
+        }
+      }
+    }
+    // The index points past the last character.
+    return -1;
+  }
+
+  /**
+   * Computes the substring of every active row of the input column and stores it, by reference
+   * into the input bytes, in the output column. Null inputs produce null outputs; a start index
+   * outside the string produces an empty string.
+   *
+   * @param batch the row batch holding both the input column and the output column
+   */
+  @Override
+  public void evaluate(VectorizedRowBatch batch) {
+    if (childExpressions != null) {
+      super.evaluateChildren(batch);
+    }
+
+    BytesColumnVector inV = (BytesColumnVector) batch.cols[colNum];
+    BytesColumnVector outV = (BytesColumnVector) batch.cols[outputColumn];
+
+    int n = batch.size;
+
+    if (n == 0) {
+      return;
+    }
+
+    byte[][] vector = inV.vector;
+    int[] sel = batch.selected;
+    int[] len = inV.length;
+    int[] start = inV.start;
+
+    if (inV.isRepeating) {
+      // Only entry 0 is evaluated for a repeating input; the output repeats as well.
+      outV.isRepeating = true;
+      if (!inV.noNulls && inV.isNull[0]) {
+        outV.isNull[0] = true;
+        outV.noNulls = false;
+        outV.setRef(0, EMPTY_STRING, 0, EMPTY_STRING.length);
+      } else {
+        outV.noNulls = true;
+        outV.isNull[0] = false;
+        // The string begins at byte offset start[0]. batch.selected holds row indexes, not
+        // byte offsets, so it must not be used here.
+        setSubstr(outV, 0, vector[0], start[0], len[0]);
+      }
+      return;
+    }
+
+    outV.isRepeating = false;
+    outV.noNulls = inV.noNulls;
+    if (batch.selectedInUse) {
+      if (!inV.noNulls) {
+        for (int i = 0; i != n; ++i) {
+          int row = sel[i];
+          outV.isNull[row] = inV.isNull[row];
+          if (!inV.isNull[row]) {
+            setSubstr(outV, row, vector[row], start[row], len[row]);
+          }
+        }
+      } else {
+        for (int i = 0; i != n; ++i) {
+          int row = sel[i];
+          outV.isNull[row] = false;
+          setSubstr(outV, row, vector[row], start[row], len[row]);
+        }
+      }
+    } else {
+      if (!inV.noNulls) {
+        System.arraycopy(inV.isNull, 0, outV.isNull, 0, n);
+        for (int i = 0; i != n; ++i) {
+          if (!inV.isNull[i]) {
+            setSubstr(outV, i, vector[i], start[i], len[i]);
+          }
+        }
+      } else {
+        for (int i = 0; i != n; ++i) {
+          outV.isNull[i] = false;
+          setSubstr(outV, i, vector[i], start[i], len[i]);
+        }
+      }
+    }
+  }
+
+  /**
+   * Stores the substring of one input string in row {@code row} of {@code outV}, or the empty
+   * string when the start index lies outside the input.
+   *
+   * @param outV output column
+   * @param row row of the output column to fill
+   * @param bytes backing array of the input string
+   * @param start offset in {@code bytes} at which the input string begins
+   * @param len number of bytes of the input string
+   */
+  private void setSubstr(BytesColumnVector outV, int row, byte[] bytes, int start, int len) {
+    int offset = getSubstrStartOffset(bytes, start, len, startIdx);
+    if (offset != -1) {
+      // offset is absolute within bytes while len is relative to start, so the bytes that
+      // remain after the substring start are len - (offset - start), not len - offset.
+      outV.setRef(row, bytes, offset, len - (offset - start));
+    } else {
+      outV.setRef(row, EMPTY_STRING, 0, EMPTY_STRING.length);
+    }
+  }
+
+ /**
+ * Returns the index of the batch column that evaluate writes its results to, as supplied to the constructor.
+ */
+ @Override
+ public int getOutputColumn() {
+ return outputColumn;
+ }
+
+ /**
+ * Returns the type name of the output column, which is always "string" for this expression.
+ */
+ @Override
+ public String getOutputType() {
+ return "string";
+ }
+}
Added: hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringSubstrColStartLen.java
URL: http://svn.apache.org/viewvc/hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringSubstrColStartLen.java?rev=1498726&view=auto
==============================================================================
--- hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringSubstrColStartLen.java (added)
+++ hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringSubstrColStartLen.java Mon Jul 1 22:26:51 2013
@@ -0,0 +1,195 @@
+package org.apache.hadoop.hive.ql.exec.vector.expressions;
+
+import java.io.UnsupportedEncodingException;
+
+import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+
+/**
+ * This class provides the implementation of vectorized substring, with a start index and length parameters.
+ * If the start index is invalid (outside of the string boundaries) then an empty string will be in the output.
+ * If the length provided is longer than the remainder of the string, the substring ends at the end of the string.
+ */
+public class StringSubstrColStartLen extends VectorExpression {
+  // Character index at which the substring starts; handed to populateSubstrOffsets for every row.
+  private final int startIdx;
+  // Index into batch.cols of the input string column.
+  private final int colNum;
+  // Requested substring length, in characters.
+  private final int length;
+  // Index into batch.cols of the column that receives the result.
+  private final int outputColumn;
+  // Two-element scratch array that populateSubstrOffsets fills for each row:
+  // [0] is the byte offset of the substring, [1] its length in bytes.
+  private final int[] offsetArray;
+
+  // Shared zero-length byte array used for every empty-string result. It is identical to
+  // "".getBytes("UTF-8") but allocating it directly lets the field be final and removes the
+  // checked-exception path that previously only printed a stack trace and left the field null
+  // (which would surface later as a NullPointerException inside evaluate).
+  private static final byte[] EMPTY_STRING = new byte[0];
+
+  /**
+   * Creates a substring expression with an explicit start index and length.
+   *
+   * @param colNum index into the batch's column array of the input string column
+   * @param startIdx character index at which each substring starts (negative values are
+   *                 resolved relative to the end of the string by populateSubstrOffsets)
+   * @param length requested substring length, in characters
+   * @param outputColumn index into the batch's column array of the output string column
+   */
+  public StringSubstrColStartLen(int colNum, int startIdx, int length, int outputColumn) {
+    // Scratch space for the {offset, byte length} pair computed per row.
+    this.offsetArray = new int[2];
+    this.outputColumn = outputColumn;
+    this.length = length;
+    this.startIdx = startIdx;
+    this.colNum = colNum;
+  }
+
+  /**
+   * Populates the byte offset and byte length of a substring, based on the substring start
+   * index and length params (both measured in characters).
+   *
+   * <p>Characters are counted by their lead bytes: every byte whose top two bits are not
+   * {@code 10} starts a new character. Indexes are zero based. A negative {@code substrStart}
+   * counts from the end of the string. A length that reaches past the end of the string is
+   * capped at the end of the string.
+   *
+   * <p>On return {@code offsetArray[0]} is the absolute offset in {@code utf8String} of the
+   * first byte of the substring and {@code offsetArray[1]} is its length in bytes. When the
+   * result is empty (start index outside the string, or a length that is zero or negative)
+   * both entries are -1; callers are expected to test {@code offsetArray[0] != -1}.
+   *
+   * @param utf8String byte array that holds the utf8 string
+   * @param start start offset of the byte array the string starts at
+   * @param len length of the bytes the string holds in the byte array
+   * @param substrStart the Start index for the substring operation
+   * @param substrLength the length of the substring
+   * @param offsetArray the array that indexes are populated to. Assume its length >= 2.
+   */
+  static void populateSubstrOffsets(byte[] utf8String, int start, int len, int substrStart, int substrLength, int[] offsetArray) {
+    offsetArray[0] = -1;
+    offsetArray[1] = -1;
+
+    // Exclusive end of the string inside the backing array; both loops must stop here.
+    int end = start + len;
+
+    if (substrStart < 0) {
+      // Count the characters of the string so the negative index can be made absolute.
+      // The bound has to be start + len (not len), otherwise strings that do not begin at
+      // offset 0 are miscounted or the scan runs off the end of the array.
+      int length = 0;
+      for (int i = start; i < end; ++i) {
+        if ((utf8String[i] & 0xc0) != 0x80) {
+          ++length;
+        }
+      }
+
+      if (-length > substrStart) {
+        return;
+      }
+
+      substrStart = length + substrStart;
+    }
+
+    if (substrLength <= 0) {
+      // Nothing to select. Without this guard a zero length would never hit the end marker
+      // below and the whole remainder of the string would be returned instead.
+      return;
+    }
+
+    // Index of the first character AFTER the substring; long so that a very large requested
+    // length cannot overflow.
+    long endIdx = (long) substrStart + substrLength;
+    int substrOffset = -1;
+    int curIdx = -1;
+    for (int i = start; i < end; ++i) {
+      if ((utf8String[i] & 0xc0) != 0x80) {
+        ++curIdx;
+        if (curIdx == substrStart) {
+          substrOffset = i;
+        } else if (curIdx == endIdx) {
+          // endIdx > substrStart here, so the start has already been found.
+          offsetArray[0] = substrOffset;
+          offsetArray[1] = i - substrOffset;
+          return;
+        }
+      }
+    }
+
+    if (substrOffset != -1) {
+      // The requested length runs past the string: cap it at the end of the string.
+      offsetArray[0] = substrOffset;
+      offsetArray[1] = end - substrOffset;
+    }
+  }
+
+  /**
+   * Computes the substring of every active row of the input column and stores it, by reference
+   * into the input bytes, in the output column. Null inputs produce null outputs; a start index
+   * outside the string produces an empty string.
+   *
+   * @param batch the row batch holding both the input column and the output column
+   */
+  @Override
+  public void evaluate(VectorizedRowBatch batch) {
+    if (childExpressions != null) {
+      super.evaluateChildren(batch);
+    }
+
+    BytesColumnVector inV = (BytesColumnVector) batch.cols[colNum];
+    BytesColumnVector outV = (BytesColumnVector) batch.cols[outputColumn];
+
+    int n = batch.size;
+
+    if (n == 0) {
+      return;
+    }
+
+    byte[][] vector = inV.vector;
+    int[] sel = batch.selected;
+    int[] len = inV.length;
+    int[] start = inV.start;
+
+    if (inV.isRepeating) {
+      // Only entry 0 is evaluated for a repeating input; the output repeats as well.
+      outV.isRepeating = true;
+      if (!inV.noNulls && inV.isNull[0]) {
+        outV.isNull[0] = true;
+        outV.noNulls = false;
+        outV.setRef(0, EMPTY_STRING, 0, EMPTY_STRING.length);
+      } else {
+        outV.noNulls = true;
+        outV.isNull[0] = false;
+        // The string begins at byte offset start[0]. batch.selected holds row indexes, not
+        // byte offsets, so it must not be used here.
+        setSubstr(outV, 0, vector[0], start[0], len[0]);
+      }
+      return;
+    }
+
+    outV.isRepeating = false;
+    outV.noNulls = inV.noNulls;
+    if (batch.selectedInUse) {
+      if (!inV.noNulls) {
+        for (int i = 0; i != n; ++i) {
+          int row = sel[i];
+          outV.isNull[row] = inV.isNull[row];
+          if (!inV.isNull[row]) {
+            setSubstr(outV, row, vector[row], start[row], len[row]);
+          }
+        }
+      } else {
+        for (int i = 0; i != n; ++i) {
+          int row = sel[i];
+          outV.isNull[row] = false;
+          setSubstr(outV, row, vector[row], start[row], len[row]);
+        }
+      }
+    } else {
+      if (!inV.noNulls) {
+        System.arraycopy(inV.isNull, 0, outV.isNull, 0, n);
+        for (int i = 0; i != n; ++i) {
+          if (!inV.isNull[i]) {
+            setSubstr(outV, i, vector[i], start[i], len[i]);
+          }
+        }
+      } else {
+        for (int i = 0; i != n; ++i) {
+          outV.isNull[i] = false;
+          setSubstr(outV, i, vector[i], start[i], len[i]);
+        }
+      }
+    }
+  }
+
+  /**
+   * Stores the substring of one input string in row {@code row} of {@code outV}, or the empty
+   * string when populateSubstrOffsets reports no substring (offsetArray[0] == -1).
+   *
+   * @param outV output column
+   * @param row row of the output column to fill
+   * @param bytes backing array of the input string
+   * @param start offset in {@code bytes} at which the input string begins
+   * @param len number of bytes of the input string
+   */
+  private void setSubstr(BytesColumnVector outV, int row, byte[] bytes, int start, int len) {
+    populateSubstrOffsets(bytes, start, len, startIdx, length, offsetArray);
+    if (offsetArray[0] != -1) {
+      outV.setRef(row, bytes, offsetArray[0], offsetArray[1]);
+    } else {
+      outV.setRef(row, EMPTY_STRING, 0, EMPTY_STRING.length);
+    }
+  }
+
+ /**
+ * Returns the index of the batch column that evaluate writes its results to, as supplied to the constructor.
+ */
+ @Override
+ public int getOutputColumn() {
+ return outputColumn;
+ }
+
+ /**
+ * Returns the type name of the output column, which is always "string" for this expression.
+ */
+ @Override
+ public String getOutputType() {
+ return "string";
+ }
+
+}
Modified: hive/branches/vectorization/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorStringExpressions.java
URL: http://svn.apache.org/viewvc/hive/branches/vectorization/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorStringExpressions.java?rev=1498726&r1=1498725&r2=1498726&view=diff
==============================================================================
--- hive/branches/vectorization/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorStringExpressions.java (original)
+++ hive/branches/vectorization/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorStringExpressions.java Mon Jul 1 22:26:51 2013
@@ -18,21 +18,20 @@
package org.apache.hadoop.hive.ql.exec.vector.expressions;
+import java.io.UnsupportedEncodingException;
+import java.util.Arrays;
+
import junit.framework.Assert;
import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.hadoop.hive.ql.exec.vector.expressions.gen.FilterStringColEqualStringScalar;
-import org.apache.hadoop.hive.ql.exec.vector.expressions.gen.FilterStringColLessStringScalar;
+import org.apache.hadoop.hive.ql.exec.vector.expressions.gen.FilterStringColGreaterEqualStringScalar;
import org.apache.hadoop.hive.ql.exec.vector.expressions.gen.FilterStringColLessStringCol;
-import org.apache.hadoop.hive.ql.exec.vector.expressions.gen.
- FilterStringColGreaterEqualStringScalar;
-import org.junit.Test;
-
-import java.io.UnsupportedEncodingException;
-import java.util.Arrays;
+import org.apache.hadoop.hive.ql.exec.vector.expressions.gen.FilterStringColLessStringScalar;
import org.apache.hadoop.io.Text;
+import org.junit.Test;
/**
* Test vectorized expression and filter evaluation for strings.
@@ -167,79 +166,79 @@ public class TestVectorStringExpressions
Assert.assertTrue(batch.selected[0] == 0);
Assert.assertTrue(batch.selected[1] == 1);
}
-
+
@Test
public void testStringColCompareStringColFilter() {
VectorizedRowBatch batch;
VectorExpression expr;
-
+
/* input data
- *
+ *
* col0 col1
* ===============
* blue red
* green green
* red blue
* NULL red col0 data is empty string if we un-set NULL property
- */
-
+ */
+
// nulls possible on left, right
batch = makeStringBatchForColColCompare();
expr = new FilterStringColLessStringCol(0,1);
expr.evaluate(batch);
Assert.assertEquals(1, batch.size);
Assert.assertEquals(0, batch.selected[0]);
-
+
// no nulls possible
batch = makeStringBatchForColColCompare();
batch.cols[0].noNulls = true;
batch.cols[1].noNulls = true;
expr.evaluate(batch);
Assert.assertEquals(2, batch.size);
- Assert.assertEquals(3, batch.selected[1]);
-
+ Assert.assertEquals(3, batch.selected[1]);
+
// nulls on left, no nulls on right
batch = makeStringBatchForColColCompare();
batch.cols[1].noNulls = true;
expr.evaluate(batch);
Assert.assertEquals(1, batch.size);
Assert.assertEquals(0, batch.selected[0]);
-
+
// nulls on right, no nulls on left
batch = makeStringBatchForColColCompare();
batch.cols[0].noNulls = true;
batch.cols[1].isNull[3] = true;
expr.evaluate(batch);
Assert.assertEquals(1, batch.size);
- Assert.assertEquals(0, batch.selected[0]);
-
+ Assert.assertEquals(0, batch.selected[0]);
+
// Now vary isRepeating
// nulls possible on left, right
-
+
// left repeats
batch = makeStringBatchForColColCompare();
batch.cols[0].isRepeating = true;
expr.evaluate(batch);
Assert.assertEquals(3, batch.size);
Assert.assertEquals(3, batch.selected[2]);
-
+
// right repeats
batch = makeStringBatchForColColCompare();
batch.cols[1].isRepeating = true;
expr.evaluate(batch);
Assert.assertEquals(2, batch.size); // first 2 qualify
Assert.assertEquals(1, batch.selected[1]);
-
+
// left and right repeat
batch = makeStringBatchForColColCompare();
batch.cols[0].isRepeating = true;
batch.cols[1].isRepeating = true;
expr.evaluate(batch);
Assert.assertEquals(4, batch.size);
-
+
// Now vary isRepeating
// nulls possible only on left
-
+
// left repeats
batch = makeStringBatchForColColCompare();
batch.cols[0].isRepeating = true;
@@ -247,7 +246,7 @@ public class TestVectorStringExpressions
expr.evaluate(batch);
Assert.assertEquals(3, batch.size);
Assert.assertEquals(3, batch.selected[2]);
-
+
// left repeats and is null
batch = makeStringBatchForColColCompare();
batch.cols[0].isRepeating = true;
@@ -255,15 +254,15 @@ public class TestVectorStringExpressions
batch.cols[0].isNull[0] = true;
expr.evaluate(batch);
Assert.assertEquals(0, batch.size);
-
+
// right repeats
batch = makeStringBatchForColColCompare();
batch.cols[1].isRepeating = true;
batch.cols[1].noNulls = true;
expr.evaluate(batch);
- Assert.assertEquals(3, batch.size);
+ Assert.assertEquals(3, batch.size);
Assert.assertEquals(1, batch.selected[1]);
-
+
// left and right repeat
batch = makeStringBatchForColColCompare();
batch.cols[0].isRepeating = true;
@@ -272,10 +271,10 @@ public class TestVectorStringExpressions
expr.evaluate(batch);
Assert.assertEquals(4, batch.size);
-
+
// Now vary isRepeating
// nulls possible only on right
-
+
// left repeats
batch = makeStringBatchForColColCompare();
batch.cols[0].isRepeating = true;
@@ -284,22 +283,22 @@ public class TestVectorStringExpressions
expr.evaluate(batch);
Assert.assertEquals(2, batch.size);
Assert.assertEquals(3, batch.selected[1]);
-
+
// right repeats
batch = makeStringBatchForColColCompare();
batch.cols[1].isRepeating = true;
batch.cols[0].noNulls = true;
expr.evaluate(batch);
- Assert.assertEquals(3, batch.size);
+ Assert.assertEquals(3, batch.size);
Assert.assertEquals(3, batch.selected[2]);
-
+
// right repeats and is null
batch = makeStringBatchForColColCompare();
batch.cols[1].isRepeating = true;
batch.cols[0].noNulls = true;
batch.cols[1].isNull[0] = true;
expr.evaluate(batch);
- Assert.assertEquals(0, batch.size);
+ Assert.assertEquals(0, batch.size);
// left and right repeat
batch = makeStringBatchForColColCompare();
@@ -308,7 +307,7 @@ public class TestVectorStringExpressions
batch.cols[0].noNulls = true;
expr.evaluate(batch);
Assert.assertEquals(4, batch.size);
-
+
// left and right repeat and right is null
batch = makeStringBatchForColColCompare();
batch.cols[0].isRepeating = true;
@@ -316,7 +315,7 @@ public class TestVectorStringExpressions
batch.cols[0].noNulls = true;
batch.cols[1].isNull[0] = true;
expr.evaluate(batch);
- Assert.assertEquals(0, batch.size);
+ Assert.assertEquals(0, batch.size);
}
VectorizedRowBatch makeStringBatch() {
@@ -537,7 +536,7 @@ public class TestVectorStringExpressions
batch.size = 3;
return batch;
}
-
+
private VectorizedRowBatch makeStringBatchForColColCompare() {
VectorizedRowBatch batch = new VectorizedRowBatch(3);
BytesColumnVector v = new BytesColumnVector();
@@ -565,10 +564,10 @@ public class TestVectorStringExpressions
v2.setRef(3, red, 0, red.length);
v2.isNull[3] = false;
v2.noNulls = false;
-
+
batch.size = 4;
return batch;
- }
+ }
@Test
public void testStringLike() {
@@ -938,4 +937,438 @@ public class TestVectorStringExpressions
outCol.start[0], outCol.length[0]);
Assert.assertEquals(0, cmp);
}
-}
+
+ @Test
+ public void testSubstrStart() throws UnsupportedEncodingException {
+ // Testing no nulls and no repeating
+ VectorizedRowBatch batch = new VectorizedRowBatch(2);
+ BytesColumnVector v = new BytesColumnVector();
+ batch.cols[0] = v;
+ BytesColumnVector outV = new BytesColumnVector();
+ batch.cols[1] = outV;
+ byte[] data1 = "abcd string".getBytes("UTF-8");
+ byte[] data2 = "efgh string".getBytes("UTF-8");
+ byte[] data3 = "efgh".getBytes("UTF-8");
+ batch.size = 3;
+ v.noNulls = true;
+ v.setRef(0, data1, 0, data1.length);
+ v.isNull[0] = false;
+ v.setRef(1, data2, 0, data2.length);
+ v.isNull[1] = false;
+ v.setRef(2, data3, 0, data3.length);
+ v.isNull[2] = false;
+
+ StringSubstrColStart expr = new StringSubstrColStart(0, 5, 1);
+ expr.evaluate(batch);
+ BytesColumnVector outCol = (BytesColumnVector) batch.cols[1];
+ Assert.assertEquals(3, batch.size);
+ Assert.assertTrue(outCol.noNulls);
+ Assert.assertFalse(outCol.isRepeating);
+ byte[] expected = "string".getBytes("UTF-8");
+ Assert.assertEquals(0,
+ StringExpr.compare(
+ expected, 0, expected.length, outCol.vector[0], outCol.start[0], outCol.length[0]
+ )
+ );
+
+ Assert.assertEquals(0,
+ StringExpr.compare(
+ expected, 0, expected.length, outCol.vector[1], outCol.start[1], outCol.length[1]
+ )
+ );
+
+ // This yields empty because starting idx is out of bounds.
+ Assert.assertEquals(0,
+ StringExpr.compare(
+ emptyString, 0, emptyString.length, outCol.vector[2], outCol.start[2], outCol.length[2]
+ )
+ );
+
+ outCol.noNulls = false;
+ outCol.isRepeating = true;
+
+ // Testing negative substring index.
+ // For a string with length 11, start idx 5 should yield same results as -6
+
+ expr = new StringSubstrColStart(0, -6, 1);
+ expr.evaluate(batch);
+ outCol = (BytesColumnVector) batch.cols[1];
+ Assert.assertEquals(3, batch.size);
+ Assert.assertTrue(outCol.noNulls);
+ Assert.assertFalse(outCol.isRepeating);
+ Assert.assertEquals(0,
+ StringExpr.compare(
+ expected, 0, expected.length, outCol.vector[0], outCol.start[0], outCol.length[0]
+ )
+ );
+
+ Assert.assertEquals(0,
+ StringExpr.compare(
+ expected, 0, expected.length, outCol.vector[1], outCol.start[1], outCol.length[1]
+ )
+ );
+
+ Assert.assertEquals(0,
+ StringExpr.compare(
+ emptyString, 0, emptyString.length, outCol.vector[2], outCol.start[2], outCol.length[2]
+ )
+ );
+
+ outCol.noNulls = false;
+ outCol.isRepeating = true;
+
+ // Testing substring starting from index 0
+
+ expr = new StringSubstrColStart(0, 0, 1);
+ expr.evaluate(batch);
+ Assert.assertEquals(3, batch.size);
+ Assert.assertTrue(outCol.noNulls);
+ Assert.assertFalse(outCol.isRepeating);
+
+ Assert.assertEquals(0,
+ StringExpr.compare(
+ data1, 0, data1.length, outCol.vector[0], outCol.start[0], outCol.length[0]
+ )
+ );
+
+ Assert.assertEquals(0,
+ StringExpr.compare(
+ data2, 0, data2.length, outCol.vector[1], outCol.start[1], outCol.length[1]
+ )
+ );
+
+ Assert.assertEquals(0,
+ StringExpr.compare(
+ data3, 0, data3.length, outCol.vector[2], outCol.start[2], outCol.length[2]
+ )
+ );
+
+ outV.noNulls = false;
+ outV.isRepeating = true;
+
+ // Testing with nulls
+
+ expr = new StringSubstrColStart(0, 5, 1);
+ v.noNulls = false;
+ v.isNull[0] = true;
+ expr.evaluate(batch);
+ Assert.assertEquals(3, batch.size);
+ Assert.assertFalse(outV.noNulls);
+ Assert.assertTrue(outV.isNull[0]);
+
+ Assert.assertEquals(0,
+ StringExpr.compare(
+ expected, 0, expected.length, outCol.vector[1], outCol.start[1], outCol.length[1]
+ )
+ );
+
+ Assert.assertEquals(0,
+ StringExpr.compare(
+ emptyString, 0, emptyString.length, outCol.vector[2], outCol.start[2], outCol.length[2]
+ )
+ );
+
+ outCol.noNulls = false;
+ outCol.isRepeating = false;
+
+ // Testing with repeating and no nulls
+
+ outV = new BytesColumnVector();
+ v = new BytesColumnVector();
+ v.isRepeating = true;
+ v.noNulls = true;
+ v.setRef(0, data1, 0, data1.length);
+ batch = new VectorizedRowBatch(2);
+ batch.cols[0] = v;
+ batch.cols[1] = outV;
+ expr.evaluate(batch);
+ outCol = (BytesColumnVector) batch.cols[1];
+ expected = "string".getBytes("UTF-8");
+ Assert.assertTrue(outV.isRepeating);
+ Assert.assertTrue(outV.noNulls);
+ Assert.assertEquals(0,
+ StringExpr.compare(
+ expected, 0, expected.length, outCol.vector[0], outCol.start[0], outCol.length[0]
+ )
+ );
+
+ // Testing multiByte string substring
+
+ v = new BytesColumnVector();
+ v.isRepeating = false;
+ v.noNulls = true;
+ v.setRef(0, multiByte, 0, 10);
+ batch.cols[0] = v;
+ batch.cols[1] = outV;
+ outV.isRepeating = true;
+ outV.noNulls = false;
+ expr = new StringSubstrColStart(0, 2, 1);
+ batch.size = 1;
+ expr.evaluate(batch);
+ outCol = (BytesColumnVector) batch.cols[1];
+ Assert.assertFalse(outV.isRepeating);
+ Assert.assertTrue(outV.noNulls);
+ Assert.assertEquals(0,
+ StringExpr.compare(
+ // 3nd char starts from index 3 and total length should be 7 bytes as max is 10
+ multiByte, 3, 10 - 3, outCol.vector[0], outCol.start[0], outCol.length[0]
+ )
+ );
+
+
+ // Testing multiByte string with reference starting mid array
+
+ v = new BytesColumnVector();
+ v.isRepeating = false;
+ v.noNulls = true;
+ v.setRef(0, multiByte, 3, 10);
+ batch.cols[0] = v;
+ batch.cols[1] = outV;
+ outV.isRepeating = true;
+ outV.noNulls = false;
+ outCol = (BytesColumnVector) batch.cols[1];
+ expr = new StringSubstrColStart(0, 1, 1);
+ expr.evaluate(batch);
+ Assert.assertFalse(outV.isRepeating);
+ Assert.assertTrue(outV.noNulls);
+ Assert.assertEquals(0,
+ StringExpr.compare(
+ // Since references starts at index 3 (2nd char), substring with start idx 1
+ // will start at the 3rd char which starts at index 6
+ multiByte, 6, 10 - 6, outCol.vector[0], outCol.start[0], outCol.length[0]
+ )
+ );
+ }
+
+  @Test
+  public void testSubstrStartLen() throws UnsupportedEncodingException {
+    final byte[] data1 = "abcd string".getBytes("UTF-8");
+    final byte[] data2 = "efgh string".getBytes("UTF-8");
+    final byte[] data3 = "efgh".getBytes("UTF-8");
+    final byte[] expected = "string".getBytes("UTF-8");
+
+    // Case 1: three non-null, non-repeating rows in column 0; results land in column 1.
+    VectorizedRowBatch batch = new VectorizedRowBatch(2);
+    BytesColumnVector v = new BytesColumnVector();
+    BytesColumnVector outV = new BytesColumnVector();
+    batch.cols[0] = v;
+    batch.cols[1] = outV;
+    batch.size = 3;
+    v.noNulls = true;
+    byte[][] rows = {data1, data2, data3};
+    for (int r = 0; r < rows.length; r++) {
+      v.setRef(r, rows[r], 0, rows[r].length);
+      v.isNull[r] = false;
+    }
+
+    // Flip the output flags before each evaluate so the expression has to reset them.
+    outV.isRepeating = true;
+    outV.noNulls = false;
+
+    StringSubstrColStartLen expr = new StringSubstrColStartLen(0, 5, 6, 1);
+    expr.evaluate(batch);
+    BytesColumnVector outCol = (BytesColumnVector) batch.cols[1];
+    Assert.assertEquals(3, batch.size);
+    Assert.assertTrue(outCol.noNulls);
+    Assert.assertFalse(outCol.isRepeating);
+    assertSubstrStartLenRow(expected, 0, expected.length, outCol, 0);
+    assertSubstrStartLenRow(expected, 0, expected.length, outCol, 1);
+    assertSubstrStartLenRow(emptyString, 0, emptyString.length, outCol, 2);
+
+    // Case 2: negative start index.
+    outV.isRepeating = true;
+    outV.noNulls = false;
+
+    expr = new StringSubstrColStartLen(0, -6, 6, 1);
+    expr.evaluate(batch);
+    outCol = (BytesColumnVector) batch.cols[1];
+    Assert.assertTrue(outCol.noNulls);
+    Assert.assertFalse(outCol.isRepeating);
+    Assert.assertEquals(3, batch.size);
+    assertSubstrStartLenRow(expected, 0, expected.length, outCol, 0);
+    assertSubstrStartLenRow(expected, 0, expected.length, outCol, 1);
+    // Row 2 is only four characters long, so the start index is out of bounds -> empty string.
+    assertSubstrStartLenRow(emptyString, 0, emptyString.length, outCol, 2);
+
+    // Case 3: start index 0 with a length equal to the longest row returns every row unchanged.
+    outV.isRepeating = true;
+    outV.noNulls = false;
+
+    expr = new StringSubstrColStartLen(0, 0, 11, 1);
+    outCol = (BytesColumnVector) batch.cols[1];
+    expr.evaluate(batch);
+    Assert.assertEquals(3, batch.size);
+    Assert.assertTrue(outCol.noNulls);
+    Assert.assertFalse(outCol.isRepeating);
+    assertSubstrStartLenRow(data1, 0, data1.length, outCol, 0);
+    assertSubstrStartLenRow(data2, 0, data2.length, outCol, 1);
+    assertSubstrStartLenRow(data3, 0, data3.length, outCol, 2);
+
+    // Case 4: a length that runs past the end of the string is capped at the end of the string.
+    outV.isRepeating = true;
+    outV.noNulls = false;
+
+    expr = new StringSubstrColStartLen(0, 5, 10, 1);
+    expr.evaluate(batch);
+    outCol = (BytesColumnVector) batch.cols[1];
+    Assert.assertEquals(3, batch.size);
+    Assert.assertTrue(outCol.noNulls);
+    Assert.assertFalse(outCol.isRepeating);
+    assertSubstrStartLenRow(expected, 0, expected.length, outCol, 0);
+    assertSubstrStartLenRow(expected, 0, expected.length, outCol, 1);
+    assertSubstrStartLenRow(emptyString, 0, emptyString.length, outCol, 2);
+
+    outV.isRepeating = true;
+    outV.noNulls = true;
+
+    // Case 5: row 0 of the input is null (same expression as case 4).
+    v.noNulls = false;
+    v.isNull[0] = true;
+    expr.evaluate(batch);
+    Assert.assertEquals(3, batch.size);
+    Assert.assertFalse(outV.noNulls);
+    Assert.assertTrue(outV.isNull[0]);
+    Assert.assertFalse(outCol.isRepeating);
+    assertSubstrStartLenRow(expected, 0, expected.length, outCol, 1);
+    assertSubstrStartLenRow(emptyString, 0, emptyString.length, outCol, 2);
+
+    // Case 6: repeating input whose single value is not null, using fresh vectors and a
+    // fresh batch. The input's noNulls flag is false here, but isNull[0] is never set.
+    outV = new BytesColumnVector();
+    v = new BytesColumnVector();
+    outV.isRepeating = false;
+    outV.noNulls = true;
+    v.isRepeating = true;
+    v.noNulls = false;
+    v.setRef(0, data1, 0, data1.length);
+    batch = new VectorizedRowBatch(2);
+    batch.cols[0] = v;
+    batch.cols[1] = outV;
+    expr.evaluate(batch);
+    outCol = (BytesColumnVector) batch.cols[1];
+    Assert.assertTrue(outCol.noNulls);
+    Assert.assertTrue(outCol.isRepeating);
+    assertSubstrStartLenRow(expected, 0, expected.length, outCol, 0);
+
+    // Case 7: multi-byte characters, reference starting at the beginning of the array.
+    v = new BytesColumnVector();
+    v.isRepeating = false;
+    v.noNulls = true;
+    batch.size = 1;
+    v.setRef(0, multiByte, 0, 10);
+    batch.cols[0] = v;
+    batch.cols[1] = outV;
+    outV.isRepeating = true;
+    outV.noNulls = false;
+    expr = new StringSubstrColStartLen(0, 2, 2, 1);
+    expr.evaluate(batch);
+    Assert.assertEquals(1, batch.size);
+    Assert.assertFalse(outV.isRepeating);
+    Assert.assertTrue(outV.noNulls);
+    // The 3rd char starts at byte 3, and a length of 2 covers the rest of the array.
+    assertSubstrStartLenRow(multiByte, 3, 10 - 3, outCol, 0);
+
+    // Case 8: multi-byte characters, reference starting mid array.
+    v = new BytesColumnVector();
+    v.isRepeating = false;
+    v.noNulls = true;
+    outV = new BytesColumnVector();
+    batch.size = 1;
+    v.setRef(0, multiByte, 3, 7);
+    batch.cols[0] = v;
+    batch.cols[1] = outV;
+    outV.isRepeating = true;
+    outV.noNulls = false;
+    expr = new StringSubstrColStartLen(0, 1, 2, 1);
+    expr.evaluate(batch);
+    outCol = (BytesColumnVector) batch.cols[1];
+    Assert.assertEquals(1, batch.size);
+    Assert.assertFalse(outV.isRepeating);
+    Assert.assertTrue(outV.noNulls);
+    // Start index 1 within the reference addresses the char at byte 6 (the last char in
+    // the array).
+    assertSubstrStartLenRow(multiByte, 6, 10 - 6, outCol, 0);
+  }
+
+  /**
+   * Asserts that StringExpr.compare reports 0 between the byte range
+   * {@code want[wantStart, wantStart + wantLen)} and the value stored in row {@code row}
+   * of {@code out}.
+   */
+  private static void assertSubstrStartLenRow(
+      byte[] want, int wantStart, int wantLen, BytesColumnVector out, int row) {
+    Assert.assertEquals(0,
+        StringExpr.compare(
+            want, wantStart, wantLen, out.vector[row], out.start[row], out.length[row]));
+  }
+ }