You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by ha...@apache.org on 2013/07/02 00:26:52 UTC

svn commit: r1498726 - in /hive/branches/vectorization/ql/src: java/org/apache/hadoop/hive/ql/exec/vector/expressions/ test/org/apache/hadoop/hive/ql/exec/vector/expressions/

Author: hashutosh
Date: Mon Jul  1 22:26:51 2013
New Revision: 1498726

URL: http://svn.apache.org/r1498726
Log:
HIVE-4495 : Implement vectorized string substr (Timothy Chen via Ashutosh Chauhan)

Added:
    hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringSubstrColStart.java
    hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringSubstrColStartLen.java
Modified:
    hive/branches/vectorization/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorStringExpressions.java

Added: hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringSubstrColStart.java
URL: http://svn.apache.org/viewvc/hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringSubstrColStart.java?rev=1498726&view=auto
==============================================================================
--- hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringSubstrColStart.java (added)
+++ hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringSubstrColStart.java Mon Jul  1 22:26:51 2013
@@ -0,0 +1,177 @@
+package org.apache.hadoop.hive.ql.exec.vector.expressions;
+
+import java.io.UnsupportedEncodingException;
+
+import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+
+/**
+ * Vectorized substring with a single start index parameter: each output string is the tail of
+ * the corresponding input string, referenced in place rather than copied.
+ * The start index is a zero-based character (not byte) position. A negative index counts back
+ * from the end of the string, so -1 selects the last character.
+ * If the start index is invalid (outside of the string boundaries) then an empty string will be
+ * in the output.
+ */
+public class StringSubstrColStart extends VectorExpression {
+  // The UTF-8 encoding of "" is the zero-length array; shared and never mutated.
+  private static final byte[] EMPTY_STRING = new byte[0];
+
+  private final int startIdx;
+  private final int colNum;
+  private final int outputColumn;
+
+  /**
+   * @param colNum index in the batch of the input string column
+   * @param startIdx zero-based character index to start from; negative counts from the end
+   * @param outputColumn index in the batch of the output string column
+   */
+  public StringSubstrColStart(int colNum, int startIdx, int outputColumn) {
+    this.colNum = colNum;
+    this.startIdx = startIdx;
+    this.outputColumn = outputColumn;
+  }
+
+  /**
+   * Given the substring start index param it finds the starting offset of the passed in utf8
+   * string byte array that matches the index.
+   *
+   * @param utf8String byte array that holds the utf8 string
+   * @param start start offset of the byte array the string starts at
+   * @param len length of the bytes the string holds in the byte array
+   * @param substrStart the Start index for the substring operation (zero-based, in characters;
+   *        negative values count back from the end of the string)
+   * @return the absolute offset in utf8String of the first byte of the requested character,
+   *         or -1 if the index lies outside the string
+   */
+  static int getSubstrStartOffset(byte[] utf8String, int start, int len, int substrStart) {
+    // One past the last byte of the string. Both scans stop here; bounding a scan by len
+    // alone would be wrong for any string that does not begin at offset 0.
+    int end = start + len;
+
+    if (substrStart < 0) {
+      // Count characters: every byte that is not a 10xxxxxx continuation byte starts one.
+      int length = 0;
+      for (int i = start; i < end; ++i) {
+        if ((utf8String[i] & 0xc0) != 0x80) {
+          ++length;
+        }
+      }
+
+      if (-length > substrStart) {
+        // The index points before the first character.
+        return -1;
+      }
+
+      substrStart = length + substrStart;
+    }
+
+    int curIdx = -1;
+    for (int i = start; i < end; ++i) {
+      if ((utf8String[i] & 0xc0) != 0x80) {
+        ++curIdx;
+        if (curIdx == substrStart) {
+          return i;
+        }
+      }
+    }
+    return -1;
+  }
+
+  /**
+   * Writes the result for one row: a reference to the tail of the input string, or the empty
+   * string when the start index is out of range.
+   */
+  private void setSubstr(BytesColumnVector outV, int row, byte[] bytes, int start, int len) {
+    int offset = getSubstrStartOffset(bytes, start, len, startIdx);
+    if (offset != -1) {
+      // offset is absolute within bytes, so what remains is measured from the string's end.
+      outV.setRef(row, bytes, offset, start + len - offset);
+    } else {
+      outV.setRef(row, EMPTY_STRING, 0, EMPTY_STRING.length);
+    }
+  }
+
+  /**
+   * Evaluates the substring for every row of the batch, writing into the output column.
+   * Nulls in the input are propagated to the output.
+   */
+  @Override
+  public void evaluate(VectorizedRowBatch batch) {
+    if (childExpressions != null) {
+      super.evaluateChildren(batch);
+    }
+
+    BytesColumnVector inV = (BytesColumnVector) batch.cols[colNum];
+    BytesColumnVector outV = (BytesColumnVector) batch.cols[outputColumn];
+
+    int n = batch.size;
+
+    if (n == 0) {
+      return;
+    }
+
+    byte[][] vector = inV.vector;
+    int[] sel = batch.selected;
+    int[] len = inV.length;
+    int[] start = inV.start;
+
+    if (inV.isRepeating) {
+      // Entry 0 stands for every row, so only it is computed.
+      outV.isRepeating = true;
+      if (!inV.noNulls && inV.isNull[0]) {
+        outV.isNull[0] = true;
+        outV.noNulls = false;
+        outV.setRef(0, EMPTY_STRING, 0, EMPTY_STRING.length);
+      } else {
+        outV.noNulls = true;
+        // The string begins at start[0]; the selection vector plays no part here.
+        setSubstr(outV, 0, vector[0], start[0], len[0]);
+      }
+      return;
+    }
+
+    // Null handling and selection are resolved once per batch rather than once per row.
+    outV.isRepeating = false;
+    outV.noNulls = inV.noNulls;
+    if (batch.selectedInUse) {
+      if (!inV.noNulls) {
+        for (int i = 0; i != n; ++i) {
+          int selected = sel[i];
+          outV.isNull[selected] = inV.isNull[selected];
+          if (!inV.isNull[selected]) {
+            setSubstr(outV, selected, vector[selected], start[selected], len[selected]);
+          }
+        }
+      } else {
+        for (int i = 0; i != n; ++i) {
+          int selected = sel[i];
+          setSubstr(outV, selected, vector[selected], start[selected], len[selected]);
+        }
+      }
+    } else {
+      if (!inV.noNulls) {
+        System.arraycopy(inV.isNull, 0, outV.isNull, 0, n);
+        for (int i = 0; i != n; ++i) {
+          if (!inV.isNull[i]) {
+            setSubstr(outV, i, vector[i], start[i], len[i]);
+          }
+        }
+      } else {
+        for (int i = 0; i != n; ++i) {
+          setSubstr(outV, i, vector[i], start[i], len[i]);
+        }
+      }
+    }
+  }
+
+  @Override
+  public int getOutputColumn() {
+    return outputColumn;
+  }
+
+  @Override
+  public String getOutputType() {
+    return "string";
+  }
+}

Added: hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringSubstrColStartLen.java
URL: http://svn.apache.org/viewvc/hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringSubstrColStartLen.java?rev=1498726&view=auto
==============================================================================
--- hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringSubstrColStartLen.java (added)
+++ hive/branches/vectorization/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringSubstrColStartLen.java Mon Jul  1 22:26:51 2013
@@ -0,0 +1,195 @@
+package org.apache.hadoop.hive.ql.exec.vector.expressions;
+
+import java.io.UnsupportedEncodingException;
+
+import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+
+/**
+ * This class provides the implementation of vectorized substring, with a start index and length
+ * parameters. Both are measured in characters, not bytes; the start index is zero-based and a
+ * negative value counts back from the end of the string.
+ * If the start index is invalid (outside of the string boundaries) then an empty string will be
+ * in the output.
+ * If the length provided is longer than the string boundary, the substring ends at the end of
+ * the string. A length of zero or less yields an empty string.
+ */
+public class StringSubstrColStartLen extends VectorExpression {
+  // The UTF-8 encoding of "" is the zero-length array; shared and never mutated.
+  private static final byte[] EMPTY_STRING = new byte[0];
+
+  private final int startIdx;
+  private final int colNum;
+  private final int length;
+  private final int outputColumn;
+  // Scratch pair {byte offset, byte length} filled by populateSubstrOffsets for each row.
+  private final int[] offsetArray;
+
+  /**
+   * @param colNum index in the batch of the input string column
+   * @param startIdx zero-based character index to start from; negative counts from the end
+   * @param length maximum number of characters in the substring
+   * @param outputColumn index in the batch of the output string column
+   */
+  public StringSubstrColStartLen(int colNum, int startIdx, int length, int outputColumn) {
+    this.colNum = colNum;
+    this.startIdx = startIdx;
+    this.length = length;
+    this.outputColumn = outputColumn;
+    offsetArray = new int[2];
+  }
+
+  /**
+   * Populates the substring start offset and byte length based on the substring start and
+   * length params. On return offsetArray[0] is the absolute offset in utf8String of the first
+   * byte of the substring and offsetArray[1] is its length in bytes. When the substring is
+   * empty (start index outside the string, or substrLength not positive) both are -1.
+   *
+   * @param utf8String byte array that holds the utf8 string
+   * @param start start offset of the byte array the string starts at
+   * @param len length of the bytes the string holds in the byte array
+   * @param substrStart the Start index for the substring operation
+   * @param substrLength the length of the substring, in characters
+   * @param offsetArray the array that indexes are populated to. Assume its length >= 2.
+   */
+  static void populateSubstrOffsets(byte[] utf8String, int start, int len, int substrStart, int substrLength, int[] offsetArray) {
+    offsetArray[0] = -1;
+    offsetArray[1] = -1;
+
+    if (substrLength <= 0) {
+      // Nothing to select. Without this guard the end of the range is never matched and the
+      // whole tail of the string would be reported instead of an empty result.
+      return;
+    }
+
+    // One past the last byte of the string; both scans below stop here.
+    int end = start + len;
+
+    if (substrStart < 0) {
+      // Count characters: every byte that is not a 10xxxxxx continuation byte starts one.
+      int length = 0;
+      for (int i = start; i < end; ++i) {
+        if ((utf8String[i] & 0xc0) != 0x80) {
+          ++length;
+        }
+      }
+
+      if (-length > substrStart) {
+        return;
+      }
+
+      substrStart = length + substrStart;
+    }
+
+    int curIdx = -1;
+    for (int i = start; i < end; ++i) {
+      if ((utf8String[i] & 0xc0) != 0x80) {
+        ++curIdx;
+        if (curIdx == substrStart) {
+          offsetArray[0] = i;
+        } else if (curIdx - substrStart == substrLength) {
+          // i is the first byte of the first character past the substring.
+          offsetArray[1] = i - offsetArray[0];
+          return;
+        }
+      }
+    }
+
+    if (offsetArray[0] != -1) {
+      // The requested length runs past the end of the string: cap it there.
+      offsetArray[1] = end - offsetArray[0];
+    }
+  }
+
+  /** Writes one output row: the selected bytes of the input string, or "" if none. */
+  private void setSubstr(BytesColumnVector outV, int row, byte[] bytes, int start, int len) {
+    populateSubstrOffsets(bytes, start, len, startIdx, length, offsetArray);
+    if (offsetArray[0] != -1) {
+      outV.setRef(row, bytes, offsetArray[0], offsetArray[1]);
+    } else {
+      outV.setRef(row, EMPTY_STRING, 0, EMPTY_STRING.length);
+    }
+  }
+
+  /** Evaluates the substring for every row of the batch; input nulls become output nulls. */
+  @Override
+  public void evaluate(VectorizedRowBatch batch) {
+    if (childExpressions != null) {
+      super.evaluateChildren(batch);
+    }
+
+    BytesColumnVector inV = (BytesColumnVector) batch.cols[colNum];
+    BytesColumnVector outV = (BytesColumnVector) batch.cols[outputColumn];
+
+    int n = batch.size;
+
+    if (n == 0) {
+      return;
+    }
+
+    byte[][] vector = inV.vector;
+    int[] sel = batch.selected;
+    int[] len = inV.length;
+    int[] start = inV.start;
+
+    if (inV.isRepeating) {
+      // Entry 0 stands for every row, so only it is computed.
+      outV.isRepeating = true;
+      if (!inV.noNulls && inV.isNull[0]) {
+        outV.isNull[0] = true;
+        outV.noNulls = false;
+        outV.setRef(0, EMPTY_STRING, 0, EMPTY_STRING.length);
+      } else {
+        outV.noNulls = true;
+        // The string begins at start[0]; the selection vector plays no part here.
+        setSubstr(outV, 0, vector[0], start[0], len[0]);
+      }
+      return;
+    }
+
+    // Null handling and selection are resolved once per batch rather than once per row.
+    outV.isRepeating = false;
+    outV.noNulls = inV.noNulls;
+    if (batch.selectedInUse) {
+      if (!inV.noNulls) {
+        for (int i = 0; i != n; ++i) {
+          int selected = sel[i];
+          outV.isNull[selected] = inV.isNull[selected];
+          if (!inV.isNull[selected]) {
+            setSubstr(outV, selected, vector[selected], start[selected], len[selected]);
+          }
+        }
+      } else {
+        for (int i = 0; i != n; ++i) {
+          int selected = sel[i];
+          outV.isNull[selected] = false;
+          setSubstr(outV, selected, vector[selected], start[selected], len[selected]);
+        }
+      }
+    } else {
+      if (!inV.noNulls) {
+        System.arraycopy(inV.isNull, 0, outV.isNull, 0, n);
+        for (int i = 0; i != n; ++i) {
+          if (!inV.isNull[i]) {
+            setSubstr(outV, i, vector[i], start[i], len[i]);
+          }
+        }
+      } else {
+        for (int i = 0; i != n; ++i) {
+          outV.isNull[i] = false;
+          setSubstr(outV, i, vector[i], start[i], len[i]);
+        }
+      }
+    }
+  }
+
+  @Override
+  public int getOutputColumn() {
+    return outputColumn;
+  }
+
+  @Override
+  public String getOutputType() {
+    return "string";
+  }
+}

Modified: hive/branches/vectorization/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorStringExpressions.java
URL: http://svn.apache.org/viewvc/hive/branches/vectorization/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorStringExpressions.java?rev=1498726&r1=1498725&r2=1498726&view=diff
==============================================================================
--- hive/branches/vectorization/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorStringExpressions.java (original)
+++ hive/branches/vectorization/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorStringExpressions.java Mon Jul  1 22:26:51 2013
@@ -18,21 +18,20 @@
 
 package org.apache.hadoop.hive.ql.exec.vector.expressions;
 
+import java.io.UnsupportedEncodingException;
+import java.util.Arrays;
+
 import junit.framework.Assert;
 
 import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
 import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
 import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
 import org.apache.hadoop.hive.ql.exec.vector.expressions.gen.FilterStringColEqualStringScalar;
-import org.apache.hadoop.hive.ql.exec.vector.expressions.gen.FilterStringColLessStringScalar;
+import org.apache.hadoop.hive.ql.exec.vector.expressions.gen.FilterStringColGreaterEqualStringScalar;
 import org.apache.hadoop.hive.ql.exec.vector.expressions.gen.FilterStringColLessStringCol;
-import org.apache.hadoop.hive.ql.exec.vector.expressions.gen.
-  FilterStringColGreaterEqualStringScalar;
-import org.junit.Test;
-
-import java.io.UnsupportedEncodingException;
-import java.util.Arrays;
+import org.apache.hadoop.hive.ql.exec.vector.expressions.gen.FilterStringColLessStringScalar;
 import org.apache.hadoop.io.Text;
+import org.junit.Test;
 
 /**
  * Test vectorized expression and filter evaluation for strings.
@@ -167,79 +166,79 @@ public class TestVectorStringExpressions
     Assert.assertTrue(batch.selected[0] == 0);
     Assert.assertTrue(batch.selected[1] == 1);
   }
-  
+
   @Test
   public void testStringColCompareStringColFilter() {
     VectorizedRowBatch batch;
     VectorExpression expr;
-    
+
     /* input data
-     * 
+     *
      * col0       col1
      * ===============
      * blue       red
      * green      green
      * red        blue
      * NULL       red            col0 data is empty string if we un-set NULL property
-     */    
-    
+     */
+
     // nulls possible on left, right
     batch = makeStringBatchForColColCompare();
     expr = new FilterStringColLessStringCol(0,1);
     expr.evaluate(batch);
     Assert.assertEquals(1, batch.size);
     Assert.assertEquals(0, batch.selected[0]);
-    
+
     // no nulls possible
     batch = makeStringBatchForColColCompare();
     batch.cols[0].noNulls = true;
     batch.cols[1].noNulls = true;
     expr.evaluate(batch);
     Assert.assertEquals(2, batch.size);
-    Assert.assertEquals(3, batch.selected[1]);    
-    
+    Assert.assertEquals(3, batch.selected[1]);
+
     // nulls on left, no nulls on right
     batch = makeStringBatchForColColCompare();
     batch.cols[1].noNulls = true;
     expr.evaluate(batch);
     Assert.assertEquals(1, batch.size);
     Assert.assertEquals(0, batch.selected[0]);
-    
+
     // nulls on right, no nulls on left
     batch = makeStringBatchForColColCompare();
     batch.cols[0].noNulls = true;
     batch.cols[1].isNull[3] = true;
     expr.evaluate(batch);
     Assert.assertEquals(1, batch.size);
-    Assert.assertEquals(0, batch.selected[0]);    
-    
+    Assert.assertEquals(0, batch.selected[0]);
+
     // Now vary isRepeating
     // nulls possible on left, right
-    
+
     // left repeats
     batch = makeStringBatchForColColCompare();
     batch.cols[0].isRepeating = true;
     expr.evaluate(batch);
     Assert.assertEquals(3, batch.size);
     Assert.assertEquals(3, batch.selected[2]);
-    
+
     // right repeats
     batch = makeStringBatchForColColCompare();
     batch.cols[1].isRepeating = true;
     expr.evaluate(batch);
     Assert.assertEquals(2, batch.size); // first 2 qualify
     Assert.assertEquals(1, batch.selected[1]);
-    
+
     // left and right repeat
     batch = makeStringBatchForColColCompare();
     batch.cols[0].isRepeating = true;
     batch.cols[1].isRepeating = true;
     expr.evaluate(batch);
     Assert.assertEquals(4, batch.size);
-    
+
     // Now vary isRepeating
     // nulls possible only on left
-    
+
     // left repeats
     batch = makeStringBatchForColColCompare();
     batch.cols[0].isRepeating = true;
@@ -247,7 +246,7 @@ public class TestVectorStringExpressions
     expr.evaluate(batch);
     Assert.assertEquals(3, batch.size);
     Assert.assertEquals(3, batch.selected[2]);
-    
+
     // left repeats and is null
     batch = makeStringBatchForColColCompare();
     batch.cols[0].isRepeating = true;
@@ -255,15 +254,15 @@ public class TestVectorStringExpressions
     batch.cols[0].isNull[0] = true;
     expr.evaluate(batch);
     Assert.assertEquals(0, batch.size);
-    
+
     // right repeats
     batch = makeStringBatchForColColCompare();
     batch.cols[1].isRepeating = true;
     batch.cols[1].noNulls = true;
     expr.evaluate(batch);
-    Assert.assertEquals(3, batch.size); 
+    Assert.assertEquals(3, batch.size);
     Assert.assertEquals(1, batch.selected[1]);
-    
+
     // left and right repeat
     batch = makeStringBatchForColColCompare();
     batch.cols[0].isRepeating = true;
@@ -272,10 +271,10 @@ public class TestVectorStringExpressions
     expr.evaluate(batch);
     Assert.assertEquals(4, batch.size);
 
-    
+
     // Now vary isRepeating
     // nulls possible only on right
-    
+
     // left repeats
     batch = makeStringBatchForColColCompare();
     batch.cols[0].isRepeating = true;
@@ -284,22 +283,22 @@ public class TestVectorStringExpressions
     expr.evaluate(batch);
     Assert.assertEquals(2, batch.size);
     Assert.assertEquals(3, batch.selected[1]);
-    
+
     // right repeats
     batch = makeStringBatchForColColCompare();
     batch.cols[1].isRepeating = true;
     batch.cols[0].noNulls = true;
     expr.evaluate(batch);
-    Assert.assertEquals(3, batch.size); 
+    Assert.assertEquals(3, batch.size);
     Assert.assertEquals(3, batch.selected[2]);
-    
+
     // right repeats and is null
     batch = makeStringBatchForColColCompare();
     batch.cols[1].isRepeating = true;
     batch.cols[0].noNulls = true;
     batch.cols[1].isNull[0] = true;
     expr.evaluate(batch);
-    Assert.assertEquals(0, batch.size); 
+    Assert.assertEquals(0, batch.size);
 
     // left and right repeat
     batch = makeStringBatchForColColCompare();
@@ -308,7 +307,7 @@ public class TestVectorStringExpressions
     batch.cols[0].noNulls = true;
     expr.evaluate(batch);
     Assert.assertEquals(4, batch.size);
-    
+
     // left and right repeat and right is null
     batch = makeStringBatchForColColCompare();
     batch.cols[0].isRepeating = true;
@@ -316,7 +315,7 @@ public class TestVectorStringExpressions
     batch.cols[0].noNulls = true;
     batch.cols[1].isNull[0] = true;
     expr.evaluate(batch);
-    Assert.assertEquals(0, batch.size);  
+    Assert.assertEquals(0, batch.size);
   }
 
   VectorizedRowBatch makeStringBatch() {
@@ -537,7 +536,7 @@ public class TestVectorStringExpressions
     batch.size = 3;
     return batch;
   }
-  
+
   private VectorizedRowBatch makeStringBatchForColColCompare() {
     VectorizedRowBatch batch = new VectorizedRowBatch(3);
     BytesColumnVector v = new BytesColumnVector();
@@ -565,10 +564,10 @@ public class TestVectorStringExpressions
     v2.setRef(3, red, 0, red.length);
     v2.isNull[3] = false;
     v2.noNulls = false;
-    
+
     batch.size = 4;
     return batch;
-  }  
+  }
 
   @Test
   public void testStringLike() {
@@ -938,4 +937,438 @@ public class TestVectorStringExpressions
         outCol.start[0], outCol.length[0]);
     Assert.assertEquals(0, cmp);
   }
-}
+
+  @Test
+  public void testSubstrStart() throws UnsupportedEncodingException {
+    // Exercises StringSubstrColStart on a three-row batch. First case: no nulls and no repeating
+    VectorizedRowBatch batch = new VectorizedRowBatch(2);
+    BytesColumnVector v = new BytesColumnVector();
+    batch.cols[0] = v;
+    BytesColumnVector outV = new BytesColumnVector();
+    batch.cols[1] = outV;
+    byte[] data1 = "abcd string".getBytes("UTF-8");
+    byte[] data2 = "efgh string".getBytes("UTF-8");
+    byte[] data3 = "efgh".getBytes("UTF-8");
+    batch.size = 3;
+    v.noNulls = true;
+    v.setRef(0, data1, 0, data1.length);
+    v.isNull[0] = false;
+    v.setRef(1, data2, 0, data2.length);
+    v.isNull[1] = false;
+    v.setRef(2, data3, 0, data3.length);
+    v.isNull[2] = false;
+
+    StringSubstrColStart expr = new StringSubstrColStart(0, 5, 1);
+    expr.evaluate(batch);
+    BytesColumnVector outCol = (BytesColumnVector) batch.cols[1];
+    Assert.assertEquals(3, batch.size);
+    Assert.assertTrue(outCol.noNulls);
+    Assert.assertFalse(outCol.isRepeating);
+    byte[] expected = "string".getBytes("UTF-8");
+    Assert.assertEquals(0,
+    StringExpr.compare(
+            expected, 0, expected.length, outCol.vector[0], outCol.start[0], outCol.length[0]
+        )
+    );
+
+    Assert.assertEquals(0,
+    StringExpr.compare(
+            expected, 0, expected.length, outCol.vector[1], outCol.start[1], outCol.length[1]
+        )
+    );
+
+    // This yields empty because starting idx is out of bounds.
+    Assert.assertEquals(0,
+    StringExpr.compare(
+            emptyString, 0, emptyString.length, outCol.vector[2], outCol.start[2], outCol.length[2]
+        )
+    );
+
+    outCol.noNulls = false; // stale flag: the next evaluate() is asserted to reset it
+    outCol.isRepeating = true; // stale flag: the next evaluate() is asserted to reset it
+
+    // Testing negative substring index.
+    // For a string with length 11, start idx 5 should yield same results as -6
+
+    expr = new StringSubstrColStart(0, -6, 1);
+    expr.evaluate(batch);
+    outCol = (BytesColumnVector) batch.cols[1];
+    Assert.assertEquals(3, batch.size);
+    Assert.assertTrue(outCol.noNulls);
+    Assert.assertFalse(outCol.isRepeating);
+    Assert.assertEquals(0,
+    StringExpr.compare(
+            expected, 0, expected.length, outCol.vector[0], outCol.start[0], outCol.length[0]
+        )
+    );
+
+    Assert.assertEquals(0,
+    StringExpr.compare(
+            expected, 0, expected.length, outCol.vector[1], outCol.start[1], outCol.length[1]
+        )
+    );
+
+    Assert.assertEquals(0,
+    StringExpr.compare(
+            emptyString, 0, emptyString.length, outCol.vector[2], outCol.start[2], outCol.length[2]
+        )
+    );
+
+    outCol.noNulls = false;
+    outCol.isRepeating = true;
+
+    // Testing substring starting from index 0
+
+    expr = new StringSubstrColStart(0, 0, 1);
+    expr.evaluate(batch);
+    Assert.assertEquals(3, batch.size);
+    Assert.assertTrue(outCol.noNulls);
+    Assert.assertFalse(outCol.isRepeating);
+
+    Assert.assertEquals(0,
+    StringExpr.compare(
+            data1, 0, data1.length, outCol.vector[0], outCol.start[0], outCol.length[0]
+        )
+    );
+
+    Assert.assertEquals(0,
+    StringExpr.compare(
+            data2, 0, data2.length, outCol.vector[1], outCol.start[1], outCol.length[1]
+        )
+    );
+
+    Assert.assertEquals(0,
+    StringExpr.compare(
+            data3, 0, data3.length, outCol.vector[2], outCol.start[2], outCol.length[2]
+        )
+    );
+
+    outV.noNulls = false;
+    outV.isRepeating = true;
+
+    // Testing with nulls
+
+    expr = new StringSubstrColStart(0, 5, 1);
+    v.noNulls = false;
+    v.isNull[0] = true;
+    expr.evaluate(batch);
+    Assert.assertEquals(3, batch.size);
+    Assert.assertFalse(outV.noNulls);
+    Assert.assertTrue(outV.isNull[0]);
+
+    Assert.assertEquals(0,
+    StringExpr.compare(
+            expected, 0, expected.length, outCol.vector[1], outCol.start[1], outCol.length[1]
+        )
+    );
+
+    Assert.assertEquals(0,
+    StringExpr.compare(
+            emptyString, 0, emptyString.length, outCol.vector[2], outCol.start[2], outCol.length[2]
+        )
+    );
+
+    outCol.noNulls = false;
+    outCol.isRepeating = false;
+
+    // Testing with repeating and no nulls
+
+    outV = new BytesColumnVector();
+    v = new BytesColumnVector();
+    v.isRepeating = true;
+    v.noNulls = true;
+    v.setRef(0, data1, 0, data1.length);
+    batch = new VectorizedRowBatch(2);
+    batch.cols[0] = v;
+    batch.cols[1] = outV;
+    expr.evaluate(batch);
+    outCol = (BytesColumnVector) batch.cols[1];
+    expected = "string".getBytes("UTF-8");
+    Assert.assertTrue(outV.isRepeating);
+    Assert.assertTrue(outV.noNulls);
+    Assert.assertEquals(0,
+    StringExpr.compare(
+            expected, 0, expected.length, outCol.vector[0], outCol.start[0], outCol.length[0]
+        )
+    );
+
+    // Testing multiByte string substring
+
+    v = new BytesColumnVector();
+    v.isRepeating = false;
+    v.noNulls = true;
+    v.setRef(0, multiByte, 0, 10);
+    batch.cols[0] = v;
+    batch.cols[1] = outV;
+    outV.isRepeating = true;
+    outV.noNulls = false;
+    expr = new StringSubstrColStart(0, 2, 1);
+    batch.size = 1;
+    expr.evaluate(batch);
+    outCol = (BytesColumnVector) batch.cols[1];
+    Assert.assertFalse(outV.isRepeating);
+    Assert.assertTrue(outV.noNulls);
+    Assert.assertEquals(0,
+    StringExpr.compare(
+            // The 3rd char (start idx 2) begins at byte 3; 7 bytes remain as the reference ends at byte 10
+            multiByte, 3, 10 - 3, outCol.vector[0], outCol.start[0], outCol.length[0]
+        )
+    );
+
+
+    // Testing multiByte string with reference starting mid array
+
+    v = new BytesColumnVector();
+    v.isRepeating = false;
+    v.noNulls = true;
+    v.setRef(0, multiByte, 3, 10);
+    batch.cols[0] = v;
+    batch.cols[1] = outV;
+    outV.isRepeating = true;
+    outV.noNulls = false;
+    outCol = (BytesColumnVector) batch.cols[1];
+    expr = new StringSubstrColStart(0, 1, 1);
+    expr.evaluate(batch);
+    Assert.assertFalse(outV.isRepeating);
+    Assert.assertTrue(outV.noNulls);
+    Assert.assertEquals(0,
+    StringExpr.compare(
+            // The reference begins at byte 3; start idx 1 is expected to select the char that begins at byte 6.
+            // NOTE(review): setRef(0, multiByte, 3, 10) spans bytes [3, 13), yet 10 - 6 stops at byte 10 - confirm.
+            multiByte, 6, 10 - 6, outCol.vector[0], outCol.start[0], outCol.length[0]
+        )
+    );
+  }
+
+  @Test
+  public void testSubstrStartLen() throws UnsupportedEncodingException {
+    // Testing no nulls and no repeating
+
+    VectorizedRowBatch batch = new VectorizedRowBatch(2);
+    BytesColumnVector v = new BytesColumnVector();
+    batch.cols[0] = v;
+    BytesColumnVector outV = new BytesColumnVector();
+    batch.cols[1] = outV;
+    byte[] data1 = "abcd string".getBytes("UTF-8");
+    byte[] data2 = "efgh string".getBytes("UTF-8");
+    byte[] data3 = "efgh".getBytes("UTF-8");
+    batch.size = 3;
+    v.noNulls = true;
+    v.setRef(0, data1, 0, data1.length);
+    v.isNull[0] = false;
+    v.setRef(1, data2, 0, data2.length);
+    v.isNull[1] = false;
+    v.setRef(2, data3, 0, data3.length);
+    v.isNull[2] = false;
+
+    outV.isRepeating = true;
+    outV.noNulls = false;
+
+    StringSubstrColStartLen expr = new StringSubstrColStartLen(0, 5, 6, 1);
+    expr.evaluate(batch);
+    BytesColumnVector outCol = (BytesColumnVector) batch.cols[1];
+    Assert.assertEquals(3, batch.size);
+    Assert.assertTrue(outCol.noNulls);
+    Assert.assertFalse(outCol.isRepeating);
+    byte[] expected = "string".getBytes("UTF-8");
+    Assert.assertEquals(0,
+    StringExpr.compare(
+            expected, 0, expected.length, outCol.vector[0], outCol.start[0], outCol.length[0]
+        )
+    );
+
+    Assert.assertEquals(0,
+    StringExpr.compare(
+            expected, 0, expected.length, outCol.vector[1], outCol.start[1], outCol.length[1]
+        )
+    );
+
+    Assert.assertEquals(0,
+    StringExpr.compare(
+            emptyString, 0, emptyString.length, outCol.vector[2], outCol.start[2], outCol.length[2]
+        )
+    );
+
+    // Testing negative substring index
+    outV.isRepeating = true;
+    outV.noNulls = false;
+
+    expr = new StringSubstrColStartLen(0, -6, 6, 1);
+    expr.evaluate(batch);
+    outCol = (BytesColumnVector) batch.cols[1];
+    Assert.assertTrue(outCol.noNulls);
+    Assert.assertFalse(outCol.isRepeating);
+    Assert.assertEquals(3, batch.size);
+
+    Assert.assertEquals(0,
+    StringExpr.compare(
+            expected, 0, expected.length, outCol.vector[0], outCol.start[0], outCol.length[0]
+        )
+    );
+
+    Assert.assertEquals(0,
+        StringExpr.compare(
+            expected, 0, expected.length, outCol.vector[1], outCol.start[1], outCol.length[1]
+        )
+    );
+
+    // This yields empty because starting index is out of bounds
+    Assert.assertEquals(0,
+    StringExpr.compare(
+            emptyString, 0, emptyString.length, outCol.vector[2], outCol.start[2], outCol.length[2]
+        )
+    );
+
+    //Testing substring index starting with 0 and length equal to array length
+
+    outV.isRepeating = true;
+    outV.noNulls = false;
+
+    expr = new StringSubstrColStartLen(0, 0, 11, 1);
+    outCol = (BytesColumnVector) batch.cols[1];
+    expr.evaluate(batch);
+    Assert.assertEquals(3, batch.size);
+    Assert.assertTrue(outCol.noNulls);
+    Assert.assertFalse(outCol.isRepeating);
+    Assert.assertEquals(0,
+    StringExpr.compare(
+            data1, 0, data1.length, outCol.vector[0], outCol.start[0], outCol.length[0]
+        )
+    );
+
+    Assert.assertEquals(0,
+    StringExpr.compare(
+            data2, 0, data2.length, outCol.vector[1], outCol.start[1], outCol.length[1]
+        )
+    );
+
+    Assert.assertEquals(0,
+    StringExpr.compare(
+            data3, 0, data3.length, outCol.vector[2], outCol.start[2], outCol.length[2]
+        )
+    );
+
+
+    // Testing setting length larger than array length, which should cap to the length itself
+
+    outV.isRepeating = true;
+    outV.noNulls = false;
+
+    expr = new StringSubstrColStartLen(0, 5, 10, 1);
+    expr.evaluate(batch);
+    outCol = (BytesColumnVector) batch.cols[1];
+    Assert.assertEquals(3, batch.size);
+    Assert.assertTrue(outCol.noNulls);
+    Assert.assertFalse(outCol.isRepeating);
+    Assert.assertEquals(0,
+    StringExpr.compare(
+            expected, 0, expected.length, outCol.vector[0], outCol.start[0], outCol.length[0]
+        )
+    );
+
+    Assert.assertEquals(0,
+    StringExpr.compare(
+            expected, 0, expected.length, outCol.vector[1], outCol.start[1], outCol.length[1]
+        )
+    );
+
+    Assert.assertEquals(0,
+    StringExpr.compare(
+            emptyString, 0, emptyString.length, outCol.vector[2], outCol.start[2], outCol.length[2]
+        )
+    );
+
+    outV.isRepeating = true;
+    outV.noNulls = true;
+
+    // Testing with nulls
+
+    v.noNulls = false;
+    v.isNull[0] = true;
+    expr.evaluate(batch);
+    Assert.assertEquals(3, batch.size);
+    Assert.assertFalse(outV.noNulls);
+    Assert.assertTrue(outV.isNull[0]);
+    Assert.assertFalse(outCol.isRepeating);
+    Assert.assertEquals(0,
+    StringExpr.compare(
+            expected, 0, expected.length, outCol.vector[1], outCol.start[1], outCol.length[1]
+        )
+    );
+
+    Assert.assertEquals(0,
+        StringExpr.compare(
+            emptyString, 0, emptyString.length, outCol.vector[2], outCol.start[2], outCol.length[2]
+        )
+    );
+
+
+    // Testing with repeating and no nulls
+    outV = new BytesColumnVector();
+    v = new BytesColumnVector();
+    outV.isRepeating = false;
+    outV.noNulls = true;
+    v.isRepeating = true;
+    v.noNulls = false;
+    v.setRef(0, data1, 0, data1.length);
+    batch = new VectorizedRowBatch(2);
+    batch.cols[0] = v;
+    batch.cols[1] = outV;
+    expr.evaluate(batch);
+    outCol = (BytesColumnVector) batch.cols[1];
+    Assert.assertTrue(outCol.noNulls);
+    Assert.assertTrue(outCol.isRepeating);
+
+    Assert.assertEquals(0,
+    StringExpr.compare(
+            expected, 0, expected.length, outCol.vector[0], outCol.start[0], outCol.length[0]
+        )
+    );
+
+    // Testing with multiByte String
+    v = new BytesColumnVector();
+    v.isRepeating = false;
+    v.noNulls = true;
+    batch.size = 1;
+    v.setRef(0, multiByte, 0, 10);
+    batch.cols[0] = v;
+    batch.cols[1] = outV;
+    outV.isRepeating = true;
+    outV.noNulls = false;
+    expr = new StringSubstrColStartLen(0, 2, 2, 1);
+    expr.evaluate(batch);
+    Assert.assertEquals(1, batch.size);
+    Assert.assertFalse(outV.isRepeating);
+    Assert.assertTrue(outV.noNulls);
+    Assert.assertEquals(0,
+    StringExpr.compare(
+            // 3rd char starts at index 3, and with length 2 it is covering the rest of the array.
+            multiByte, 3, 10 - 3, outCol.vector[0], outCol.start[0], outCol.length[0]
+        )
+    );
+
+    // Testing multiByte string with reference set to mid array
+    v = new BytesColumnVector();
+    v.isRepeating = false;
+    v.noNulls = true;
+    outV = new BytesColumnVector();
+    batch.size = 1;
+    v.setRef(0, multiByte, 3, 7);
+    batch.cols[0] = v;
+    batch.cols[1] = outV;
+    outV.isRepeating = true;
+    outV.noNulls = false;
+    expr = new StringSubstrColStartLen(0, 1, 2, 1);
+    expr.evaluate(batch);
+    outCol = (BytesColumnVector) batch.cols[1];
+    Assert.assertEquals(1, batch.size);
+    Assert.assertFalse(outV.isRepeating);
+    Assert.assertTrue(outV.noNulls);
+    Assert.assertEquals(0,
+    StringExpr.compare(
+            // 2nd substring index refers to the 6th index (last char in the array)
+            multiByte, 6, 10 - 6, outCol.vector[0], outCol.start[0], outCol.length[0]
+        )
+    );
+  }
+ }