You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by ha...@apache.org on 2013/10/03 18:10:32 UTC

svn commit: r1528917 - in /hive/trunk/ql/src: java/org/apache/hadoop/hive/ql/exec/vector/expressions/ test/org/apache/hadoop/hive/ql/exec/vector/expressions/

Author: hashutosh
Date: Thu Oct  3 16:10:32 2013
New Revision: 1528917

URL: http://svn.apache.org/r1528917
Log:
HIVE-4642 : Implement vectorized RLIKE and REGEXP filter expressions (Teddy Choi via Ashutosh Chauhan)

Added:
    hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/AbstractFilterStringColLikeStringScalar.java
    hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterStringColRegExpStringScalar.java
Modified:
    hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterStringColLikeStringScalar.java
    hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorStringExpressions.java

Added: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/AbstractFilterStringColLikeStringScalar.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/AbstractFilterStringColLikeStringScalar.java?rev=1528917&view=auto
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/AbstractFilterStringColLikeStringScalar.java (added)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/AbstractFilterStringColLikeStringScalar.java Thu Oct  3 16:10:32 2013
@@ -0,0 +1,412 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.exec.vector.expressions;
+
+import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.apache.hadoop.io.Text;
+
+import java.io.UnsupportedEncodingException;
+import java.nio.ByteBuffer;
+import java.nio.CharBuffer;
+import java.nio.charset.Charset;
+import java.nio.charset.CharsetDecoder;
+import java.nio.charset.CodingErrorAction;
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * An abstract class for LIKE and REGEXP expressions. LIKE and REGEXP expression share similar
+ * functions, but they have different grammars. AbstractFilterStringColLikeStringScalar class
+ * provides shared classes and methods. Each subclass handles its grammar.
+ */
+public abstract class AbstractFilterStringColLikeStringScalar extends VectorExpression {
+  private static final long serialVersionUID = 1L;
+
+  private int colNum;
+  private String pattern;
+  transient Checker checker;
+
+  public AbstractFilterStringColLikeStringScalar() {
+    super();
+  }
+
+  public AbstractFilterStringColLikeStringScalar(int colNum, String pattern) {
+    this.colNum = colNum;
+    this.pattern = pattern;
+  }
+
+  protected abstract List<CheckerFactory> getCheckerFactories();
+
+  /**
+   * Selects an optimized checker for a given string.
+   * @param pattern
+   * @return
+   */
+  private Checker createChecker(String pattern) {
+    for (CheckerFactory checkerFactory : getCheckerFactories()) {
+      Checker checker = checkerFactory.tryCreate(pattern);
+      if (checker != null) {
+        return checker;
+      }
+    }
+    return null;
+  }
+
+  @Override
+  public void evaluate(VectorizedRowBatch batch) {
+    this.checker = createChecker(pattern);
+
+    if (childExpressions != null) {
+      super.evaluateChildren(batch);
+    }
+
+    BytesColumnVector inputColVector = (BytesColumnVector) batch.cols[colNum];
+    int[] sel = batch.selected;
+    boolean[] nullPos = inputColVector.isNull;
+    int n = batch.size;
+    byte[][] vector = inputColVector.vector;
+    int[] length = inputColVector.length;
+    int[] start = inputColVector.start;
+
+    // return immediately if batch is empty
+    if (n == 0) {
+      return;
+    }
+
+    if (inputColVector.noNulls) {
+      if (inputColVector.isRepeating) {
+
+        // All must be selected otherwise size would be zero Repeating property will not change.
+        if (!checker.check(vector[0], start[0], length[0])) {
+
+          // Entire batch is filtered out.
+          batch.size = 0;
+        }
+      } else if (batch.selectedInUse) {
+        int newSize = 0;
+
+        for (int j = 0; j != n; j++) {
+          int i = sel[j];
+          if (checker.check(vector[i], start[i], length[i])) {
+            sel[newSize++] = i;
+          }
+        }
+
+        batch.size = newSize;
+      } else {
+        int newSize = 0;
+        for (int i = 0; i != n; i++) {
+          if (checker.check(vector[i], start[i], length[i])) {
+            sel[newSize++] = i;
+          }
+        }
+
+        if (newSize < n) {
+          batch.size = newSize;
+          batch.selectedInUse = true;
+        }
+      }
+    } else {
+      if (inputColVector.isRepeating) {
+
+        //All must be selected otherwise size would be zero. Repeating property will not change.
+        if (!nullPos[0]) {
+          if (!checker.check(vector[0], start[0], length[0])) {
+
+            //Entire batch is filtered out.
+            batch.size = 0;
+          }
+        } else {
+          batch.size = 0;
+        }
+      } else if (batch.selectedInUse) {
+        int newSize = 0;
+
+        for (int j = 0; j != n; j++) {
+          int i = sel[j];
+          if (!nullPos[i]) {
+            if (checker.check(vector[i], start[i], length[i])) {
+              sel[newSize++] = i;
+            }
+          }
+        }
+
+        //Change the selected vector
+        batch.size = newSize;
+      } else {
+        int newSize = 0;
+
+        for (int i = 0; i != n; i++) {
+          if (!nullPos[i]) {
+            if (checker.check(vector[i], start[i], length[i])) {
+              sel[newSize++] = i;
+            }
+          }
+        }
+
+        if (newSize < n) {
+          batch.size = newSize;
+          batch.selectedInUse = true;
+        }
+
+        /* If every row qualified (newSize==n), then we can ignore the sel vector to streamline
+         * future operations. So selectedInUse will remain false.
+         */
+      }
+    }
+  }
+
+  @Override
+  public int getOutputColumn() {
+    return -1;
+  }
+
+  @Override
+  public String getOutputType() {
+    return "boolean";
+  }
+
+  /**
+   * A Checker contains a pattern and checks whether a given string matches or not.
+   */
+  public interface Checker {
+    /**
+     * Checks whether the given string matches with its pattern.
+     * @param byteS The byte array that contains the string
+     * @param start The start position of the string
+     * @param len The length of the string
+     * @return Whether it matches or not.
+     */
+    boolean check(byte[] byteS, int start, int len);
+  }
+
+  /**
+   * A CheckerFactory creates checkers of its kind.
+   */
+  protected interface CheckerFactory {
+    /**
+     * If the given pattern is acceptable for its checker class, it creates and returns a checker.
+     * Otherwise, it returns <code>null</code>.
+     * @param pattern
+     * @return If the pattern is acceptable, a <code>Checker</code> object. Otherwise
+     * <code>null</code>.
+     */
+    Checker tryCreate(String pattern);
+  }
+
+  /**
+   * Matches the whole string to its pattern.
+   */
+  protected static class NoneChecker implements Checker {
+    byte [] byteSub;
+
+    NoneChecker(String pattern) {
+      try {
+        byteSub = pattern.getBytes("UTF-8");
+      } catch (UnsupportedEncodingException e) {
+        throw new RuntimeException(e);
+      }
+    }
+
+    public boolean check(byte[] byteS, int start, int len) {
+      int lenSub = byteSub.length;
+      if (len != lenSub) {
+        return false;
+      }
+      for (int i = start, j = 0; j < len; i++, j++) {
+        if (byteS[i] != byteSub[j]) {
+          return false;
+        }
+      }
+      return true;
+    }
+  }
+
+  /**
+   * Matches the beginning of each string to a pattern.
+   */
+  protected static class BeginChecker implements Checker {
+    byte[] byteSub;
+
+    BeginChecker(String pattern) {
+      try {
+        byteSub = pattern.getBytes("UTF-8");
+      } catch (UnsupportedEncodingException e) {
+        throw new RuntimeException(e);
+      }
+    }
+
+    public boolean check(byte[] byteS, int start, int len) {
+      if (len < byteSub.length) {
+        return false;
+      }
+      for (int i = start, j = 0; j < byteSub.length; i++, j++) {
+        if (byteS[i] != byteSub[j]) {
+          return false;
+        }
+      }
+      return true;
+    }
+  }
+
+  /**
+   * Matches the ending of each string to its pattern.
+   */
+  protected static class EndChecker implements Checker {
+    byte[] byteSub;
+    EndChecker(String pattern) {
+      try {
+        byteSub = pattern.getBytes("UTF-8");
+      } catch (UnsupportedEncodingException e) {
+        throw new RuntimeException(e);
+      }
+    }
+
+    public boolean check(byte[] byteS, int start, int len) {
+      int lenSub = byteSub.length;
+      if (len < lenSub) {
+        return false;
+      }
+      for (int i = start + len - lenSub, j = 0; j < lenSub; i++, j++) {
+        if (byteS[i] != byteSub[j]) {
+          return false;
+        }
+      }
+      return true;
+    }
+  }
+
+  /**
+   * Matches the middle of each string to its pattern.
+   */
+  protected static class MiddleChecker implements Checker {
+    byte[] byteSub;
+    int lenSub;
+
+    MiddleChecker(String pattern) {
+      try {
+        byteSub = pattern.getBytes("UTF-8");
+        lenSub = byteSub.length;
+      } catch (UnsupportedEncodingException e) {
+        throw new RuntimeException(e);
+      }
+    }
+
+    public boolean check(byte[] byteS, int start, int len) {
+      if (len < lenSub) {
+        return false;
+      }
+      int end = start + len - lenSub + 1;
+      boolean match = false;
+      for (int i = start; i < end; i++) {
+        match = true;
+        for (int j = 0; j < lenSub; j++) {
+          if (byteS[i + j] != byteSub[j]) {
+            match = false;
+            break;
+          }
+        }
+        if (match) {
+          return true;
+        }
+      }
+      return match;
+    }
+  }
+
+  /**
+   * Matches each string to a pattern with Java regular expression package.
+   */
+  protected static class ComplexChecker implements Checker {
+    Pattern compiledPattern;
+    Matcher matcher;
+    FastUTF8Decoder decoder;
+
+    ComplexChecker(String pattern) {
+      compiledPattern = Pattern.compile(pattern);
+      matcher = compiledPattern.matcher("");
+      decoder = new FastUTF8Decoder();
+    }
+
+    public boolean check(byte[] byteS, int start, int len) {
+      // Match the given bytes with the like pattern
+      matcher.reset(decoder.decodeUnsafely(byteS, start, len));
+      return matcher.matches();
+    }
+  }
+
+  /**
+   * A fast UTF-8 decoder that caches necessary objects for decoding.
+   */
+  private static class FastUTF8Decoder {
+    CharsetDecoder decoder;
+    ByteBuffer byteBuffer;
+    CharBuffer charBuffer;
+
+    public FastUTF8Decoder() {
+      decoder = Charset.forName("UTF-8").newDecoder()
+          .onMalformedInput(CodingErrorAction.REPLACE)
+          .onUnmappableCharacter(CodingErrorAction.REPLACE);
+      byteBuffer = ByteBuffer.allocate(4);
+      charBuffer = CharBuffer.allocate(4);
+    }
+
+    public CharBuffer decodeUnsafely(byte[] byteS, int start, int len) {
+      // Prepare buffers
+      if (byteBuffer.capacity() < len) {
+        byteBuffer = ByteBuffer.allocate(len * 2);
+      }
+      byteBuffer.clear();
+      byteBuffer.put(byteS, start, len);
+      byteBuffer.flip();
+
+      int maxChars = (int) (byteBuffer.capacity() * decoder.maxCharsPerByte());
+      if (charBuffer.capacity() < maxChars) {
+        charBuffer = CharBuffer.allocate(maxChars);
+      }
+      charBuffer.clear();
+
+      // Decode UTF-8
+      decoder.reset();
+      decoder.decode(byteBuffer, charBuffer, true);
+      decoder.flush(charBuffer);
+      charBuffer.flip();
+
+      return charBuffer;
+    }
+  }
+
+  public int getColNum() {
+    return colNum;
+  }
+
+  public void setColNum(int colNum) {
+    this.colNum = colNum;
+  }
+
+  public String getPattern() {
+    return pattern;
+  }
+
+  public void setPattern(String pattern) {
+    this.pattern = pattern;
+  }
+}

Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterStringColLikeStringScalar.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterStringColLikeStringScalar.java?rev=1528917&r1=1528916&r2=1528917&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterStringColLikeStringScalar.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterStringColLikeStringScalar.java Thu Oct  3 16:10:32 2013
@@ -18,552 +18,106 @@
 
 package org.apache.hadoop.hive.ql.exec.vector.expressions;
 
-import static org.apache.hadoop.hive.ql.udf.UDFLike.likePatternToRegExp;
+import org.apache.hadoop.hive.ql.udf.UDFLike;
+import org.apache.hadoop.io.Text;
 
-import java.nio.ByteBuffer;
-import java.nio.CharBuffer;
-import java.nio.charset.Charset;
-import java.nio.charset.CharsetDecoder;
-import java.nio.charset.CodingErrorAction;
+import java.util.Arrays;
+import java.util.List;
+import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
-import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
-import org.apache.hadoop.io.Text;
-
 /**
  * Evaluate LIKE filter on a batch for a vector of strings.
  */
-public class FilterStringColLikeStringScalar extends VectorExpression {
+public class FilterStringColLikeStringScalar extends AbstractFilterStringColLikeStringScalar {
   private static final long serialVersionUID = 1L;
-  private int colNum;
-  private Pattern compiledPattern;
-  private PatternType type = PatternType.NONE;
-  private String simpleStringPattern;
-  private final Text simplePattern = new Text();
-
-  private transient ByteBuffer byteBuffer;
-  private transient CharBuffer charBuffer;
-  private transient final CharsetDecoder decoder;
-
-  // Doing characters comparison directly instead of regular expression
-  // matching for simple patterns like "%abc%".
-  public enum PatternType {
-    NONE, // "abc"
-    BEGIN, // "abc%"
-    END, // "%abc"
-    MIDDLE, // "%abc%"
-    COMPLEX, // all other cases, such as "ab%c_de"
-  }
 
-  public FilterStringColLikeStringScalar(int colNum, Text likePattern) {
-    this();
-    this.colNum = colNum;
-    String stringLikePattern = likePattern.toString();
-    parseSimplePattern(stringLikePattern);
-    if (type == PatternType.COMPLEX) {
-      compiledPattern = Pattern.compile(likePatternToRegExp(stringLikePattern));
-    }
-  }
+  private transient final static List<CheckerFactory> checkerFactories = Arrays.asList(
+      new BeginCheckerFactory(),
+      new EndCheckerFactory(),
+      new MiddleCheckerFactory(),
+      new NoneCheckerFactory(),
+      new ComplexCheckerFactory());
 
   public FilterStringColLikeStringScalar() {
     super();
-    decoder = Charset.forName("UTF-8").newDecoder()
-        .onMalformedInput(CodingErrorAction.REPLACE)
-        .onUnmappableCharacter(CodingErrorAction.REPLACE);
-    byteBuffer = ByteBuffer.allocate(4);
-    charBuffer = CharBuffer.allocate(4);
   }
 
-  public PatternType getType() {
-    return type;
+  public FilterStringColLikeStringScalar(int colNum, Text likePattern) {
+    super(colNum, likePattern.toString());
   }
 
-  private boolean like(byte[] bytes, int start, int len) {
-    switch (type) {
-      case NONE:
-        return noneLike(bytes, start, len, simplePattern.getBytes());
-      case BEGIN:
-        return beginLike(bytes, start, len, simplePattern.getBytes());
-      case END:
-        return endLike(bytes, start, len, simplePattern.getBytes());
-      case MIDDLE:
-        return midLike(bytes, start, len, simplePattern.getBytes());
-      case COMPLEX:
-        return complexLike(bytes, start, len);
-      default:
-        return false;
-    }
+  @Override
+  protected List<CheckerFactory> getCheckerFactories() {
+    return checkerFactories;
   }
 
-  private static boolean noneLike(byte[] byteS, int start, int len, byte[] byteSub) {
-    int lenSub = byteSub.length;
-    if (len != lenSub) {
-      return false;
-    }
-    for (int i = start, j = 0; j < len; i++, j++) {
-      if (byteS[i] != byteSub[j]) {
-        return false;
-      }
-    }
-    return true;
-  }
+  /**
+   * Accepts simple LIKE patterns like "abc%" and creates corresponding checkers.
+   */
+  private static class BeginCheckerFactory implements CheckerFactory {
+    private static final Pattern BEGIN_PATTERN = Pattern.compile("([^_%]+)%");
 
-  private static boolean beginLike(byte[] byteS, int start, int len, byte[] byteSub) {
-    if (len < byteSub.length) {
-      return false;
-    }
-    for (int i = start, j = 0; j < byteSub.length; i++, j++) {
-      if (byteS[i] != byteSub[j]) {
-        return false;
+    public Checker tryCreate(String pattern) {
+      Matcher matcher = BEGIN_PATTERN.matcher(pattern);
+      if (matcher.matches()) {
+        return new BeginChecker(matcher.group(1));
       }
+      return null;
     }
-    return true;
   }
 
-  private static boolean endLike(byte[] byteS, int start, int len, byte[] byteSub) {
-    int lenSub = byteSub.length;
-    if (len < lenSub) {
-      return false;
-    }
-    for (int i = start + len - lenSub, j = 0; j < lenSub; i++, j++) {
-      if (byteS[i] != byteSub[j]) {
-        return false;
-      }
-    }
-    return true;
-  }
+  /**
+   * Accepts simple LIKE patterns like "%abc" and creates a corresponding checkers.
+   */
+  private static class EndCheckerFactory implements CheckerFactory {
+    private static final Pattern END_PATTERN = Pattern.compile("%([^_%]+)");
 
-  private static boolean midLike(byte[] byteS, int start, int len, byte[] byteSub) {
-    int lenSub = byteSub.length;
-    if (len < lenSub) {
-      return false;
-    }
-    int end = start + len - lenSub + 1;
-    boolean match = false;
-    for (int i = start; (i < end) && (!match); i++) {
-      match = true;
-      for (int j = 0; j < lenSub; j++) {
-        if (byteS[i + j] != byteSub[j]) {
-          match = false;
-          break;
-        }
+    public Checker tryCreate(String pattern) {
+      Matcher matcher = END_PATTERN.matcher(pattern);
+      if (matcher.matches()) {
+        return new EndChecker(matcher.group(1));
       }
+      return null;
     }
-    return match;
   }
 
   /**
-   * Matches the byte array against the complex like pattern. This method uses
-   * {@link #compiledPattern} to match. For decoding performance, it caches
-   * {@link #compiledPattern}, {@link #byteBuffer} and {@link #charBuffer}.
-   * When the length to decode is greater than the capacity of
-   * {@link #byteBuffer}, it creates new {@link #byteBuffer} and
-   * {@link #charBuffer}. The capacity of the new {@link #byteBuffer} is the
-   * double of the length, for fewer object creations and higher memory
-   * utilization.
-   *
-   * @param byteS
-   *          A byte array that contains a UTF-8 string.
-   * @param start
-   *          A position to start decoding.
-   * @param len
-   *          A length to decode.
-   * @return
-   *          true if the byte array matches the complex like pattern,
-   *          otherwise false.
+   * Accepts simple LIKE patterns like "%abc%" and creates a corresponding checkers.
    */
-  private boolean complexLike(byte[] byteS, int start, int len) {
-    // Prepare buffers
-    if (byteBuffer.capacity() < len) {
-      byteBuffer = ByteBuffer.allocate(len * 2);
-    }
-    byteBuffer.clear();
-    byteBuffer.put(byteS, start, len);
-    byteBuffer.flip();
-
-    int maxChars = (int) (byteBuffer.capacity() * decoder.maxCharsPerByte());
-    if (charBuffer.capacity() < maxChars) {
-      charBuffer = CharBuffer.allocate(maxChars);
-    }
-    charBuffer.clear();
-
-    // Decode UTF-8
-    decoder.reset();
-    decoder.decode(byteBuffer, charBuffer, true);
-    decoder.flush(charBuffer);
-    charBuffer.flip();
-
-    // Match the given bytes with the like pattern
-    return compiledPattern.matcher(charBuffer).matches();
-  }
+  private static class MiddleCheckerFactory implements CheckerFactory {
+    private static final Pattern MIDDLE_PATTERN = Pattern.compile("%([^_%]+)%");
 
-  /**
-   * Parses the likePattern. Based on it is a simple pattern or not, the
-   * function might change two member variables. {@link #type} will be changed
-   * to the corresponding pattern type; {@link #simplePattern} will record the
-   * string in it for later pattern matching if it is a simple pattern.
-   * <p>
-   * Examples: <blockquote>
-   *
-   * <pre>
-   * parseSimplePattern("%abc%") changes {@link #type} to PatternType.MIDDLE
-   * and changes {@link #simplePattern} to "abc"
-   * parseSimplePattern("%ab_c%") changes {@link #type} to PatternType.COMPLEX
-   * and does not change {@link #simplePattern}
-   * </pre>
-   *
-   * </blockquote>
-   *
-   * @param likePattern
-   *          the input LIKE query pattern
-   */
-  private void parseSimplePattern(String likePattern) {
-    int length = likePattern.length();
-    int beginIndex = 0;
-    int endIndex = length;
-    char lastChar = 'a';
-    String strPattern = new String();
-    type = PatternType.NONE;
-
-    for (int i = 0; i < length; i++) {
-      char n = likePattern.charAt(i);
-      if (n == '_') { // such as "a_b"
-        if (lastChar != '\\') { // such as "a%bc"
-          type = PatternType.COMPLEX;
-          return;
-        } else { // such as "abc\%de%"
-          strPattern += likePattern.substring(beginIndex, i - 1);
-          beginIndex = i;
-        }
-      } else if (n == '%') {
-        if (i == 0) { // such as "%abc"
-          type = PatternType.END;
-          beginIndex = 1;
-        } else if (i < length - 1) {
-          if (lastChar != '\\') { // such as "a%bc"
-            type = PatternType.COMPLEX;
-            return;
-          } else { // such as "abc\%de%"
-            strPattern += likePattern.substring(beginIndex, i - 1);
-            beginIndex = i;
-          }
-        } else {
-          if (lastChar != '\\') {
-            endIndex = length - 1;
-            if (type == PatternType.END) { // such as "%abc%"
-              type = PatternType.MIDDLE;
-            } else {
-              type = PatternType.BEGIN; // such as "abc%"
-            }
-          } else { // such as "abc\%"
-            strPattern += likePattern.substring(beginIndex, i - 1);
-            beginIndex = i;
-            endIndex = length;
-          }
-        }
+    public Checker tryCreate(String pattern) {
+      Matcher matcher = MIDDLE_PATTERN.matcher(pattern);
+      if (matcher.matches()) {
+        return new MiddleChecker(matcher.group(1));
       }
-      lastChar = n;
+      return null;
     }
-
-    strPattern += likePattern.substring(beginIndex, endIndex);
-    simpleStringPattern = strPattern;
-    simplePattern.set(simpleStringPattern);
   }
 
-  @Override
-  public void evaluate(VectorizedRowBatch batch) {
-
-    if (childExpressions != null) {
-      super.evaluateChildren(batch);
-    }
-
-    BytesColumnVector inputColVector = (BytesColumnVector) batch.cols[colNum];
-    int[] sel = batch.selected;
-    boolean[] nullPos = inputColVector.isNull;
-    int n = batch.size;
-    byte[][] vector = inputColVector.vector;
-    int[] length = inputColVector.length;
-    int[] start = inputColVector.start;
-    byte[] simplePatternBytes = simplePattern.getBytes();
-
-    // return immediately if batch is empty
-    if (n == 0) {
-      return;
-    }
-
-    if (inputColVector.noNulls) {
-      if (inputColVector.isRepeating) {
-
-        // All must be selected otherwise size would be zero Repeating property will not change.
-        if (!like(vector[0], start[0], length[0])) {
-
-          // Entire batch is filtered out.
-          batch.size = 0;
-        }
-      } else if (batch.selectedInUse) {
-        int newSize = 0;
-
-        switch (type) {
-          case NONE:
-            for (int j = 0; j != n; j++) {
-              int i = sel[j];
-              if (noneLike(vector[i], start[i], length[i], simplePatternBytes)) {
-                sel[newSize++] = i;
-              }
-            }
-            break;
-          case BEGIN:
-            for (int j = 0; j != n; j++) {
-              int i = sel[j];
-              if (beginLike(vector[i], start[i], length[i], simplePatternBytes)) {
-                sel[newSize++] = i;
-              }
-            }
-            break;
-          case END:
-            for (int j = 0; j != n; j++) {
-              int i = sel[j];
-              if (endLike(vector[i], start[i], length[i], simplePatternBytes)) {
-                sel[newSize++] = i;
-              }
-            }
-            break;
-          case MIDDLE:
-            for (int j = 0; j != n; j++) {
-              int i = sel[j];
-              if (midLike(vector[i], start[i], length[i], simplePatternBytes)) {
-                sel[newSize++] = i;
-              }
-            }
-            break;
-          case COMPLEX:
-            for (int j = 0; j != n; j++) {
-              int i = sel[j];
-              if (complexLike(vector[i], start[i], length[i])) {
-                sel[newSize++] = i;
-              }
-            }
-            break;
-        }
-
-        batch.size = newSize;
-      } else {
-        int newSize = 0;
-
-        switch (type) {
-          case NONE:
-            for (int i = 0; i != n; i++) {
-              if (noneLike(vector[i], start[i], length[i], simplePatternBytes)) {
-                sel[newSize++] = i;
-              }
-            }
-            break;
-          case BEGIN:
-            for (int i = 0; i != n; i++) {
-              if (beginLike(vector[i], start[i], length[i], simplePatternBytes)) {
-                sel[newSize++] = i;
-              }
-            }
-            break;
-          case END:
-            for (int i = 0; i != n; i++) {
-              if (endLike(vector[i], start[i], length[i], simplePatternBytes)) {
-                sel[newSize++] = i;
-              }
-            }
-            break;
-          case MIDDLE:
-            for (int i = 0; i != n; i++) {
-              if (midLike(vector[i], start[i], length[i], simplePatternBytes)) {
-                sel[newSize++] = i;
-              }
-            }
-            break;
-          case COMPLEX:
-            for (int i = 0; i != n; i++) {
-              if (complexLike(vector[i], start[i], length[i])) {
-                sel[newSize++] = i;
-              }
-            }
-            break;
-        }
-
-        if (newSize < n) {
-          batch.size = newSize;
-          batch.selectedInUse = true;
-        }
-      }
-    } else {
-      if (inputColVector.isRepeating) {
+  /**
+   * Accepts simple LIKE patterns like "abc" and creates corresponding checkers.
+   */
+  private static class NoneCheckerFactory implements CheckerFactory {
+    private static final Pattern NONE_PATTERN = Pattern.compile("[^%_]+");
 
-        //All must be selected otherwise size would be zero. Repeating property will not change.
-        if (!nullPos[0]) {
-          if (!like(vector[0], start[0], length[0])) {
-
-            //Entire batch is filtered out.
-            batch.size = 0;
-          }
-        } else {
-          batch.size = 0;
-        }
-      } else if (batch.selectedInUse) {
-        int newSize = 0;
-
-        switch (type) {
-          case NONE:
-            for (int j = 0; j != n; j++) {
-              int i = sel[j];
-              if (!nullPos[i]) {
-                if (noneLike(vector[i], start[i], length[i], simplePatternBytes)) {
-                  sel[newSize++] = i;
-                }
-              }
-            }
-            break;
-          case BEGIN:
-            for (int j = 0; j != n; j++) {
-              int i = sel[j];
-              if (!nullPos[i]) {
-                if (beginLike(vector[i], start[i], length[i], simplePatternBytes)) {
-                  sel[newSize++] = i;
-                }
-              }
-            }
-            break;
-          case END:
-            for (int j = 0; j != n; j++) {
-              int i = sel[j];
-              if (!nullPos[i]) {
-                if (endLike(vector[i], start[i], length[i], simplePatternBytes)) {
-                  sel[newSize++] = i;
-                }
-              }
-            }
-            break;
-          case MIDDLE:
-            for (int j = 0; j != n; j++) {
-              int i = sel[j];
-              if (!nullPos[i]) {
-                if (midLike(vector[i], start[i], length[i], simplePatternBytes)) {
-                  sel[newSize++] = i;
-                }
-              }
-            }
-            break;
-          case COMPLEX:
-            for (int j = 0; j != n; j++) {
-              int i = sel[j];
-              if (!nullPos[i]) {
-                if (complexLike(vector[i], start[i], length[i])) {
-                  sel[newSize++] = i;
-                }
-              }
-            }
-            break;
-        }
-
-        //Change the selected vector
-        batch.size = newSize;
-      } else {
-        int newSize = 0;
-
-        switch (type) {
-          case NONE:
-            for (int i = 0; i != n; i++) {
-              if (!nullPos[i]) {
-                if (noneLike(vector[i], start[i], length[i], simplePatternBytes)) {
-                  sel[newSize++] = i;
-                }
-              }
-            }
-            break;
-          case BEGIN:
-            for (int i = 0; i != n; i++) {
-              if (!nullPos[i]) {
-                if (beginLike(vector[i], start[i], length[i], simplePatternBytes)) {
-                  sel[newSize++] = i;
-                }
-              }
-            }
-            break;
-          case END:
-            for (int i = 0; i != n; i++) {
-              if (!nullPos[i]) {
-                if (endLike(vector[i], start[i], length[i], simplePatternBytes)) {
-                  sel[newSize++] = i;
-                }
-              }
-            }
-            break;
-          case MIDDLE:
-            for (int i = 0; i != n; i++) {
-              if (!nullPos[i]) {
-                if (midLike(vector[i], start[i], length[i], simplePatternBytes)) {
-                  sel[newSize++] = i;
-                }
-              }
-            }
-            break;
-          case COMPLEX:
-            for (int i = 0; i != n; i++) {
-              if (!nullPos[i]) {
-                if (complexLike(vector[i], start[i], length[i])) {
-                  sel[newSize++] = i;
-                }
-              }
-            }
-            break;
-        }
-
-        if (newSize < n) {
-          batch.size = newSize;
-          batch.selectedInUse = true;
-        }
-
-        /* If every row qualified (newSize==n), then we can ignore the sel vector to streamline
-         * future operations. So selectedInUse will remain false.
-         */
+    public Checker tryCreate(String pattern) {
+      Matcher matcher = NONE_PATTERN.matcher(pattern);
+      if (matcher.matches()) {
+        return new NoneChecker(pattern);
       }
+      return null;
     }
   }
 
-  @Override
-  public int getOutputColumn() {
-    return -1;
-  }
-
-  @Override
-  public String getOutputType() {
-    return "boolean";
-  }
-
-  public int getColNum() {
-    return colNum;
-  }
-
-  public void setColNum(int colNum) {
-    this.colNum = colNum;
-  }
-
-  public Pattern getCompiledPattern() {
-    return compiledPattern;
-  }
-
-  public void setCompiledPattern(Pattern compiledPattern) {
-    this.compiledPattern = compiledPattern;
-  }
-
-  public void setType(PatternType type) {
-    this.type = type;
-  }
-
-  public String getSimpleStringPattern() {
-    return simpleStringPattern;
-  }
-
-  public void setSimpleStringPattern(String simpleStringPattern) {
-    this.simpleStringPattern = simpleStringPattern;
-    simplePattern.set(simpleStringPattern);
+  /**
+   * Accepts any LIKE patterns and creates corresponding checkers.
+   */
+  private static class ComplexCheckerFactory implements CheckerFactory {
+    public Checker tryCreate(String pattern) {
+      return new ComplexChecker(UDFLike.likePatternToRegExp(pattern));
+    }
   }
 }

Added: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterStringColRegExpStringScalar.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterStringColRegExpStringScalar.java?rev=1528917&view=auto
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterStringColRegExpStringScalar.java (added)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterStringColRegExpStringScalar.java Thu Oct  3 16:10:32 2013
@@ -0,0 +1,181 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.exec.vector.expressions;
+
+import org.apache.hadoop.io.Text;
+
+import java.util.Arrays;
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * Evaluate REGEXP filter on a batch for a vector of strings.
+ */
+public class FilterStringColRegExpStringScalar extends AbstractFilterStringColLikeStringScalar {
+  private static final long serialVersionUID = 1L;
+
+  private static final String LITERAL_CHAR = "[^\\[\\]\\\\(){}*?+|$^.]";
+  private static final String LITERAL_CHAR_GROUP = "(" + LITERAL_CHAR + "+)";
+
+  private transient static List<CheckerFactory> checkerFactories = Arrays.asList(
+      new BeginCheckerFactory(),
+      new EndCheckerFactory(),
+      new MiddleCheckerFactory(),
+      new PhoneNumberCheckerFactory(),
+      new NoneCheckerFactory(),
+      new ComplexCheckerFactory());
+
+  public FilterStringColRegExpStringScalar() {
+    super();
+  }
+
+  public FilterStringColRegExpStringScalar(int colNum, Text regExpPattern) {
+    super(colNum, regExpPattern.toString());
+  }
+
+  @Override
+  protected List<CheckerFactory> getCheckerFactories() {
+    return checkerFactories;
+  }
+
+  /**
+   * Accepts simple REGEXP patterns like "abc.*" and creates corresponding checkers.
+   */
+  private static class BeginCheckerFactory implements CheckerFactory {
+    private static final Pattern BEGIN_PATTERN = Pattern.compile(LITERAL_CHAR_GROUP + "\\.\\*");
+
+    public Checker tryCreate(String pattern) {
+      Matcher matcher = BEGIN_PATTERN.matcher(pattern);
+      if (matcher.matches()) {
+        return new BeginChecker(matcher.group(1));
+      }
+      return null;
+    }
+  }
+
+  /**
+   * Accepts simple REGEXP patterns like ".*abc" and creates corresponding checkers.
+   */
+  private static class EndCheckerFactory implements CheckerFactory {
+    private static final Pattern END_PATTERN = Pattern.compile("\\.\\*" + LITERAL_CHAR_GROUP);
+
+    public Checker tryCreate(String pattern) {
+      Matcher matcher = END_PATTERN.matcher(pattern);
+      if (matcher.matches()) {
+        return new EndChecker(matcher.group(1));
+      }
+      return null;
+    }
+  }
+
+  /**
+   * Accepts simple REGEXP patterns like ".*abc.*" and creates corresponding checkers.
+   */
+  private static class MiddleCheckerFactory implements CheckerFactory {
+    private static final Pattern MIDDLE_PATTERN = Pattern.compile("\\.\\*" + LITERAL_CHAR_GROUP + "\\.\\*");
+
+    public Checker tryCreate(String pattern) {
+      Matcher matcher = MIDDLE_PATTERN.matcher(pattern);
+      if (matcher.matches()) {
+        return new MiddleChecker(matcher.group(1));
+      }
+      return null;
+    }
+  }
+
+  /**
+   * Accepts simple phone number regular expressions consisted only with "\(", "\)", "-", " ", "\d".
+   * For example, it accepts "(\d\d\d) \d\d\d-\d\d\d\d" then matches "(012) 345-6789".
+   */
+  private static class PhoneNumberChecker implements Checker {
+    byte[] byteSub;
+
+    PhoneNumberChecker(String pattern) {
+      this.byteSub = pattern.getBytes();
+    }
+
+    public boolean check(byte[] byteS, int start, int len) {
+      for (int i = 0; i < len; i++) {
+        byte c = byteS[start + i];
+        byte p = byteSub[i];
+        switch (p) {
+          // For pattern 'd', find digits.
+          case 'd':
+            if (!('0' <= c && c <= '9')) {
+              return false;
+            }
+            break;
+          
+          // For other registered patterns, find exact matches.
+          case '-':
+          case ' ':
+          case '(':
+          case ')':
+            if (c != p) {
+              return false;
+            }
+            break;
+          
+          // For unregistered patterns, fail.
+          default:
+            return false;
+        }
+      }
+      return true;
+    }
+  }
+
+  /**
+   * Accepts phone number REGEXP patterns like "\(\d\d\d\) \d\d\d-\d\d\d\d" and creates
+   * corresponding checkers.
+   */
+  private static class PhoneNumberCheckerFactory implements CheckerFactory {
+    public Checker tryCreate(String pattern) {
+      if (pattern.matches("(\\\\d|\\\\\\(|\\\\\\)|-| )+")) {
+        return new PhoneNumberChecker(pattern.replaceAll("\\\\d", "d").replaceAll("\\\\\\(", "(").replaceAll("\\\\\\)", ")"));
+      }
+      return null;
+    }
+  }
+
+  /**
+   * Accepts simple REGEXP patterns like "abc" and creates corresponding checkers.
+   */
+  private static class NoneCheckerFactory implements CheckerFactory {
+    private static final Pattern NONE_PATTERN = Pattern.compile(LITERAL_CHAR_GROUP);
+
+    public Checker tryCreate(String pattern) {
+      Matcher matcher = NONE_PATTERN.matcher(pattern);
+      if (matcher.matches()) {
+        return new NoneChecker(matcher.group(1));
+      }
+      return null;
+    }
+  }
+
+  /**
+   * Accepts any REGEXP patterns and creates corresponding checkers.
+   */
+  private static class ComplexCheckerFactory implements CheckerFactory {
+    public Checker tryCreate(String pattern) {
+      return new ComplexChecker(pattern);
+    }
+  }
+}

Modified: hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorStringExpressions.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorStringExpressions.java?rev=1528917&r1=1528916&r2=1528917&view=diff
==============================================================================
--- hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorStringExpressions.java (original)
+++ hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorStringExpressions.java Thu Oct  3 16:10:32 2013
@@ -670,28 +670,28 @@ public class TestVectorStringExpressions
 
     // BEGIN pattern
     expr = new FilterStringColLikeStringScalar(0, new Text("abc%"));
-    Assert.assertEquals(FilterStringColLikeStringScalar.PatternType.BEGIN,
-        expr.getType());
+    Assert.assertEquals(FilterStringColLikeStringScalar.BeginChecker.class,
+        expr.checker.getClass());
 
     // END pattern
     expr = new FilterStringColLikeStringScalar(0, new Text("%abc"));
-    Assert.assertEquals(FilterStringColLikeStringScalar.PatternType.END,
-        expr.getType());
+    Assert.assertEquals(FilterStringColLikeStringScalar.EndChecker.class,
+        expr.checker.getClass());
 
     // MIDDLE pattern
     expr = new FilterStringColLikeStringScalar(0, new Text("%abc%"));
-    Assert.assertEquals(FilterStringColLikeStringScalar.PatternType.MIDDLE,
-        expr.getType());
+    Assert.assertEquals(FilterStringColLikeStringScalar.MiddleChecker.class,
+        expr.checker.getClass());
 
     // COMPLEX pattern
     expr = new FilterStringColLikeStringScalar(0, new Text("%abc%de"));
-    Assert.assertEquals(FilterStringColLikeStringScalar.PatternType.COMPLEX,
-        expr.getType());
+    Assert.assertEquals(FilterStringColLikeStringScalar.ComplexChecker.class,
+        expr.checker.getClass());
 
     // NONE pattern
     expr = new FilterStringColLikeStringScalar(0, new Text("abc"));
-    Assert.assertEquals(FilterStringColLikeStringScalar.PatternType.NONE,
-        expr.getType());
+    Assert.assertEquals(FilterStringColLikeStringScalar.NoneChecker.class,
+        expr.checker.getClass());
   }
 
   public void testStringLikeMultiByte() {