You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by ha...@apache.org on 2013/10/03 18:10:32 UTC
svn commit: r1528917 - in /hive/trunk/ql/src:
java/org/apache/hadoop/hive/ql/exec/vector/expressions/
test/org/apache/hadoop/hive/ql/exec/vector/expressions/
Author: hashutosh
Date: Thu Oct 3 16:10:32 2013
New Revision: 1528917
URL: http://svn.apache.org/r1528917
Log:
HIVE-4642 : Implement vectorized RLIKE and REGEXP filter expressions (Teddy Choi via Ashutosh Chauhan)
Added:
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/AbstractFilterStringColLikeStringScalar.java
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterStringColRegExpStringScalar.java
Modified:
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterStringColLikeStringScalar.java
hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorStringExpressions.java
Added: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/AbstractFilterStringColLikeStringScalar.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/AbstractFilterStringColLikeStringScalar.java?rev=1528917&view=auto
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/AbstractFilterStringColLikeStringScalar.java (added)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/AbstractFilterStringColLikeStringScalar.java Thu Oct 3 16:10:32 2013
@@ -0,0 +1,412 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.exec.vector.expressions;
+
+import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.apache.hadoop.io.Text;
+
+import java.io.UnsupportedEncodingException;
+import java.nio.ByteBuffer;
+import java.nio.CharBuffer;
+import java.nio.charset.Charset;
+import java.nio.charset.CharsetDecoder;
+import java.nio.charset.CodingErrorAction;
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * An abstract class for LIKE and REGEXP expressions. LIKE and REGEXP expression share similar
+ * functions, but they have different grammars. AbstractFilterStringColLikeStringScalar class
+ * provides shared classes and methods. Each subclass handles its grammar.
+ */
+public abstract class AbstractFilterStringColLikeStringScalar extends VectorExpression {
+ private static final long serialVersionUID = 1L;
+
+ private int colNum;
+ private String pattern;
+ transient Checker checker;
+
+ public AbstractFilterStringColLikeStringScalar() {
+ super();
+ }
+
+ public AbstractFilterStringColLikeStringScalar(int colNum, String pattern) {
+ this.colNum = colNum;
+ this.pattern = pattern;
+ }
+
+ protected abstract List<CheckerFactory> getCheckerFactories();
+
+ /**
+ * Selects an optimized checker for a given string.
+ * @param pattern
+ * @return
+ */
+ private Checker createChecker(String pattern) {
+ for (CheckerFactory checkerFactory : getCheckerFactories()) {
+ Checker checker = checkerFactory.tryCreate(pattern);
+ if (checker != null) {
+ return checker;
+ }
+ }
+ return null;
+ }
+
+ @Override
+ public void evaluate(VectorizedRowBatch batch) {
+ this.checker = createChecker(pattern);
+
+ if (childExpressions != null) {
+ super.evaluateChildren(batch);
+ }
+
+ BytesColumnVector inputColVector = (BytesColumnVector) batch.cols[colNum];
+ int[] sel = batch.selected;
+ boolean[] nullPos = inputColVector.isNull;
+ int n = batch.size;
+ byte[][] vector = inputColVector.vector;
+ int[] length = inputColVector.length;
+ int[] start = inputColVector.start;
+
+ // return immediately if batch is empty
+ if (n == 0) {
+ return;
+ }
+
+ if (inputColVector.noNulls) {
+ if (inputColVector.isRepeating) {
+
+ // All must be selected otherwise size would be zero Repeating property will not change.
+ if (!checker.check(vector[0], start[0], length[0])) {
+
+ // Entire batch is filtered out.
+ batch.size = 0;
+ }
+ } else if (batch.selectedInUse) {
+ int newSize = 0;
+
+ for (int j = 0; j != n; j++) {
+ int i = sel[j];
+ if (checker.check(vector[i], start[i], length[i])) {
+ sel[newSize++] = i;
+ }
+ }
+
+ batch.size = newSize;
+ } else {
+ int newSize = 0;
+ for (int i = 0; i != n; i++) {
+ if (checker.check(vector[i], start[i], length[i])) {
+ sel[newSize++] = i;
+ }
+ }
+
+ if (newSize < n) {
+ batch.size = newSize;
+ batch.selectedInUse = true;
+ }
+ }
+ } else {
+ if (inputColVector.isRepeating) {
+
+ //All must be selected otherwise size would be zero. Repeating property will not change.
+ if (!nullPos[0]) {
+ if (!checker.check(vector[0], start[0], length[0])) {
+
+ //Entire batch is filtered out.
+ batch.size = 0;
+ }
+ } else {
+ batch.size = 0;
+ }
+ } else if (batch.selectedInUse) {
+ int newSize = 0;
+
+ for (int j = 0; j != n; j++) {
+ int i = sel[j];
+ if (!nullPos[i]) {
+ if (checker.check(vector[i], start[i], length[i])) {
+ sel[newSize++] = i;
+ }
+ }
+ }
+
+ //Change the selected vector
+ batch.size = newSize;
+ } else {
+ int newSize = 0;
+
+ for (int i = 0; i != n; i++) {
+ if (!nullPos[i]) {
+ if (checker.check(vector[i], start[i], length[i])) {
+ sel[newSize++] = i;
+ }
+ }
+ }
+
+ if (newSize < n) {
+ batch.size = newSize;
+ batch.selectedInUse = true;
+ }
+
+ /* If every row qualified (newSize==n), then we can ignore the sel vector to streamline
+ * future operations. So selectedInUse will remain false.
+ */
+ }
+ }
+ }
+
+ @Override
+ public int getOutputColumn() {
+ return -1;
+ }
+
+ @Override
+ public String getOutputType() {
+ return "boolean";
+ }
+
+ /**
+ * A Checker contains a pattern and checks whether a given string matches or not.
+ */
+ public interface Checker {
+ /**
+ * Checks whether the given string matches with its pattern.
+ * @param byteS The byte array that contains the string
+ * @param start The start position of the string
+ * @param len The length of the string
+ * @return Whether it matches or not.
+ */
+ boolean check(byte[] byteS, int start, int len);
+ }
+
+ /**
+ * A CheckerFactory creates checkers of its kind.
+ */
+ protected interface CheckerFactory {
+ /**
+ * If the given pattern is acceptable for its checker class, it creates and returns a checker.
+ * Otherwise, it returns <code>null</code>.
+ * @param pattern
+ * @return If the pattern is acceptable, a <code>Checker</code> object. Otherwise
+ * <code>null</code>.
+ */
+ Checker tryCreate(String pattern);
+ }
+
+ /**
+ * Matches the whole string to its pattern.
+ */
+ protected static class NoneChecker implements Checker {
+ byte [] byteSub;
+
+ NoneChecker(String pattern) {
+ try {
+ byteSub = pattern.getBytes("UTF-8");
+ } catch (UnsupportedEncodingException e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+ public boolean check(byte[] byteS, int start, int len) {
+ int lenSub = byteSub.length;
+ if (len != lenSub) {
+ return false;
+ }
+ for (int i = start, j = 0; j < len; i++, j++) {
+ if (byteS[i] != byteSub[j]) {
+ return false;
+ }
+ }
+ return true;
+ }
+ }
+
+ /**
+ * Matches the beginning of each string to a pattern.
+ */
+ protected static class BeginChecker implements Checker {
+ byte[] byteSub;
+
+ BeginChecker(String pattern) {
+ try {
+ byteSub = pattern.getBytes("UTF-8");
+ } catch (UnsupportedEncodingException e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+ public boolean check(byte[] byteS, int start, int len) {
+ if (len < byteSub.length) {
+ return false;
+ }
+ for (int i = start, j = 0; j < byteSub.length; i++, j++) {
+ if (byteS[i] != byteSub[j]) {
+ return false;
+ }
+ }
+ return true;
+ }
+ }
+
+ /**
+ * Matches the ending of each string to its pattern.
+ */
+ protected static class EndChecker implements Checker {
+ byte[] byteSub;
+ EndChecker(String pattern) {
+ try {
+ byteSub = pattern.getBytes("UTF-8");
+ } catch (UnsupportedEncodingException e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+ public boolean check(byte[] byteS, int start, int len) {
+ int lenSub = byteSub.length;
+ if (len < lenSub) {
+ return false;
+ }
+ for (int i = start + len - lenSub, j = 0; j < lenSub; i++, j++) {
+ if (byteS[i] != byteSub[j]) {
+ return false;
+ }
+ }
+ return true;
+ }
+ }
+
+ /**
+ * Matches the middle of each string to its pattern.
+ */
+ protected static class MiddleChecker implements Checker {
+ byte[] byteSub;
+ int lenSub;
+
+ MiddleChecker(String pattern) {
+ try {
+ byteSub = pattern.getBytes("UTF-8");
+ lenSub = byteSub.length;
+ } catch (UnsupportedEncodingException e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+ public boolean check(byte[] byteS, int start, int len) {
+ if (len < lenSub) {
+ return false;
+ }
+ int end = start + len - lenSub + 1;
+ boolean match = false;
+ for (int i = start; i < end; i++) {
+ match = true;
+ for (int j = 0; j < lenSub; j++) {
+ if (byteS[i + j] != byteSub[j]) {
+ match = false;
+ break;
+ }
+ }
+ if (match) {
+ return true;
+ }
+ }
+ return match;
+ }
+ }
+
+ /**
+ * Matches each string to a pattern with Java regular expression package.
+ */
+ protected static class ComplexChecker implements Checker {
+ Pattern compiledPattern;
+ Matcher matcher;
+ FastUTF8Decoder decoder;
+
+ ComplexChecker(String pattern) {
+ compiledPattern = Pattern.compile(pattern);
+ matcher = compiledPattern.matcher("");
+ decoder = new FastUTF8Decoder();
+ }
+
+ public boolean check(byte[] byteS, int start, int len) {
+ // Match the given bytes with the like pattern
+ matcher.reset(decoder.decodeUnsafely(byteS, start, len));
+ return matcher.matches();
+ }
+ }
+
+ /**
+ * A fast UTF-8 decoder that caches necessary objects for decoding.
+ */
+ private static class FastUTF8Decoder {
+ CharsetDecoder decoder;
+ ByteBuffer byteBuffer;
+ CharBuffer charBuffer;
+
+ public FastUTF8Decoder() {
+ decoder = Charset.forName("UTF-8").newDecoder()
+ .onMalformedInput(CodingErrorAction.REPLACE)
+ .onUnmappableCharacter(CodingErrorAction.REPLACE);
+ byteBuffer = ByteBuffer.allocate(4);
+ charBuffer = CharBuffer.allocate(4);
+ }
+
+ public CharBuffer decodeUnsafely(byte[] byteS, int start, int len) {
+ // Prepare buffers
+ if (byteBuffer.capacity() < len) {
+ byteBuffer = ByteBuffer.allocate(len * 2);
+ }
+ byteBuffer.clear();
+ byteBuffer.put(byteS, start, len);
+ byteBuffer.flip();
+
+ int maxChars = (int) (byteBuffer.capacity() * decoder.maxCharsPerByte());
+ if (charBuffer.capacity() < maxChars) {
+ charBuffer = CharBuffer.allocate(maxChars);
+ }
+ charBuffer.clear();
+
+ // Decode UTF-8
+ decoder.reset();
+ decoder.decode(byteBuffer, charBuffer, true);
+ decoder.flush(charBuffer);
+ charBuffer.flip();
+
+ return charBuffer;
+ }
+ }
+
+ public int getColNum() {
+ return colNum;
+ }
+
+ public void setColNum(int colNum) {
+ this.colNum = colNum;
+ }
+
+ public String getPattern() {
+ return pattern;
+ }
+
+ public void setPattern(String pattern) {
+ this.pattern = pattern;
+ }
+}
Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterStringColLikeStringScalar.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterStringColLikeStringScalar.java?rev=1528917&r1=1528916&r2=1528917&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterStringColLikeStringScalar.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterStringColLikeStringScalar.java Thu Oct 3 16:10:32 2013
@@ -18,552 +18,106 @@
package org.apache.hadoop.hive.ql.exec.vector.expressions;
-import static org.apache.hadoop.hive.ql.udf.UDFLike.likePatternToRegExp;
+import org.apache.hadoop.hive.ql.udf.UDFLike;
+import org.apache.hadoop.io.Text;
-import java.nio.ByteBuffer;
-import java.nio.CharBuffer;
-import java.nio.charset.Charset;
-import java.nio.charset.CharsetDecoder;
-import java.nio.charset.CodingErrorAction;
+import java.util.Arrays;
+import java.util.List;
+import java.util.regex.Matcher;
import java.util.regex.Pattern;
-import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
-import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
-import org.apache.hadoop.io.Text;
-
/**
* Evaluate LIKE filter on a batch for a vector of strings.
*/
-public class FilterStringColLikeStringScalar extends VectorExpression {
+public class FilterStringColLikeStringScalar extends AbstractFilterStringColLikeStringScalar {
private static final long serialVersionUID = 1L;
- private int colNum;
- private Pattern compiledPattern;
- private PatternType type = PatternType.NONE;
- private String simpleStringPattern;
- private final Text simplePattern = new Text();
-
- private transient ByteBuffer byteBuffer;
- private transient CharBuffer charBuffer;
- private transient final CharsetDecoder decoder;
-
- // Doing characters comparison directly instead of regular expression
- // matching for simple patterns like "%abc%".
- public enum PatternType {
- NONE, // "abc"
- BEGIN, // "abc%"
- END, // "%abc"
- MIDDLE, // "%abc%"
- COMPLEX, // all other cases, such as "ab%c_de"
- }
- public FilterStringColLikeStringScalar(int colNum, Text likePattern) {
- this();
- this.colNum = colNum;
- String stringLikePattern = likePattern.toString();
- parseSimplePattern(stringLikePattern);
- if (type == PatternType.COMPLEX) {
- compiledPattern = Pattern.compile(likePatternToRegExp(stringLikePattern));
- }
- }
+ private transient final static List<CheckerFactory> checkerFactories = Arrays.asList(
+ new BeginCheckerFactory(),
+ new EndCheckerFactory(),
+ new MiddleCheckerFactory(),
+ new NoneCheckerFactory(),
+ new ComplexCheckerFactory());
public FilterStringColLikeStringScalar() {
super();
- decoder = Charset.forName("UTF-8").newDecoder()
- .onMalformedInput(CodingErrorAction.REPLACE)
- .onUnmappableCharacter(CodingErrorAction.REPLACE);
- byteBuffer = ByteBuffer.allocate(4);
- charBuffer = CharBuffer.allocate(4);
}
- public PatternType getType() {
- return type;
+ public FilterStringColLikeStringScalar(int colNum, Text likePattern) {
+ super(colNum, likePattern.toString());
}
- private boolean like(byte[] bytes, int start, int len) {
- switch (type) {
- case NONE:
- return noneLike(bytes, start, len, simplePattern.getBytes());
- case BEGIN:
- return beginLike(bytes, start, len, simplePattern.getBytes());
- case END:
- return endLike(bytes, start, len, simplePattern.getBytes());
- case MIDDLE:
- return midLike(bytes, start, len, simplePattern.getBytes());
- case COMPLEX:
- return complexLike(bytes, start, len);
- default:
- return false;
- }
+ @Override
+ protected List<CheckerFactory> getCheckerFactories() {
+ return checkerFactories;
}
- private static boolean noneLike(byte[] byteS, int start, int len, byte[] byteSub) {
- int lenSub = byteSub.length;
- if (len != lenSub) {
- return false;
- }
- for (int i = start, j = 0; j < len; i++, j++) {
- if (byteS[i] != byteSub[j]) {
- return false;
- }
- }
- return true;
- }
+ /**
+ * Accepts simple LIKE patterns like "abc%" and creates corresponding checkers.
+ */
+ private static class BeginCheckerFactory implements CheckerFactory {
+ private static final Pattern BEGIN_PATTERN = Pattern.compile("([^_%]+)%");
- private static boolean beginLike(byte[] byteS, int start, int len, byte[] byteSub) {
- if (len < byteSub.length) {
- return false;
- }
- for (int i = start, j = 0; j < byteSub.length; i++, j++) {
- if (byteS[i] != byteSub[j]) {
- return false;
+ public Checker tryCreate(String pattern) {
+ Matcher matcher = BEGIN_PATTERN.matcher(pattern);
+ if (matcher.matches()) {
+ return new BeginChecker(matcher.group(1));
}
+ return null;
}
- return true;
}
- private static boolean endLike(byte[] byteS, int start, int len, byte[] byteSub) {
- int lenSub = byteSub.length;
- if (len < lenSub) {
- return false;
- }
- for (int i = start + len - lenSub, j = 0; j < lenSub; i++, j++) {
- if (byteS[i] != byteSub[j]) {
- return false;
- }
- }
- return true;
- }
+ /**
+ * Accepts simple LIKE patterns like "%abc" and creates a corresponding checkers.
+ */
+ private static class EndCheckerFactory implements CheckerFactory {
+ private static final Pattern END_PATTERN = Pattern.compile("%([^_%]+)");
- private static boolean midLike(byte[] byteS, int start, int len, byte[] byteSub) {
- int lenSub = byteSub.length;
- if (len < lenSub) {
- return false;
- }
- int end = start + len - lenSub + 1;
- boolean match = false;
- for (int i = start; (i < end) && (!match); i++) {
- match = true;
- for (int j = 0; j < lenSub; j++) {
- if (byteS[i + j] != byteSub[j]) {
- match = false;
- break;
- }
+ public Checker tryCreate(String pattern) {
+ Matcher matcher = END_PATTERN.matcher(pattern);
+ if (matcher.matches()) {
+ return new EndChecker(matcher.group(1));
}
+ return null;
}
- return match;
}
/**
- * Matches the byte array against the complex like pattern. This method uses
- * {@link #compiledPattern} to match. For decoding performance, it caches
- * {@link #compiledPattern}, {@link #byteBuffer} and {@link #charBuffer}.
- * When the length to decode is greater than the capacity of
- * {@link #byteBuffer}, it creates new {@link #byteBuffer} and
- * {@link #charBuffer}. The capacity of the new {@link #byteBuffer} is the
- * double of the length, for fewer object creations and higher memory
- * utilization.
- *
- * @param byteS
- * A byte array that contains a UTF-8 string.
- * @param start
- * A position to start decoding.
- * @param len
- * A length to decode.
- * @return
- * true if the byte array matches the complex like pattern,
- * otherwise false.
+ * Accepts simple LIKE patterns like "%abc%" and creates a corresponding checkers.
*/
- private boolean complexLike(byte[] byteS, int start, int len) {
- // Prepare buffers
- if (byteBuffer.capacity() < len) {
- byteBuffer = ByteBuffer.allocate(len * 2);
- }
- byteBuffer.clear();
- byteBuffer.put(byteS, start, len);
- byteBuffer.flip();
-
- int maxChars = (int) (byteBuffer.capacity() * decoder.maxCharsPerByte());
- if (charBuffer.capacity() < maxChars) {
- charBuffer = CharBuffer.allocate(maxChars);
- }
- charBuffer.clear();
-
- // Decode UTF-8
- decoder.reset();
- decoder.decode(byteBuffer, charBuffer, true);
- decoder.flush(charBuffer);
- charBuffer.flip();
-
- // Match the given bytes with the like pattern
- return compiledPattern.matcher(charBuffer).matches();
- }
+ private static class MiddleCheckerFactory implements CheckerFactory {
+ private static final Pattern MIDDLE_PATTERN = Pattern.compile("%([^_%]+)%");
- /**
- * Parses the likePattern. Based on it is a simple pattern or not, the
- * function might change two member variables. {@link #type} will be changed
- * to the corresponding pattern type; {@link #simplePattern} will record the
- * string in it for later pattern matching if it is a simple pattern.
- * <p>
- * Examples: <blockquote>
- *
- * <pre>
- * parseSimplePattern("%abc%") changes {@link #type} to PatternType.MIDDLE
- * and changes {@link #simplePattern} to "abc"
- * parseSimplePattern("%ab_c%") changes {@link #type} to PatternType.COMPLEX
- * and does not change {@link #simplePattern}
- * </pre>
- *
- * </blockquote>
- *
- * @param likePattern
- * the input LIKE query pattern
- */
- private void parseSimplePattern(String likePattern) {
- int length = likePattern.length();
- int beginIndex = 0;
- int endIndex = length;
- char lastChar = 'a';
- String strPattern = new String();
- type = PatternType.NONE;
-
- for (int i = 0; i < length; i++) {
- char n = likePattern.charAt(i);
- if (n == '_') { // such as "a_b"
- if (lastChar != '\\') { // such as "a%bc"
- type = PatternType.COMPLEX;
- return;
- } else { // such as "abc\%de%"
- strPattern += likePattern.substring(beginIndex, i - 1);
- beginIndex = i;
- }
- } else if (n == '%') {
- if (i == 0) { // such as "%abc"
- type = PatternType.END;
- beginIndex = 1;
- } else if (i < length - 1) {
- if (lastChar != '\\') { // such as "a%bc"
- type = PatternType.COMPLEX;
- return;
- } else { // such as "abc\%de%"
- strPattern += likePattern.substring(beginIndex, i - 1);
- beginIndex = i;
- }
- } else {
- if (lastChar != '\\') {
- endIndex = length - 1;
- if (type == PatternType.END) { // such as "%abc%"
- type = PatternType.MIDDLE;
- } else {
- type = PatternType.BEGIN; // such as "abc%"
- }
- } else { // such as "abc\%"
- strPattern += likePattern.substring(beginIndex, i - 1);
- beginIndex = i;
- endIndex = length;
- }
- }
+ public Checker tryCreate(String pattern) {
+ Matcher matcher = MIDDLE_PATTERN.matcher(pattern);
+ if (matcher.matches()) {
+ return new MiddleChecker(matcher.group(1));
}
- lastChar = n;
+ return null;
}
-
- strPattern += likePattern.substring(beginIndex, endIndex);
- simpleStringPattern = strPattern;
- simplePattern.set(simpleStringPattern);
}
- @Override
- public void evaluate(VectorizedRowBatch batch) {
-
- if (childExpressions != null) {
- super.evaluateChildren(batch);
- }
-
- BytesColumnVector inputColVector = (BytesColumnVector) batch.cols[colNum];
- int[] sel = batch.selected;
- boolean[] nullPos = inputColVector.isNull;
- int n = batch.size;
- byte[][] vector = inputColVector.vector;
- int[] length = inputColVector.length;
- int[] start = inputColVector.start;
- byte[] simplePatternBytes = simplePattern.getBytes();
-
- // return immediately if batch is empty
- if (n == 0) {
- return;
- }
-
- if (inputColVector.noNulls) {
- if (inputColVector.isRepeating) {
-
- // All must be selected otherwise size would be zero Repeating property will not change.
- if (!like(vector[0], start[0], length[0])) {
-
- // Entire batch is filtered out.
- batch.size = 0;
- }
- } else if (batch.selectedInUse) {
- int newSize = 0;
-
- switch (type) {
- case NONE:
- for (int j = 0; j != n; j++) {
- int i = sel[j];
- if (noneLike(vector[i], start[i], length[i], simplePatternBytes)) {
- sel[newSize++] = i;
- }
- }
- break;
- case BEGIN:
- for (int j = 0; j != n; j++) {
- int i = sel[j];
- if (beginLike(vector[i], start[i], length[i], simplePatternBytes)) {
- sel[newSize++] = i;
- }
- }
- break;
- case END:
- for (int j = 0; j != n; j++) {
- int i = sel[j];
- if (endLike(vector[i], start[i], length[i], simplePatternBytes)) {
- sel[newSize++] = i;
- }
- }
- break;
- case MIDDLE:
- for (int j = 0; j != n; j++) {
- int i = sel[j];
- if (midLike(vector[i], start[i], length[i], simplePatternBytes)) {
- sel[newSize++] = i;
- }
- }
- break;
- case COMPLEX:
- for (int j = 0; j != n; j++) {
- int i = sel[j];
- if (complexLike(vector[i], start[i], length[i])) {
- sel[newSize++] = i;
- }
- }
- break;
- }
-
- batch.size = newSize;
- } else {
- int newSize = 0;
-
- switch (type) {
- case NONE:
- for (int i = 0; i != n; i++) {
- if (noneLike(vector[i], start[i], length[i], simplePatternBytes)) {
- sel[newSize++] = i;
- }
- }
- break;
- case BEGIN:
- for (int i = 0; i != n; i++) {
- if (beginLike(vector[i], start[i], length[i], simplePatternBytes)) {
- sel[newSize++] = i;
- }
- }
- break;
- case END:
- for (int i = 0; i != n; i++) {
- if (endLike(vector[i], start[i], length[i], simplePatternBytes)) {
- sel[newSize++] = i;
- }
- }
- break;
- case MIDDLE:
- for (int i = 0; i != n; i++) {
- if (midLike(vector[i], start[i], length[i], simplePatternBytes)) {
- sel[newSize++] = i;
- }
- }
- break;
- case COMPLEX:
- for (int i = 0; i != n; i++) {
- if (complexLike(vector[i], start[i], length[i])) {
- sel[newSize++] = i;
- }
- }
- break;
- }
-
- if (newSize < n) {
- batch.size = newSize;
- batch.selectedInUse = true;
- }
- }
- } else {
- if (inputColVector.isRepeating) {
+ /**
+ * Accepts simple LIKE patterns like "abc" and creates corresponding checkers.
+ */
+ private static class NoneCheckerFactory implements CheckerFactory {
+ private static final Pattern NONE_PATTERN = Pattern.compile("[^%_]+");
- //All must be selected otherwise size would be zero. Repeating property will not change.
- if (!nullPos[0]) {
- if (!like(vector[0], start[0], length[0])) {
-
- //Entire batch is filtered out.
- batch.size = 0;
- }
- } else {
- batch.size = 0;
- }
- } else if (batch.selectedInUse) {
- int newSize = 0;
-
- switch (type) {
- case NONE:
- for (int j = 0; j != n; j++) {
- int i = sel[j];
- if (!nullPos[i]) {
- if (noneLike(vector[i], start[i], length[i], simplePatternBytes)) {
- sel[newSize++] = i;
- }
- }
- }
- break;
- case BEGIN:
- for (int j = 0; j != n; j++) {
- int i = sel[j];
- if (!nullPos[i]) {
- if (beginLike(vector[i], start[i], length[i], simplePatternBytes)) {
- sel[newSize++] = i;
- }
- }
- }
- break;
- case END:
- for (int j = 0; j != n; j++) {
- int i = sel[j];
- if (!nullPos[i]) {
- if (endLike(vector[i], start[i], length[i], simplePatternBytes)) {
- sel[newSize++] = i;
- }
- }
- }
- break;
- case MIDDLE:
- for (int j = 0; j != n; j++) {
- int i = sel[j];
- if (!nullPos[i]) {
- if (midLike(vector[i], start[i], length[i], simplePatternBytes)) {
- sel[newSize++] = i;
- }
- }
- }
- break;
- case COMPLEX:
- for (int j = 0; j != n; j++) {
- int i = sel[j];
- if (!nullPos[i]) {
- if (complexLike(vector[i], start[i], length[i])) {
- sel[newSize++] = i;
- }
- }
- }
- break;
- }
-
- //Change the selected vector
- batch.size = newSize;
- } else {
- int newSize = 0;
-
- switch (type) {
- case NONE:
- for (int i = 0; i != n; i++) {
- if (!nullPos[i]) {
- if (noneLike(vector[i], start[i], length[i], simplePatternBytes)) {
- sel[newSize++] = i;
- }
- }
- }
- break;
- case BEGIN:
- for (int i = 0; i != n; i++) {
- if (!nullPos[i]) {
- if (beginLike(vector[i], start[i], length[i], simplePatternBytes)) {
- sel[newSize++] = i;
- }
- }
- }
- break;
- case END:
- for (int i = 0; i != n; i++) {
- if (!nullPos[i]) {
- if (endLike(vector[i], start[i], length[i], simplePatternBytes)) {
- sel[newSize++] = i;
- }
- }
- }
- break;
- case MIDDLE:
- for (int i = 0; i != n; i++) {
- if (!nullPos[i]) {
- if (midLike(vector[i], start[i], length[i], simplePatternBytes)) {
- sel[newSize++] = i;
- }
- }
- }
- break;
- case COMPLEX:
- for (int i = 0; i != n; i++) {
- if (!nullPos[i]) {
- if (complexLike(vector[i], start[i], length[i])) {
- sel[newSize++] = i;
- }
- }
- }
- break;
- }
-
- if (newSize < n) {
- batch.size = newSize;
- batch.selectedInUse = true;
- }
-
- /* If every row qualified (newSize==n), then we can ignore the sel vector to streamline
- * future operations. So selectedInUse will remain false.
- */
+ public Checker tryCreate(String pattern) {
+ Matcher matcher = NONE_PATTERN.matcher(pattern);
+ if (matcher.matches()) {
+ return new NoneChecker(pattern);
}
+ return null;
}
}
- @Override
- public int getOutputColumn() {
- return -1;
- }
-
- @Override
- public String getOutputType() {
- return "boolean";
- }
-
- public int getColNum() {
- return colNum;
- }
-
- public void setColNum(int colNum) {
- this.colNum = colNum;
- }
-
- public Pattern getCompiledPattern() {
- return compiledPattern;
- }
-
- public void setCompiledPattern(Pattern compiledPattern) {
- this.compiledPattern = compiledPattern;
- }
-
- public void setType(PatternType type) {
- this.type = type;
- }
-
- public String getSimpleStringPattern() {
- return simpleStringPattern;
- }
-
- public void setSimpleStringPattern(String simpleStringPattern) {
- this.simpleStringPattern = simpleStringPattern;
- simplePattern.set(simpleStringPattern);
+ /**
+ * Accepts any LIKE patterns and creates corresponding checkers.
+ */
+ private static class ComplexCheckerFactory implements CheckerFactory {
+ public Checker tryCreate(String pattern) {
+ return new ComplexChecker(UDFLike.likePatternToRegExp(pattern));
+ }
}
}
Added: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterStringColRegExpStringScalar.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterStringColRegExpStringScalar.java?rev=1528917&view=auto
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterStringColRegExpStringScalar.java (added)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/FilterStringColRegExpStringScalar.java Thu Oct 3 16:10:32 2013
@@ -0,0 +1,181 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.exec.vector.expressions;
+
+import org.apache.hadoop.io.Text;
+
+import java.util.Arrays;
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * Evaluate REGEXP filter on a batch for a vector of strings.
+ */
+public class FilterStringColRegExpStringScalar extends AbstractFilterStringColLikeStringScalar {
+ private static final long serialVersionUID = 1L;
+
+ private static final String LITERAL_CHAR = "[^\\[\\]\\\\(){}*?+|$^.]";
+ private static final String LITERAL_CHAR_GROUP = "(" + LITERAL_CHAR + "+)";
+
+ private transient static List<CheckerFactory> checkerFactories = Arrays.asList(
+ new BeginCheckerFactory(),
+ new EndCheckerFactory(),
+ new MiddleCheckerFactory(),
+ new PhoneNumberCheckerFactory(),
+ new NoneCheckerFactory(),
+ new ComplexCheckerFactory());
+
+ public FilterStringColRegExpStringScalar() {
+ super();
+ }
+
+ public FilterStringColRegExpStringScalar(int colNum, Text regExpPattern) {
+ super(colNum, regExpPattern.toString());
+ }
+
+ @Override
+ protected List<CheckerFactory> getCheckerFactories() {
+ return checkerFactories;
+ }
+
+ /**
+ * Accepts simple REGEXP patterns like "abc.*" and creates corresponding checkers.
+ */
+ private static class BeginCheckerFactory implements CheckerFactory {
+ private static final Pattern BEGIN_PATTERN = Pattern.compile(LITERAL_CHAR_GROUP + "\\.\\*");
+
+ public Checker tryCreate(String pattern) {
+ Matcher matcher = BEGIN_PATTERN.matcher(pattern);
+ if (matcher.matches()) {
+ return new BeginChecker(matcher.group(1));
+ }
+ return null;
+ }
+ }
+
+ /**
+ * Accepts simple REGEXP patterns like ".*abc" and creates corresponding checkers.
+ */
+ private static class EndCheckerFactory implements CheckerFactory {
+ private static final Pattern END_PATTERN = Pattern.compile("\\.\\*" + LITERAL_CHAR_GROUP);
+
+ public Checker tryCreate(String pattern) {
+ Matcher matcher = END_PATTERN.matcher(pattern);
+ if (matcher.matches()) {
+ return new EndChecker(matcher.group(1));
+ }
+ return null;
+ }
+ }
+
+ /**
+ * Accepts simple REGEXP patterns like ".*abc.*" and creates corresponding checkers.
+ */
+ private static class MiddleCheckerFactory implements CheckerFactory {
+ private static final Pattern MIDDLE_PATTERN = Pattern.compile("\\.\\*" + LITERAL_CHAR_GROUP + "\\.\\*");
+
+ public Checker tryCreate(String pattern) {
+ Matcher matcher = MIDDLE_PATTERN.matcher(pattern);
+ if (matcher.matches()) {
+ return new MiddleChecker(matcher.group(1));
+ }
+ return null;
+ }
+ }
+
+ /**
+ * Accepts simple phone number regular expressions consisted only with "\(", "\)", "-", " ", "\d".
+ * For example, it accepts "(\d\d\d) \d\d\d-\d\d\d\d" then matches "(012) 345-6789".
+ */
+ private static class PhoneNumberChecker implements Checker {
+ byte[] byteSub;
+
+ PhoneNumberChecker(String pattern) {
+ this.byteSub = pattern.getBytes();
+ }
+
+ public boolean check(byte[] byteS, int start, int len) {
+ for (int i = 0; i < len; i++) {
+ byte c = byteS[start + i];
+ byte p = byteSub[i];
+ switch (p) {
+ // For pattern 'd', find digits.
+ case 'd':
+ if (!('0' <= c && c <= '9')) {
+ return false;
+ }
+ break;
+
+ // For other registered patterns, find exact matches.
+ case '-':
+ case ' ':
+ case '(':
+ case ')':
+ if (c != p) {
+ return false;
+ }
+ break;
+
+ // For unregistered patterns, fail.
+ default:
+ return false;
+ }
+ }
+ return true;
+ }
+ }
+
+ /**
+ * Accepts phone number REGEXP patterns like "\(\d\d\d\) \d\d\d-\d\d\d\d" and creates
+ * corresponding checkers.
+ */
+ private static class PhoneNumberCheckerFactory implements CheckerFactory {
+ public Checker tryCreate(String pattern) {
+ if (pattern.matches("(\\\\d|\\\\\\(|\\\\\\)|-| )+")) {
+ return new PhoneNumberChecker(pattern.replaceAll("\\\\d", "d").replaceAll("\\\\\\(", "(").replaceAll("\\\\\\)", ")"));
+ }
+ return null;
+ }
+ }
+
+ /**
+ * Accepts simple REGEXP patterns like "abc" and creates corresponding checkers.
+ */
+ private static class NoneCheckerFactory implements CheckerFactory {
+ private static final Pattern NONE_PATTERN = Pattern.compile(LITERAL_CHAR_GROUP);
+
+ public Checker tryCreate(String pattern) {
+ Matcher matcher = NONE_PATTERN.matcher(pattern);
+ if (matcher.matches()) {
+ return new NoneChecker(matcher.group(1));
+ }
+ return null;
+ }
+ }
+
+ /**
+ * Accepts any REGEXP patterns and creates corresponding checkers.
+ */
+ private static class ComplexCheckerFactory implements CheckerFactory {
+ public Checker tryCreate(String pattern) {
+ return new ComplexChecker(pattern);
+ }
+ }
+}
Modified: hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorStringExpressions.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorStringExpressions.java?rev=1528917&r1=1528916&r2=1528917&view=diff
==============================================================================
--- hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorStringExpressions.java (original)
+++ hive/trunk/ql/src/test/org/apache/hadoop/hive/ql/exec/vector/expressions/TestVectorStringExpressions.java Thu Oct 3 16:10:32 2013
@@ -670,28 +670,28 @@ public class TestVectorStringExpressions
// BEGIN pattern
expr = new FilterStringColLikeStringScalar(0, new Text("abc%"));
- Assert.assertEquals(FilterStringColLikeStringScalar.PatternType.BEGIN,
- expr.getType());
+ Assert.assertEquals(FilterStringColLikeStringScalar.BeginChecker.class,
+ expr.checker.getClass());
// END pattern
expr = new FilterStringColLikeStringScalar(0, new Text("%abc"));
- Assert.assertEquals(FilterStringColLikeStringScalar.PatternType.END,
- expr.getType());
+ Assert.assertEquals(FilterStringColLikeStringScalar.EndChecker.class,
+ expr.checker.getClass());
// MIDDLE pattern
expr = new FilterStringColLikeStringScalar(0, new Text("%abc%"));
- Assert.assertEquals(FilterStringColLikeStringScalar.PatternType.MIDDLE,
- expr.getType());
+ Assert.assertEquals(FilterStringColLikeStringScalar.MiddleChecker.class,
+ expr.checker.getClass());
// COMPLEX pattern
expr = new FilterStringColLikeStringScalar(0, new Text("%abc%de"));
- Assert.assertEquals(FilterStringColLikeStringScalar.PatternType.COMPLEX,
- expr.getType());
+ Assert.assertEquals(FilterStringColLikeStringScalar.ComplexChecker.class,
+ expr.checker.getClass());
// NONE pattern
expr = new FilterStringColLikeStringScalar(0, new Text("abc"));
- Assert.assertEquals(FilterStringColLikeStringScalar.PatternType.NONE,
- expr.getType());
+ Assert.assertEquals(FilterStringColLikeStringScalar.NoneChecker.class,
+ expr.checker.getClass());
}
public void testStringLikeMultiByte() {