You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by ap...@apache.org on 2015/05/20 00:30:19 UTC

hive git commit: HIVE-2327 Optimize REGEX UDFs with constant parameter information (Alexander Pivovarov, reviewed by Ashutosh Chauhan)

Repository: hive
Updated Branches:
  refs/heads/master c567a77c2 -> 312711b70


HIVE-2327 Optimize REGEX UDFs with constant parameter information (Alexander Pivovarov, reviewed by Ashutosh Chauhan)


Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/312711b7
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/312711b7
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/312711b7

Branch: refs/heads/master
Commit: 312711b705b2af950c475572785fc19660ec1c38
Parents: c567a77
Author: Alexander Pivovarov <ap...@gmail.com>
Authored: Thu Apr 2 22:35:38 2015 -0700
Committer: Alexander Pivovarov <ap...@gmail.com>
Committed: Tue May 19 15:29:20 2015 -0700

----------------------------------------------------------------------
 .../hadoop/hive/ql/exec/FunctionRegistry.java   |   5 +-
 .../hive/ql/optimizer/physical/Vectorizer.java  |   3 +-
 .../apache/hadoop/hive/ql/udf/UDFRegExp.java    |  76 -----------
 .../hive/ql/udf/generic/GenericUDFRegExp.java   | 133 ++++++++++++++++++
 .../ql/udf/generic/TestGenericUDFRegexp.java    | 135 +++++++++++++++++++
 .../spark/vectorization_short_regress.q.out     |   8 +-
 .../tez/vectorization_short_regress.q.out       |   8 +-
 .../vectorization_short_regress.q.out           |   8 +-
 8 files changed, 283 insertions(+), 93 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hive/blob/312711b7/ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java
index 7ce0a1c..9abe15e 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java
@@ -81,7 +81,6 @@ import org.apache.hadoop.hive.ql.udf.UDFPI;
 import org.apache.hadoop.hive.ql.udf.UDFParseUrl;
 import org.apache.hadoop.hive.ql.udf.UDFRadians;
 import org.apache.hadoop.hive.ql.udf.UDFRand;
-import org.apache.hadoop.hive.ql.udf.UDFRegExp;
 import org.apache.hadoop.hive.ql.udf.UDFRegExpExtract;
 import org.apache.hadoop.hive.ql.udf.UDFRegExpReplace;
 import org.apache.hadoop.hive.ql.udf.UDFRepeat;
@@ -249,8 +248,8 @@ public final class FunctionRegistry {
     system.registerGenericUDF("initcap", GenericUDFInitCap.class);
 
     system.registerUDF("like", UDFLike.class, true);
-    system.registerUDF("rlike", UDFRegExp.class, true);
-    system.registerUDF("regexp", UDFRegExp.class, true);
+    system.registerGenericUDF("rlike", GenericUDFRegExp.class);
+    system.registerGenericUDF("regexp", GenericUDFRegExp.class);
     system.registerUDF("regexp_replace", UDFRegExpReplace.class, false);
     system.registerUDF("regexp_extract", UDFRegExpExtract.class, false);
     system.registerUDF("parse_url", UDFParseUrl.class, false);

http://git-wip-us.apache.org/repos/asf/hive/blob/312711b7/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java
index 656a5e3..705b185 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java
@@ -119,7 +119,6 @@ import org.apache.hadoop.hive.ql.udf.UDFMinute;
 import org.apache.hadoop.hive.ql.udf.UDFMonth;
 import org.apache.hadoop.hive.ql.udf.UDFRadians;
 import org.apache.hadoop.hive.ql.udf.UDFRand;
-import org.apache.hadoop.hive.ql.udf.UDFRegExp;
 import org.apache.hadoop.hive.ql.udf.UDFSecond;
 import org.apache.hadoop.hive.ql.udf.UDFSign;
 import org.apache.hadoop.hive.ql.udf.UDFSin;
@@ -227,7 +226,7 @@ public class Vectorizer implements PhysicalPlanResolver {
     supportedGenericUDFs.add(GenericUDFDateDiff.class);
 
     supportedGenericUDFs.add(UDFLike.class);
-    supportedGenericUDFs.add(UDFRegExp.class);
+    supportedGenericUDFs.add(GenericUDFRegExp.class);
     supportedGenericUDFs.add(UDFSubstr.class);
     supportedGenericUDFs.add(GenericUDFLTrim.class);
     supportedGenericUDFs.add(GenericUDFRTrim.class);

http://git-wip-us.apache.org/repos/asf/hive/blob/312711b7/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFRegExp.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFRegExp.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFRegExp.java
deleted file mode 100755
index 76e1d2e..0000000
--- a/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFRegExp.java
+++ /dev/null
@@ -1,76 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.hadoop.hive.ql.udf;
-
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-import org.apache.hadoop.hive.ql.exec.Description;
-import org.apache.hadoop.hive.ql.exec.UDF;
-import org.apache.hadoop.hive.ql.exec.vector.VectorizedExpressions;
-import org.apache.hadoop.hive.ql.exec.vector.expressions.FilterStringColRegExpStringScalar;
-import org.apache.hadoop.io.BooleanWritable;
-import org.apache.hadoop.io.Text;
-
-/**
- * UDFRegExp.
- *
- */
-@Description(name = "rlike,regexp",
-    value = "str _FUNC_ regexp - Returns true if str matches regexp and "
-    + "false otherwise", extended = "Example:\n"
-    + "  > SELECT 'fb' _FUNC_ '.*' FROM src LIMIT 1;\n" + "  true")
-@VectorizedExpressions({FilterStringColRegExpStringScalar.class})
-public class UDFRegExp extends UDF {
-  static final Log LOG = LogFactory.getLog(UDFRegExp.class.getName());
-
-  private final Text lastRegex = new Text();
-  private Pattern p = null;
-  private boolean warned = false;
-
-  private final BooleanWritable result = new BooleanWritable();
-
-  public UDFRegExp() {
-  }
-
-  public BooleanWritable evaluate(Text s, Text regex) {
-    if (s == null || regex == null) {
-      return null;
-    }
-    if (regex.getLength() == 0) {
-      if (!warned) {
-        warned = true;
-        LOG.warn(getClass().getSimpleName() + " regex is empty. Additional "
-            + "warnings for an empty regex will be suppressed.");
-      }
-      result.set(false);
-      return result;
-    }
-    if (!regex.equals(lastRegex) || p == null) {
-      lastRegex.set(regex);
-      p = Pattern.compile(regex.toString());
-    }
-    Matcher m = p.matcher(s.toString());
-    result.set(m.find(0));
-    return result;
-  }
-
-}

http://git-wip-us.apache.org/repos/asf/hive/blob/312711b7/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFRegExp.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFRegExp.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFRegExp.java
new file mode 100644
index 0000000..0a9dd7b
--- /dev/null
+++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFRegExp.java
@@ -0,0 +1,133 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.udf.generic;
+
+import static org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorUtils.PrimitiveGrouping.STRING_GROUP;
+
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.hive.ql.exec.Description;
+import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedExpressions;
+import org.apache.hadoop.hive.ql.exec.vector.expressions.FilterStringColRegExpStringScalar;
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.serde2.objectinspector.ConstantObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorConverters.Converter;
+import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
+import org.apache.hadoop.io.BooleanWritable;
+
+/**
+ * UDF for the rlike/regexp operators: returns true if a java regex is found in
+ * the string. Note that if a regexp has a backslash ('\'), then need to specify
+ * '\\'. For example, '100-200' rlike '(\\d+)-(\\d+)' will return true.
+ */
+@Description(name = "rlike,regexp",
+    value = "str _FUNC_ regexp - Returns true if str matches regexp and "
+    + "false otherwise", extended = "Example:\n"
+    + "  > SELECT 'fb' _FUNC_ '.*' FROM src LIMIT 1;\n" + "  true")
+@VectorizedExpressions({FilterStringColRegExpStringScalar.class})
+public class GenericUDFRegExp extends GenericUDF {
+  static final Log LOG = LogFactory.getLog(GenericUDFRegExp.class.getName());
+  private transient PrimitiveCategory[] inputTypes = new PrimitiveCategory[2];
+  private transient Converter[] converters = new Converter[2];
+  private final BooleanWritable output = new BooleanWritable();
+  private transient boolean isRegexConst;
+  private transient String regexConst;
+  private transient Pattern patternConst;
+  private transient boolean warned;
+
+  @Override
+  public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException {
+    checkArgsSize(arguments, 2, 2);
+
+    checkArgPrimitive(arguments, 0);
+    checkArgPrimitive(arguments, 1);
+
+    checkArgGroups(arguments, 0, inputTypes, STRING_GROUP);
+    checkArgGroups(arguments, 1, inputTypes, STRING_GROUP);
+
+    obtainStringConverter(arguments, 0, inputTypes, converters);
+    obtainStringConverter(arguments, 1, inputTypes, converters);
+
+    if (arguments[1] instanceof ConstantObjectInspector) {
+      regexConst = getConstantStringValue(arguments, 1);
+      if (regexConst != null) {
+        patternConst = Pattern.compile(regexConst);
+      }
+      isRegexConst = true;
+    }
+
+    ObjectInspector outputOI = PrimitiveObjectInspectorFactory.writableBooleanObjectInspector;
+    return outputOI;
+  }
+
+  @Override
+  public Object evaluate(DeferredObject[] arguments) throws HiveException {
+    String s = getStringValue(arguments, 0, converters);
+    if (s == null) {
+      return null;
+    }
+
+    String regex;
+    if (isRegexConst) {
+      regex = regexConst;
+    } else {
+      regex = getStringValue(arguments, 1, converters);
+    }
+    if (regex == null) {
+      return null;
+    }
+
+    if (regex.length() == 0) {
+      if (!warned) {
+        warned = true;
+        LOG.warn(getClass().getSimpleName() + " regex is empty. Additional "
+            + "warnings for an empty regex will be suppressed.");
+      }
+      output.set(false);
+      return output;
+    }
+
+    Pattern p;
+    if (isRegexConst) {
+      p = patternConst;
+    } else {
+      p = Pattern.compile(regex);
+    }
+
+    Matcher m = p.matcher(s);
+    output.set(m.find(0));
+    return output;
+  }
+
+  @Override
+  public String getDisplayString(String[] children) {
+    return children[0] + " regexp " + children[1];
+  }
+
+  @Override
+  protected String getFuncName() {
+    return "regexp";
+  }
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/hive/blob/312711b7/ql/src/test/org/apache/hadoop/hive/ql/udf/generic/TestGenericUDFRegexp.java
----------------------------------------------------------------------
diff --git a/ql/src/test/org/apache/hadoop/hive/ql/udf/generic/TestGenericUDFRegexp.java b/ql/src/test/org/apache/hadoop/hive/ql/udf/generic/TestGenericUDFRegexp.java
new file mode 100644
index 0000000..4e3be90
--- /dev/null
+++ b/ql/src/test/org/apache/hadoop/hive/ql/udf/generic/TestGenericUDFRegexp.java
@@ -0,0 +1,135 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.ql.udf.generic;
+
+import junit.framework.TestCase;
+
+import org.apache.hadoop.hive.ql.metadata.HiveException;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDF.DeferredJavaObject;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDF.DeferredObject;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
+import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
+import org.apache.hadoop.io.BooleanWritable;
+import org.apache.hadoop.io.Text;
+
+public class TestGenericUDFRegexp extends TestCase {
+
+  public void testConstant() throws HiveException {
+    GenericUDFRegExp udf = new GenericUDFRegExp();
+    ObjectInspector valueOI0 = PrimitiveObjectInspectorFactory.writableStringObjectInspector;
+    Text regexText = new Text("^fo");
+    ObjectInspector valueOI1 = PrimitiveObjectInspectorFactory
+        .getPrimitiveWritableConstantObjectInspector(TypeInfoFactory.stringTypeInfo, regexText);
+    ObjectInspector[] arguments = { valueOI0, valueOI1 };
+
+    udf.initialize(arguments);
+
+    runAndVerifyConst("fofo", regexText, true, udf);
+    runAndVerifyConst("fofofo", regexText, true, udf);
+    runAndVerifyConst("fobar", regexText, true, udf);
+    runAndVerifyConst("barfobar", regexText, false, udf);
+    // null
+    runAndVerifyConst(null, regexText, null, udf);
+  }
+
+  public void testEmptyConstant() throws HiveException {
+    GenericUDFRegExp udf = new GenericUDFRegExp();
+    ObjectInspector valueOI0 = PrimitiveObjectInspectorFactory.writableStringObjectInspector;
+    Text regexText = new Text("");
+    ObjectInspector valueOI1 = PrimitiveObjectInspectorFactory
+        .getPrimitiveWritableConstantObjectInspector(TypeInfoFactory.stringTypeInfo, regexText);
+    ObjectInspector[] arguments = { valueOI0, valueOI1 };
+
+    udf.initialize(arguments);
+
+    // empty regex (should be one WARN message)
+    runAndVerifyConst("foo", regexText, false, udf);
+    runAndVerifyConst("bar", regexText, false, udf);
+    // null
+    runAndVerifyConst(null, regexText, null, udf);
+  }
+
+  public void testNullConstant() throws HiveException {
+    GenericUDFRegExp udf = new GenericUDFRegExp();
+    ObjectInspector valueOI0 = PrimitiveObjectInspectorFactory.writableStringObjectInspector;
+    Text regexText = null;
+    ObjectInspector valueOI1 = PrimitiveObjectInspectorFactory
+        .getPrimitiveWritableConstantObjectInspector(TypeInfoFactory.stringTypeInfo, regexText);
+    ObjectInspector[] arguments = { valueOI0, valueOI1 };
+
+    udf.initialize(arguments);
+    // null
+    runAndVerifyConst("fofo", regexText, null, udf);
+    runAndVerifyConst("fofofo", regexText, null, udf);
+    runAndVerifyConst("fobar", regexText, null, udf);
+    runAndVerifyConst(null, regexText, null, udf);
+  }
+
+  public void testNonConstant() throws HiveException {
+    GenericUDFRegExp udf = new GenericUDFRegExp();
+    ObjectInspector valueOI0 = PrimitiveObjectInspectorFactory.writableStringObjectInspector;
+    ObjectInspector valueOI1 = PrimitiveObjectInspectorFactory.writableStringObjectInspector;
+    ObjectInspector[] arguments = { valueOI0, valueOI1 };
+
+    udf.initialize(arguments);
+
+    runAndVerify("fofo", "^fo", true, udf);
+    runAndVerify("fo\no", "^fo\no$", true, udf);
+    runAndVerify("Bn", "^Ba*n", true, udf);
+    runAndVerify("afofo", "fo", true, udf);
+    runAndVerify("afofo", "^fo", false, udf);
+    runAndVerify("Baan", "^Ba?n", false, udf);
+    runAndVerify("axe", "pi|apa", false, udf);
+    runAndVerify("pip", "^(pi)*$", false, udf);
+    // empty regex (should be one WARN message)
+    runAndVerify("bar", "", false, udf);
+    runAndVerify("foo", "", false, udf);
+    // null
+    runAndVerify(null, "^fo", null, udf);
+    runAndVerify("fofo", null, null, udf);
+  }
+
+  private void runAndVerifyConst(String str, Text regexText, Boolean expResult, GenericUDF udf)
+      throws HiveException {
+    DeferredObject valueObj0 = new DeferredJavaObject(str != null ? new Text(str) : null);
+    DeferredObject valueObj1 = new DeferredJavaObject(regexText);
+    DeferredObject[] args = { valueObj0, valueObj1 };
+    BooleanWritable output = (BooleanWritable) udf.evaluate(args);
+    if (expResult == null) {
+      assertNull(output);
+    } else {
+      assertNotNull(output);
+      assertEquals("regexp() const test ", expResult.booleanValue(), output.get());
+    }
+  }
+
+  private void runAndVerify(String str, String regex, Boolean expResult, GenericUDF udf)
+      throws HiveException {
+    DeferredObject valueObj0 = new DeferredJavaObject(str != null ? new Text(str) : null);
+    DeferredObject valueObj1 = new DeferredJavaObject(regex != null ? new Text(regex) : null);
+    DeferredObject[] args = { valueObj0, valueObj1 };
+    BooleanWritable output = (BooleanWritable) udf.evaluate(args);
+    if (expResult == null) {
+      assertNull(output);
+    } else {
+      assertNotNull(output);
+      assertEquals("regexp() test ", expResult.booleanValue(), output.get());
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/hive/blob/312711b7/ql/src/test/results/clientpositive/spark/vectorization_short_regress.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/spark/vectorization_short_regress.q.out b/ql/src/test/results/clientpositive/spark/vectorization_short_regress.q.out
index a4b8e05..25eb161 100644
--- a/ql/src/test/results/clientpositive/spark/vectorization_short_regress.q.out
+++ b/ql/src/test/results/clientpositive/spark/vectorization_short_regress.q.out
@@ -360,7 +360,7 @@ STAGE PLANS:
                   alias: alltypesorc
                   Statistics: Num rows: 12288 Data size: 377237 Basic stats: COMPLETE Column stats: NONE
                   Filter Operator
-                    predicate: (((((cbigint <= 197) and (cint < cbigint)) or ((cdouble >= -26.28) and (csmallint > cdouble))) or ((ctinyint > cfloat) and (cstring1 rlike '.*ss.*'))) or ((cfloat > 79.553) and (cstring2 like '10%'))) (type: boolean)
+                    predicate: (((((cbigint <= 197) and (cint < cbigint)) or ((cdouble >= -26.28) and (csmallint > cdouble))) or ((ctinyint > cfloat) and cstring1 regexp '.*ss.*')) or ((cfloat > 79.553) and (cstring2 like '10%'))) (type: boolean)
                     Statistics: Num rows: 6826 Data size: 209555 Basic stats: COMPLETE Column stats: NONE
                     Select Operator
                       expressions: cint (type: int), cbigint (type: bigint), csmallint (type: smallint), cdouble (type: double), ctinyint (type: tinyint)
@@ -935,7 +935,7 @@ STAGE PLANS:
                   alias: alltypesorc
                   Statistics: Num rows: 12288 Data size: 377237 Basic stats: COMPLETE Column stats: NONE
                   Filter Operator
-                    predicate: (((((cstring1 rlike 'a.*') and (cstring2 like '%ss%')) or ((1 <> cboolean2) and ((csmallint < 79.553) and (-257 <> ctinyint)))) or ((cdouble > ctinyint) and (cfloat >= cint))) or ((cint < cbigint) and (ctinyint > cbigint))) (type: boolean)
+                    predicate: ((((cstring1 regexp 'a.*' and (cstring2 like '%ss%')) or ((1 <> cboolean2) and ((csmallint < 79.553) and (-257 <> ctinyint)))) or ((cdouble > ctinyint) and (cfloat >= cint))) or ((cint < cbigint) and (ctinyint > cbigint))) (type: boolean)
                     Statistics: Num rows: 9898 Data size: 303864 Basic stats: COMPLETE Column stats: NONE
                     Select Operator
                       expressions: cint (type: int), cdouble (type: double), ctimestamp2 (type: timestamp), cstring1 (type: string), cboolean2 (type: boolean), ctinyint (type: tinyint), cfloat (type: float), ctimestamp1 (type: timestamp), csmallint (type: smallint), cbigint (type: bigint), (-3728 * cbigint) (type: bigint), (- cint) (type: int), (-863.257 - cint) (type: double), (- csmallint) (type: smallint), (csmallint - (- csmallint)) (type: smallint), ((csmallint - (- csmallint)) + (- csmallint)) (type: smallint), (cint / cint) (type: double), ((-863.257 - cint) - -26.28) (type: double), (- cfloat) (type: float), (cdouble * -89010) (type: double), (ctinyint / 988888) (type: double), (- ctinyint) (type: tinyint), (79.553 / ctinyint) (type: double)
@@ -2339,7 +2339,7 @@ STAGE PLANS:
                   alias: alltypesorc
                   Statistics: Num rows: 12288 Data size: 377237 Basic stats: COMPLETE Column stats: NONE
                   Filter Operator
-                    predicate: ((ctimestamp1 <> 0) and (((((((-257 <> ctinyint) and cboolean2 is not null) and ((cstring1 rlike '.*ss') and (-3 < ctimestamp1))) or (ctimestamp2 = -5)) or ((ctimestamp1 < 0) and (cstring2 like '%b%'))) or (cdouble = cint)) or (cboolean1 is null and (cfloat < cint)))) (type: boolean)
+                    predicate: ((ctimestamp1 <> 0) and (((((((-257 <> ctinyint) and cboolean2 is not null) and (cstring1 regexp '.*ss' and (-3 < ctimestamp1))) or (ctimestamp2 = -5)) or ((ctimestamp1 < 0) and (cstring2 like '%b%'))) or (cdouble = cint)) or (cboolean1 is null and (cfloat < cint)))) (type: boolean)
                     Statistics: Num rows: 12288 Data size: 377237 Basic stats: COMPLETE Column stats: NONE
                     Select Operator
                       expressions: ctimestamp1 (type: timestamp), cstring1 (type: string), cint (type: int), csmallint (type: smallint), ctinyint (type: tinyint), cfloat (type: float), cdouble (type: double)
@@ -2672,7 +2672,7 @@ STAGE PLANS:
                   alias: alltypesorc
                   Statistics: Num rows: 12288 Data size: 377237 Basic stats: COMPLETE Column stats: NONE
                   Filter Operator
-                    predicate: (cboolean1 is not null and (((((cdouble < csmallint) and ((cboolean2 = cboolean1) and (cbigint <= -863.257))) or ((cint >= -257) and (cstring1 is not null and (cboolean1 >= 1)))) or (cstring2 rlike 'b')) or ((csmallint >= ctinyint) and ctimestamp2 is null))) (type: boolean)
+                    predicate: (cboolean1 is not null and (((((cdouble < csmallint) and ((cboolean2 = cboolean1) and (cbigint <= -863.257))) or ((cint >= -257) and (cstring1 is not null and (cboolean1 >= 1)))) or cstring2 regexp 'b') or ((csmallint >= ctinyint) and ctimestamp2 is null))) (type: boolean)
                     Statistics: Num rows: 4778 Data size: 146682 Basic stats: COMPLETE Column stats: NONE
                     Select Operator
                       expressions: cboolean1 (type: boolean), cfloat (type: float), cbigint (type: bigint), cint (type: int), cdouble (type: double), ctinyint (type: tinyint), csmallint (type: smallint)

http://git-wip-us.apache.org/repos/asf/hive/blob/312711b7/ql/src/test/results/clientpositive/tez/vectorization_short_regress.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/tez/vectorization_short_regress.q.out b/ql/src/test/results/clientpositive/tez/vectorization_short_regress.q.out
index a3c723d..bf01f78 100644
--- a/ql/src/test/results/clientpositive/tez/vectorization_short_regress.q.out
+++ b/ql/src/test/results/clientpositive/tez/vectorization_short_regress.q.out
@@ -360,7 +360,7 @@ STAGE PLANS:
                   alias: alltypesorc
                   Statistics: Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE
                   Filter Operator
-                    predicate: (((cbigint <= 197) and (UDFToLong(cint) < cbigint)) or (((cdouble >= -26.28) and (UDFToDouble(csmallint) > cdouble)) or (((UDFToFloat(ctinyint) > cfloat) and (cstring1 rlike '.*ss.*')) or ((cfloat > 79.553) and (cstring2 like '10%'))))) (type: boolean)
+                    predicate: (((cbigint <= 197) and (UDFToLong(cint) < cbigint)) or (((cdouble >= -26.28) and (UDFToDouble(csmallint) > cdouble)) or (((UDFToFloat(ctinyint) > cfloat) and cstring1 regexp '.*ss.*') or ((cfloat > 79.553) and (cstring2 like '10%'))))) (type: boolean)
                     Statistics: Num rows: 6826 Data size: 1467614 Basic stats: COMPLETE Column stats: NONE
                     Select Operator
                       expressions: cint (type: int), cbigint (type: bigint), csmallint (type: smallint), cdouble (type: double), ctinyint (type: tinyint)
@@ -935,7 +935,7 @@ STAGE PLANS:
                   alias: alltypesorc
                   Statistics: Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE
                   Filter Operator
-                    predicate: (((cstring1 rlike 'a.*') and (cstring2 like '%ss%')) or (((1 <> cboolean2) and ((UDFToDouble(csmallint) < 79.553) and (-257 <> UDFToInteger(ctinyint)))) or (((cdouble > UDFToDouble(ctinyint)) and (cfloat >= UDFToFloat(cint))) or ((UDFToLong(cint) < cbigint) and (UDFToLong(ctinyint) > cbigint))))) (type: boolean)
+                    predicate: ((cstring1 regexp 'a.*' and (cstring2 like '%ss%')) or (((1 <> cboolean2) and ((UDFToDouble(csmallint) < 79.553) and (-257 <> UDFToInteger(ctinyint)))) or (((cdouble > UDFToDouble(ctinyint)) and (cfloat >= UDFToFloat(cint))) or ((UDFToLong(cint) < cbigint) and (UDFToLong(ctinyint) > cbigint))))) (type: boolean)
                     Statistics: Num rows: 9898 Data size: 2128105 Basic stats: COMPLETE Column stats: NONE
                     Select Operator
                       expressions: cint (type: int), cdouble (type: double), ctimestamp2 (type: timestamp), cstring1 (type: string), cboolean2 (type: boolean), ctinyint (type: tinyint), cfloat (type: float), ctimestamp1 (type: timestamp), csmallint (type: smallint), cbigint (type: bigint), (-3728 * cbigint) (type: bigint), (- cint) (type: int), (-863.257 - UDFToDouble(cint)) (type: double), (- csmallint) (type: smallint), (csmallint - (- csmallint)) (type: smallint), ((csmallint - (- csmallint)) + (- csmallint)) (type: smallint), (UDFToDouble(cint) / UDFToDouble(cint)) (type: double), ((-863.257 - UDFToDouble(cint)) - -26.28) (type: double), (- cfloat) (type: float), (cdouble * -89010.0) (type: double), (UDFToDouble(ctinyint) / 988888.0) (type: double), (- ctinyint) (type: tinyint), (79.553 / UDFToDouble(ctinyint)) (type: double)
@@ -2339,7 +2339,7 @@ STAGE PLANS:
                   alias: alltypesorc
                   Statistics: Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE
                   Filter Operator
-                    predicate: ((UDFToDouble(ctimestamp1) <> 0.0) and (((-257 <> UDFToInteger(ctinyint)) and (cboolean2 is not null and ((cstring1 rlike '.*ss') and (-3.0 < UDFToDouble(ctimestamp1))))) or ((UDFToDouble(ctimestamp2) = -5.0) or (((UDFToDouble(ctimestamp1) < 0.0) and (cstring2 like '%b%')) or ((cdouble = UDFToDouble(cint)) or (cboolean1 is null and (cfloat < UDFToFloat(cint)))))))) (type: boolean)
+                    predicate: ((UDFToDouble(ctimestamp1) <> 0.0) and (((-257 <> UDFToInteger(ctinyint)) and (cboolean2 is not null and (cstring1 regexp '.*ss' and (-3.0 < UDFToDouble(ctimestamp1))))) or ((UDFToDouble(ctimestamp2) = -5.0) or (((UDFToDouble(ctimestamp1) < 0.0) and (cstring2 like '%b%')) or ((cdouble = UDFToDouble(cint)) or (cboolean1 is null and (cfloat < UDFToFloat(cint)))))))) (type: boolean)
                     Statistics: Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE
                     Select Operator
                       expressions: ctimestamp1 (type: timestamp), cstring1 (type: string), cint (type: int), csmallint (type: smallint), ctinyint (type: tinyint), cfloat (type: float), cdouble (type: double)
@@ -2672,7 +2672,7 @@ STAGE PLANS:
                   alias: alltypesorc
                   Statistics: Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE
                   Filter Operator
-                    predicate: (cboolean1 is not null and (((cdouble < UDFToDouble(csmallint)) and ((cboolean2 = cboolean1) and (UDFToDouble(cbigint) <= -863.257))) or (((cint >= -257) and (cstring1 is not null and (cboolean1 >= 1))) or ((cstring2 rlike 'b') or ((csmallint >= UDFToShort(ctinyint)) and ctimestamp2 is null))))) (type: boolean)
+                    predicate: (cboolean1 is not null and (((cdouble < UDFToDouble(csmallint)) and ((cboolean2 = cboolean1) and (UDFToDouble(cbigint) <= -863.257))) or (((cint >= -257) and (cstring1 is not null and (cboolean1 >= 1))) or (cstring2 regexp 'b' or ((csmallint >= UDFToShort(ctinyint)) and ctimestamp2 is null))))) (type: boolean)
                     Statistics: Num rows: 4778 Data size: 1027287 Basic stats: COMPLETE Column stats: NONE
                     Select Operator
                       expressions: cboolean1 (type: boolean), cfloat (type: float), cbigint (type: bigint), cint (type: int), cdouble (type: double), ctinyint (type: tinyint), csmallint (type: smallint)

http://git-wip-us.apache.org/repos/asf/hive/blob/312711b7/ql/src/test/results/clientpositive/vectorization_short_regress.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/vectorization_short_regress.q.out b/ql/src/test/results/clientpositive/vectorization_short_regress.q.out
index b9ab174..b823d4b 100644
--- a/ql/src/test/results/clientpositive/vectorization_short_regress.q.out
+++ b/ql/src/test/results/clientpositive/vectorization_short_regress.q.out
@@ -349,7 +349,7 @@ STAGE PLANS:
             alias: alltypesorc
             Statistics: Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE
             Filter Operator
-              predicate: (((cbigint <= 197) and (UDFToLong(cint) < cbigint)) or (((cdouble >= -26.28) and (UDFToDouble(csmallint) > cdouble)) or (((UDFToFloat(ctinyint) > cfloat) and (cstring1 rlike '.*ss.*')) or ((cfloat > 79.553) and (cstring2 like '10%'))))) (type: boolean)
+              predicate: (((cbigint <= 197) and (UDFToLong(cint) < cbigint)) or (((cdouble >= -26.28) and (UDFToDouble(csmallint) > cdouble)) or (((UDFToFloat(ctinyint) > cfloat) and cstring1 regexp '.*ss.*') or ((cfloat > 79.553) and (cstring2 like '10%'))))) (type: boolean)
               Statistics: Num rows: 6826 Data size: 1467614 Basic stats: COMPLETE Column stats: NONE
               Select Operator
                 expressions: cint (type: int), cbigint (type: bigint), csmallint (type: smallint), cdouble (type: double), ctinyint (type: tinyint)
@@ -906,7 +906,7 @@ STAGE PLANS:
             alias: alltypesorc
             Statistics: Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE
             Filter Operator
-              predicate: (((cstring1 rlike 'a.*') and (cstring2 like '%ss%')) or (((1 <> cboolean2) and ((UDFToDouble(csmallint) < 79.553) and (-257 <> UDFToInteger(ctinyint)))) or (((cdouble > UDFToDouble(ctinyint)) and (cfloat >= UDFToFloat(cint))) or ((UDFToLong(cint) < cbigint) and (UDFToLong(ctinyint) > cbigint))))) (type: boolean)
+              predicate: ((cstring1 regexp 'a.*' and (cstring2 like '%ss%')) or (((1 <> cboolean2) and ((UDFToDouble(csmallint) < 79.553) and (-257 <> UDFToInteger(ctinyint)))) or (((cdouble > UDFToDouble(ctinyint)) and (cfloat >= UDFToFloat(cint))) or ((UDFToLong(cint) < cbigint) and (UDFToLong(ctinyint) > cbigint))))) (type: boolean)
               Statistics: Num rows: 9898 Data size: 2128105 Basic stats: COMPLETE Column stats: NONE
               Select Operator
                 expressions: cint (type: int), cdouble (type: double), ctimestamp2 (type: timestamp), cstring1 (type: string), cboolean2 (type: boolean), ctinyint (type: tinyint), cfloat (type: float), ctimestamp1 (type: timestamp), csmallint (type: smallint), cbigint (type: bigint), (-3728 * cbigint) (type: bigint), (- cint) (type: int), (-863.257 - UDFToDouble(cint)) (type: double), (- csmallint) (type: smallint), (csmallint - (- csmallint)) (type: smallint), ((csmallint - (- csmallint)) + (- csmallint)) (type: smallint), (UDFToDouble(cint) / UDFToDouble(cint)) (type: double), ((-863.257 - UDFToDouble(cint)) - -26.28) (type: double), (- cfloat) (type: float), (cdouble * -89010.0) (type: double), (UDFToDouble(ctinyint) / 988888.0) (type: double), (- ctinyint) (type: tinyint), (79.553 / UDFToDouble(ctinyint)) (type: double)
@@ -2288,7 +2288,7 @@ STAGE PLANS:
             alias: alltypesorc
             Statistics: Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE
             Filter Operator
-              predicate: ((UDFToDouble(ctimestamp1) <> 0.0) and (((-257 <> UDFToInteger(ctinyint)) and (cboolean2 is not null and ((cstring1 rlike '.*ss') and (-3.0 < UDFToDouble(ctimestamp1))))) or ((UDFToDouble(ctimestamp2) = -5.0) or (((UDFToDouble(ctimestamp1) < 0.0) and (cstring2 like '%b%')) or ((cdouble = UDFToDouble(cint)) or (cboolean1 is null and (cfloat < UDFToFloat(cint)))))))) (type: boolean)
+              predicate: ((UDFToDouble(ctimestamp1) <> 0.0) and (((-257 <> UDFToInteger(ctinyint)) and (cboolean2 is not null and (cstring1 regexp '.*ss' and (-3.0 < UDFToDouble(ctimestamp1))))) or ((UDFToDouble(ctimestamp2) = -5.0) or (((UDFToDouble(ctimestamp1) < 0.0) and (cstring2 like '%b%')) or ((cdouble = UDFToDouble(cint)) or (cboolean1 is null and (cfloat < UDFToFloat(cint)))))))) (type: boolean)
               Statistics: Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE
               Select Operator
                 expressions: ctimestamp1 (type: timestamp), cstring1 (type: string), cint (type: int), csmallint (type: smallint), ctinyint (type: tinyint), cfloat (type: float), cdouble (type: double)
@@ -2624,7 +2624,7 @@ STAGE PLANS:
             alias: alltypesorc
             Statistics: Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE
             Filter Operator
-              predicate: (cboolean1 is not null and (((cdouble < UDFToDouble(csmallint)) and ((cboolean2 = cboolean1) and (UDFToDouble(cbigint) <= -863.257))) or (((cint >= -257) and (cstring1 is not null and (cboolean1 >= 1))) or ((cstring2 rlike 'b') or ((csmallint >= UDFToShort(ctinyint)) and ctimestamp2 is null))))) (type: boolean)
+              predicate: (cboolean1 is not null and (((cdouble < UDFToDouble(csmallint)) and ((cboolean2 = cboolean1) and (UDFToDouble(cbigint) <= -863.257))) or (((cint >= -257) and (cstring1 is not null and (cboolean1 >= 1))) or (cstring2 regexp 'b' or ((csmallint >= UDFToShort(ctinyint)) and ctimestamp2 is null))))) (type: boolean)
               Statistics: Num rows: 4778 Data size: 1027287 Basic stats: COMPLETE Column stats: NONE
               Select Operator
                 expressions: cboolean1 (type: boolean), cfloat (type: float), cbigint (type: bigint), cint (type: int), cdouble (type: double), ctinyint (type: tinyint), csmallint (type: smallint)