You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by rx...@apache.org on 2014/03/29 23:12:46 UTC

git commit: Implement the RLike & Like in catalyst

Repository: spark
Updated Branches:
  refs/heads/master 161781609 -> af3746ce0


Implement the RLike & Like in catalyst

This PR includes:
1) Unify the unit test for expression evaluation
2) Add implementation of RLike & Like

Author: Cheng Hao <ha...@intel.com>

Closes #224 from chenghao-intel/string_expression and squashes the following commits:

84f72e9 [Cheng Hao] fix bug in RLike/Like & Simplify the unit test
aeeb1d7 [Cheng Hao] Simplify the implementation/unit test of RLike/Like
319edb7 [Cheng Hao] change to spark code style
91cfd33 [Cheng Hao] add implementation for rlike/like
2c8929e [Cheng Hao] Update the unit test for expression evaluation


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/af3746ce
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/af3746ce
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/af3746ce

Branch: refs/heads/master
Commit: af3746ce0d724dc624658a2187bde188ab26d084
Parents: 1617816
Author: Cheng Hao <ha...@intel.com>
Authored: Sat Mar 29 15:12:43 2014 -0700
Committer: Reynold Xin <rx...@apache.org>
Committed: Sat Mar 29 15:12:43 2014 -0700

----------------------------------------------------------------------
 .../apache/spark/sql/catalyst/SqlParser.scala   |  6 ++
 .../apache/spark/sql/catalyst/dsl/package.scala | 10 +-
 .../catalyst/expressions/stringOperations.scala | 98 +++++++++++++++++++-
 .../expressions/ExpressionEvaluationSuite.scala | 83 +++++++++++++++++
 .../org/apache/spark/sql/hive/HiveQl.scala      |  9 +-
 5 files changed, 196 insertions(+), 10 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/af3746ce/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala
----------------------------------------------------------------------
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala
index 9dec4e3..0c851c2 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/SqlParser.scala
@@ -114,6 +114,9 @@ class SqlParser extends StandardTokenParsers {
   protected val NULL = Keyword("NULL")
   protected val ON = Keyword("ON")
   protected val OR = Keyword("OR")
+  protected val LIKE = Keyword("LIKE")
+  protected val RLIKE = Keyword("RLIKE")
+  protected val REGEXP = Keyword("REGEXP")
   protected val ORDER = Keyword("ORDER")
   protected val OUTER = Keyword("OUTER")
   protected val RIGHT = Keyword("RIGHT")
@@ -267,6 +270,9 @@ class SqlParser extends StandardTokenParsers {
     termExpression ~ ">=" ~ termExpression ^^ { case e1 ~ _ ~ e2 => GreaterThanOrEqual(e1, e2) } |
     termExpression ~ "!=" ~ termExpression ^^ { case e1 ~ _ ~ e2 => Not(Equals(e1, e2)) } |
     termExpression ~ "<>" ~ termExpression ^^ { case e1 ~ _ ~ e2 => Not(Equals(e1, e2)) } |
+    termExpression ~ RLIKE ~ termExpression ^^ { case e1 ~ _ ~ e2 => RLike(e1, e2) } |
+    termExpression ~ REGEXP ~ termExpression ^^ { case e1 ~ _ ~ e2 => RLike(e1, e2) } |
+    termExpression ~ LIKE ~ termExpression ^^ { case e1 ~ _ ~ e2 => Like(e1, e2) } |
     termExpression ~ IN ~ "(" ~ rep1sep(termExpression, ",") <~ ")" ^^ {
       case e1 ~ _ ~ _ ~ e2 => In(e1, e2)
     } |

http://git-wip-us.apache.org/repos/asf/spark/blob/af3746ce/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala
----------------------------------------------------------------------
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala
index 67cddb3..44abe67 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala
@@ -70,6 +70,9 @@ package object dsl {
     def === (other: Expression) = Equals(expr, other)
     def != (other: Expression) = Not(Equals(expr, other))
 
+    def like(other: Expression) = Like(expr, other)
+    def rlike(other: Expression) = RLike(expr, other)
+
     def asc = SortOrder(expr, Ascending)
     def desc = SortOrder(expr, Descending)
 
@@ -90,7 +93,10 @@ package object dsl {
     implicit def symbolToUnresolvedAttribute(s: Symbol) = analysis.UnresolvedAttribute(s.name)
 
     implicit class DslSymbol(sym: Symbol) extends ImplicitAttribute { def s = sym.name }
-    implicit class DslString(val s: String) extends ImplicitAttribute
+    implicit class DslString(val s: String) extends ImplicitOperators {
+      def expr: Expression = Literal(s)
+      def attr = analysis.UnresolvedAttribute(s)
+    }
 
     abstract class ImplicitAttribute extends ImplicitOperators {
       def s: String
@@ -110,6 +116,8 @@ package object dsl {
 
       // Protobuf terminology
       def required = a.withNullability(false)
+
+      def at(ordinal: Int) = BoundReference(ordinal, a)
     }
   }
 

http://git-wip-us.apache.org/repos/asf/spark/blob/af3746ce/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
----------------------------------------------------------------------
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
index e195f2a..42b7a9b 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
@@ -17,11 +17,103 @@
 
 package org.apache.spark.sql.catalyst.expressions
 
+import java.util.regex.Pattern
+
+import org.apache.spark.sql.catalyst.types.DataType
+import org.apache.spark.sql.catalyst.types.StringType
 import org.apache.spark.sql.catalyst.types.BooleanType
+import org.apache.spark.sql.catalyst.trees.TreeNode
+import org.apache.spark.sql.catalyst.errors.`package`.TreeNodeException
+
+
+trait StringRegexExpression {
+  self: BinaryExpression =>
+
+  type EvaluatedType = Any
+  
+  def escape(v: String): String
+  def matches(regex: Pattern, str: String): Boolean
+  
+  def nullable: Boolean = true
+  def dataType: DataType = BooleanType
+  
+  // Compile the pattern once (lazily) when the right operand is a non-null string Literal.
+  private lazy val cache: Pattern = right match {
+    case x @ Literal(value: String, StringType) => compile(value)
+    case _ => null
+  }
+  
+  protected def compile(str: String): Pattern = if(str == null) {
+    null
+  } else {
+    // Deliberately let Pattern.compile throw if the escaped string is not a valid regex.
+    Pattern.compile(escape(str))
+  }
 
-case class Like(left: Expression, right: Expression) extends BinaryExpression {
-  def dataType = BooleanType
-  def nullable = left.nullable // Right cannot be null.
+  protected def pattern(str: String) = if(cache == null) compile(str) else cache
+  
+  override def apply(input: Row): Any = {
+    val l = left.apply(input)
+    if(l == null) {
+      null
+    } else {
+      val r = right.apply(input)
+      if(r == null) {
+        null
+      } else {
+        val regex = pattern(r.asInstanceOf[String])
+        if(regex == null) {
+          null
+        } else {
+          matches(regex, l.asInstanceOf[String])
+        }
+      }
+    }
+  }
+}
+
+/**
+ * SQL LIKE: the pattern is translated to a regex that must match the entire input string.
+ */
+case class Like(left: Expression, right: Expression) 
+  extends BinaryExpression with StringRegexExpression {
+  
   def symbol = "LIKE"
+    
+  // Translate _ to "." (one character) and % to ".*" (zero or more characters).
+  // NOTE: the regex is compiled without DOTALL, so neither wildcard matches a line terminator.
+  override def escape(v: String) = {
+    val sb = new StringBuilder()
+    var i = 0;
+    while (i < v.length) {
+      // A backslash directly before _ or % escapes it: emit that character literally.
+      val n = v.charAt(i);
+      if (n == '\\' && i + 1 < v.length && (v.charAt(i + 1) == '_' || v.charAt(i + 1) == '%')) {
+        sb.append(v.charAt(i + 1))
+        i += 1
+      } else {
+        if (n == '_') {
+          sb.append(".");
+        } else if (n == '%') {
+          sb.append(".*");
+        } else {
+          sb.append(Pattern.quote(Character.toString(n)));
+        }
+      }
+      
+      i += 1
+    }
+    
+    sb.toString()
+  }
+  
+  override def matches(regex: Pattern, str: String): Boolean = regex.matcher(str).matches()
 }
 
+case class RLike(left: Expression, right: Expression) 
+  extends BinaryExpression with StringRegexExpression {
+  
+  def symbol = "RLIKE"
+  override def escape(v: String): String = v
+  override def matches(regex: Pattern, str: String): Boolean = regex.matcher(str).find(0)
+}

http://git-wip-us.apache.org/repos/asf/spark/blob/af3746ce/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvaluationSuite.scala
----------------------------------------------------------------------
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvaluationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvaluationSuite.scala
index 94894ad..52a205b 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvaluationSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvaluationSuite.scala
@@ -109,4 +109,87 @@ class ExpressionEvaluationSuite extends FunSuite {
       }
     }
   }
+
+  def evaluate(expression: Expression, inputRow: Row = EmptyRow): Any = {
+    expression.apply(inputRow)
+  }
+
+  def checkEvaluation(expression: Expression, expected: Any, inputRow: Row = EmptyRow): Unit = {
+    val actual = try evaluate(expression, inputRow) catch {
+      case e: Exception => fail(s"Exception evaluating $expression", e)
+    }
+    if(actual != expected) {
+      val input = if(inputRow == EmptyRow) "" else s", input: $inputRow"
+      fail(s"Incorrect Evaluation: $expression, actual: $actual, expected: $expected$input")
+    }
+  }
+
+  test("LIKE literal Regular Expression") {
+    checkEvaluation(Literal(null, StringType).like("a"), null)
+    checkEvaluation(Literal(null, StringType).like(Literal(null, StringType)), null)
+    checkEvaluation("abdef" like "abdef", true)
+    checkEvaluation("a_%b" like "a\\__b", true)
+    checkEvaluation("addb" like "a_%b", true)
+    checkEvaluation("addb" like "a\\__b", false)
+    checkEvaluation("addb" like "a%\\%b", false)
+    checkEvaluation("a_%b" like "a%\\%b", true)
+    checkEvaluation("addb" like "a%", true)
+    checkEvaluation("addb" like "**", false)
+    checkEvaluation("abc" like "a%", true)
+    checkEvaluation("abc"  like "b%", false)
+    checkEvaluation("abc"  like "bc%", false)
+  }
+  
+  test("LIKE Non-literal Regular Expression") {
+    val regEx = 'a.string.at(0)
+    checkEvaluation("abcd" like regEx, null, new GenericRow(Array[Any](null)))
+    checkEvaluation("abdef" like regEx, true, new GenericRow(Array[Any]("abdef")))
+    checkEvaluation("a_%b" like regEx, true, new GenericRow(Array[Any]("a\\__b")))
+    checkEvaluation("addb" like regEx, true, new GenericRow(Array[Any]("a_%b")))
+    checkEvaluation("addb" like regEx, false, new GenericRow(Array[Any]("a\\__b")))
+    checkEvaluation("addb" like regEx, false, new GenericRow(Array[Any]("a%\\%b")))
+    checkEvaluation("a_%b" like regEx, true, new GenericRow(Array[Any]("a%\\%b")))
+    checkEvaluation("addb" like regEx, true, new GenericRow(Array[Any]("a%")))
+    checkEvaluation("addb" like regEx, false, new GenericRow(Array[Any]("**")))
+    checkEvaluation("abc" like regEx, true, new GenericRow(Array[Any]("a%")))
+    checkEvaluation("abc" like regEx, false, new GenericRow(Array[Any]("b%")))
+    checkEvaluation("abc" like regEx, false, new GenericRow(Array[Any]("bc%")))
+  }
+
+  test("RLIKE literal Regular Expression") {
+    checkEvaluation("abdef" rlike "abdef", true)
+    checkEvaluation("abbbbc" rlike "a.*c", true)
+    
+    checkEvaluation("fofo" rlike "^fo", true)
+    checkEvaluation("fo\no" rlike "^fo\no$", true)
+    checkEvaluation("Bn" rlike "^Ba*n", true)
+    checkEvaluation("afofo" rlike "fo", true)
+    checkEvaluation("afofo" rlike "^fo", false)
+    checkEvaluation("Baan" rlike "^Ba?n", false)
+    checkEvaluation("axe" rlike "pi|apa", false)
+    checkEvaluation("pip" rlike "^(pi)*$", false)
+
+    checkEvaluation("abc"  rlike "^ab", true)
+    checkEvaluation("abc"  rlike "^bc", false)
+    checkEvaluation("abc"  rlike "^ab", true)
+    checkEvaluation("abc"  rlike "^bc", false)
+
+    intercept[java.util.regex.PatternSyntaxException] {
+      evaluate("abbbbc" rlike "**")
+    }
+  }
+
+  test("RLIKE Non-literal Regular Expression") {
+    val regEx = 'a.string.at(0)
+    checkEvaluation("abdef" rlike regEx, true, new GenericRow(Array[Any]("abdef")))
+    checkEvaluation("abbbbc" rlike regEx, true, new GenericRow(Array[Any]("a.*c")))
+    checkEvaluation("fofo" rlike regEx, true, new GenericRow(Array[Any]("^fo")))
+    checkEvaluation("fo\no" rlike regEx, true, new GenericRow(Array[Any]("^fo\no$")))
+    checkEvaluation("Bn" rlike regEx, true, new GenericRow(Array[Any]("^Ba*n")))
+
+    intercept[java.util.regex.PatternSyntaxException] {
+      evaluate("abbbbc" rlike regEx, new GenericRow(Array[Any]("**")))
+    }
+  }
 }
+

http://git-wip-us.apache.org/repos/asf/spark/blob/af3746ce/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala
----------------------------------------------------------------------
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala
index b70ec89..490a592 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala
@@ -847,12 +847,9 @@ object HiveQl {
     case Token(">=", left :: right:: Nil) => GreaterThanOrEqual(nodeToExpr(left), nodeToExpr(right))
     case Token("<", left :: right:: Nil) => LessThan(nodeToExpr(left), nodeToExpr(right))
     case Token("<=", left :: right:: Nil) => LessThanOrEqual(nodeToExpr(left), nodeToExpr(right))
-    case Token("LIKE", left :: right:: Nil) =>
-      UnresolvedFunction("LIKE", Seq(nodeToExpr(left), nodeToExpr(right)))
-    case Token("RLIKE", left :: right:: Nil) =>
-      UnresolvedFunction("RLIKE", Seq(nodeToExpr(left), nodeToExpr(right)))
-    case Token("REGEXP", left :: right:: Nil) =>
-      UnresolvedFunction("REGEXP", Seq(nodeToExpr(left), nodeToExpr(right)))
+    case Token("LIKE", left :: right:: Nil) => Like(nodeToExpr(left), nodeToExpr(right))
+    case Token("RLIKE", left :: right:: Nil) => RLike(nodeToExpr(left), nodeToExpr(right))
+    case Token("REGEXP", left :: right:: Nil) => RLike(nodeToExpr(left), nodeToExpr(right))
     case Token("TOK_FUNCTION", Token("TOK_ISNOTNULL", Nil) :: child :: Nil) =>
       IsNotNull(nodeToExpr(child))
     case Token("TOK_FUNCTION", Token("TOK_ISNULL", Nil) :: child :: Nil) =>