Posted to commits@spark.apache.org by li...@apache.org on 2017/01/29 21:00:58 UTC

spark git commit: [SPARK-19385][SQL] During canonicalization, `NOT(...(l, r))` should not expect such cases that l.hashcode > r.hashcode

Repository: spark
Updated Branches:
  refs/heads/master e2e7b12ce -> ade075aed


[SPARK-19385][SQL] During canonicalization, `NOT(...(l, r))` should not expect such cases that l.hashcode > r.hashcode

## What changes were proposed in this pull request?

During canonicalization, the rules handling `NOT(...(l, r))` should not expect cases where `l.hashcode > r.hashcode`.

Take the rule `case NOT(GreaterThan(l, r)) if l.hashcode > r.hashcode` for example: it can never match, because canonicalization is applied bottom-up, so by the time the `NOT` rules run, an inner `GreaterThan(l, r)` with `l.hashcode > r.hashcode` has already been re-written as `LessThan(r, l)`.

This patch therefore drops the unreachable guarded rules such as `case NOT(GreaterThan(l, r)) if l.hashcode > r.hashcode`, keeping only the unguarded `case NOT(GreaterThan(l, r))` variants.
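To make the argument concrete, here is a minimal, self-contained sketch of bottom-up canonicalization. This is toy code, not Spark's: `Expr`, `Attr`, `canonicalize`, and the hard-coded hash values are invented for illustration only.

```scala
// Toy expression tree; names and hash values are illustrative only.
sealed trait Expr
case class Attr(name: String, hash: Int) extends Expr {
  override def hashCode: Int = hash
}
case class GreaterThan(l: Expr, r: Expr) extends Expr
case class GreaterThanOrEqual(l: Expr, r: Expr) extends Expr
case class LessThan(l: Expr, r: Expr) extends Expr
case class LessThanOrEqual(l: Expr, r: Expr) extends Expr
case class Not(child: Expr) extends Expr

def canonicalize(e: Expr): Expr = {
  // 1. Canonicalize the children first (bottom-up), ...
  val withCanonicalChildren = e match {
    case GreaterThan(l, r)        => GreaterThan(canonicalize(l), canonicalize(r))
    case GreaterThanOrEqual(l, r) => GreaterThanOrEqual(canonicalize(l), canonicalize(r))
    case LessThan(l, r)           => LessThan(canonicalize(l), canonicalize(r))
    case LessThanOrEqual(l, r)    => LessThanOrEqual(canonicalize(l), canonicalize(r))
    case Not(c)                   => Not(canonicalize(c))
    case other                    => other
  }
  // 2. ...then rewrite the node itself.
  withCanonicalChildren match {
    // Comparisons are flipped so the operand with the smaller hash ends up on the left.
    case GreaterThan(l, r) if l.hashCode > r.hashCode        => LessThan(r, l)
    case LessThan(l, r) if l.hashCode > r.hashCode           => GreaterThan(r, l)
    case GreaterThanOrEqual(l, r) if l.hashCode > r.hashCode => LessThanOrEqual(r, l)
    case LessThanOrEqual(l, r) if l.hashCode > r.hashCode    => GreaterThanOrEqual(r, l)
    // Because the child was already canonicalized above, l.hashCode <= r.hashCode
    // holds in every `Not` case; a guarded variant of these rules can never match.
    case Not(GreaterThan(l, r))        => LessThanOrEqual(l, r)
    case Not(LessThan(l, r))           => GreaterThanOrEqual(l, r)
    case Not(GreaterThanOrEqual(l, r)) => LessThan(l, r)
    case Not(LessThanOrEqual(l, r))    => GreaterThan(l, r)
    case other                         => other
  }
}

val a = Attr("a", hash = 2)
val b = Attr("b", hash = 1)
// a.hashCode > b.hashCode, so the inner GreaterThan(a, b) is flipped to
// LessThan(b, a) before the Not rules run; NOT(a > b) and b >= a agree.
assert(canonicalize(Not(GreaterThan(a, b))) == GreaterThanOrEqual(b, a))
assert(canonicalize(GreaterThanOrEqual(b, a)) == GreaterThanOrEqual(b, a))
```

The same reasoning applies to the real `Canonicalize` rules: because every child is canonicalized before its parent, the `NOT` cases only ever see comparisons whose left operand already has the smaller (or equal) hash code.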

## How was this patch tested?

This patch expanded the `NOT` test cases to cover both orderings (a sketch of the property being checked follows the list):
- `l.hashcode > r.hashcode`
- `l.hashcode < r.hashcode`
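
The property these tests assert can be illustrated with a quick semantic-equality check against the Catalyst API. This is a minimal sketch, not part of the patch; the attribute name and literal value are arbitrary:

```scala
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.types.IntegerType

val a = AttributeReference("a", IntegerType)()

// NOT(a > 1) and a <= 1 should reach the same canonical form, regardless of
// which operand happens to carry the larger hash code.
val e1 = Not(GreaterThan(a, Literal(1)))
val e2 = LessThanOrEqual(a, Literal(1))
assert(e1.canonicalized == e2.canonicalized)
```

To make both hash orderings deterministic, the suite builds attributes whose hash codes are forced to `Int.MaxValue` and `Int.MinValue` (see `maxHash` / `minHash` in the diff below).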

Author: Liwei Lin <lw...@gmail.com>

Closes #16719 from lw-lin/canonicalize.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ade075ae
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ade075ae
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ade075ae

Branch: refs/heads/master
Commit: ade075aed483cdfc71bdbe8da912679771cef49c
Parents: e2e7b12
Author: Liwei Lin <lw...@gmail.com>
Authored: Sun Jan 29 13:00:50 2017 -0800
Committer: gatorsmile <ga...@gmail.com>
Committed: Sun Jan 29 13:00:50 2017 -0800

----------------------------------------------------------------------
 .../sql/catalyst/expressions/Canonicalize.scala |  8 ++--
 .../expressions/ExpressionSetSuite.scala        | 44 ++++++++++++++++++--
 2 files changed, 43 insertions(+), 9 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/ade075ae/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Canonicalize.scala
----------------------------------------------------------------------
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Canonicalize.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Canonicalize.scala
index e876450..65e497a 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Canonicalize.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Canonicalize.scala
@@ -37,7 +37,7 @@ object Canonicalize extends {
   }
 
   /** Remove names and nullability from types. */
-  private def ignoreNamesTypes(e: Expression): Expression = e match {
+  private[expressions] def ignoreNamesTypes(e: Expression): Expression = e match {
     case a: AttributeReference =>
       AttributeReference("none", a.dataType.asNullable)(exprId = a.exprId)
     case _ => e
@@ -78,13 +78,11 @@ object Canonicalize extends {
     case GreaterThanOrEqual(l, r) if l.hashCode() > r.hashCode() => LessThanOrEqual(r, l)
     case LessThanOrEqual(l, r) if l.hashCode() > r.hashCode() => GreaterThanOrEqual(r, l)
 
-    case Not(GreaterThan(l, r)) if l.hashCode() > r.hashCode() => GreaterThan(r, l)
+    // Note in the following `NOT` cases, `l.hashCode() <= r.hashCode()` holds. The reason is that
+    // canonicalization is conducted bottom-up -- see [[Expression.canonicalized]].
     case Not(GreaterThan(l, r)) => LessThanOrEqual(l, r)
-    case Not(LessThan(l, r)) if l.hashCode() > r.hashCode() => LessThan(r, l)
     case Not(LessThan(l, r)) => GreaterThanOrEqual(l, r)
-    case Not(GreaterThanOrEqual(l, r)) if l.hashCode() > r.hashCode() => GreaterThanOrEqual(r, l)
     case Not(GreaterThanOrEqual(l, r)) => LessThan(l, r)
-    case Not(LessThanOrEqual(l, r)) if l.hashCode() > r.hashCode() => LessThanOrEqual(r, l)
     case Not(LessThanOrEqual(l, r)) => GreaterThan(l, r)
 
     case _ => e

http://git-wip-us.apache.org/repos/asf/spark/blob/ade075ae/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionSetSuite.scala
----------------------------------------------------------------------
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionSetSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionSetSuite.scala
index c587d4f..d617ad5 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionSetSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionSetSuite.scala
@@ -32,6 +32,38 @@ class ExpressionSetSuite extends SparkFunSuite {
 
   val aAndBSet = AttributeSet(aUpper :: bUpper :: Nil)
 
+  // An [AttributeReference] with almost the maximum hashcode, to make testing canonicalize rules
+  // like `case GreaterThan(l, r) if l.hashcode > r.hashcode => GreaterThan(r, l)` easier
+  val maxHash =
+    Canonicalize.ignoreNamesTypes(
+      AttributeReference("maxHash", IntegerType)(exprId =
+        new ExprId(4, NamedExpression.jvmId) {
+          // maxHash's hashcode is calculated based on this exprId's hashcode, so we set this
+          // exprId's hashCode to this specific value to make sure maxHash's hashcode is
+          // `Int.MaxValue`
+          override def hashCode: Int = -1030353449
+          // We are implementing this equals() only because the style-checking rule "you should
+          // implement equals and hashCode together" requires us to
+          override def equals(obj: Any): Boolean = super.equals(obj)
+        })).asInstanceOf[AttributeReference]
+  assert(maxHash.hashCode() == Int.MaxValue)
+
+  // An [AttributeReference] with almost the minimum hashcode, to make testing canonicalize rules
+  // like `case GreaterThan(l, r) if l.hashcode > r.hashcode => GreaterThan(r, l)` easier
+  val minHash =
+    Canonicalize.ignoreNamesTypes(
+      AttributeReference("minHash", IntegerType)(exprId =
+        new ExprId(5, NamedExpression.jvmId) {
+          // minHash's hashcode is calculated based on this exprId's hashcode, so we set this
+          // exprId's hashCode to this specific value to make sure minHash's hashcode is
+          // `Int.MinValue`
+          override def hashCode: Int = 1407330692
+          // We are implementing this equals() only because the style-checking rule "you should
+          // implement equals and hashCode together" requires us to
+          override def equals(obj: Any): Boolean = super.equals(obj)
+        })).asInstanceOf[AttributeReference]
+  assert(minHash.hashCode() == Int.MinValue)
+
   def setTest(size: Int, exprs: Expression*): Unit = {
     test(s"expect $size: ${exprs.mkString(", ")}") {
       val set = ExpressionSet(exprs)
@@ -75,10 +107,14 @@ class ExpressionSetSuite extends SparkFunSuite {
   setTest(1, aUpper >= bUpper, bUpper <= aUpper)
 
   // `Not` canonicalization
-  setTest(1, Not(aUpper > 1), aUpper <= 1, Not(Literal(1) < aUpper), Literal(1) >= aUpper)
-  setTest(1, Not(aUpper < 1), aUpper >= 1, Not(Literal(1) > aUpper), Literal(1) <= aUpper)
-  setTest(1, Not(aUpper >= 1), aUpper < 1, Not(Literal(1) <= aUpper), Literal(1) > aUpper)
-  setTest(1, Not(aUpper <= 1), aUpper > 1, Not(Literal(1) >= aUpper), Literal(1) < aUpper)
+  setTest(1, Not(maxHash > 1), maxHash <= 1, Not(Literal(1) < maxHash), Literal(1) >= maxHash)
+  setTest(1, Not(minHash > 1), minHash <= 1, Not(Literal(1) < minHash), Literal(1) >= minHash)
+  setTest(1, Not(maxHash < 1), maxHash >= 1, Not(Literal(1) > maxHash), Literal(1) <= maxHash)
+  setTest(1, Not(minHash < 1), minHash >= 1, Not(Literal(1) > minHash), Literal(1) <= minHash)
+  setTest(1, Not(maxHash >= 1), maxHash < 1, Not(Literal(1) <= maxHash), Literal(1) > maxHash)
+  setTest(1, Not(minHash >= 1), minHash < 1, Not(Literal(1) <= minHash), Literal(1) > minHash)
+  setTest(1, Not(maxHash <= 1), maxHash > 1, Not(Literal(1) >= maxHash), Literal(1) < maxHash)
+  setTest(1, Not(minHash <= 1), minHash > 1, Not(Literal(1) >= minHash), Literal(1) < minHash)
 
   // Reordering AND/OR expressions
   setTest(1, aUpper > bUpper && aUpper <= 10, aUpper <= 10 && aUpper > bUpper)

