You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by ya...@apache.org on 2020/04/23 05:33:59 UTC

[spark] branch branch-3.0 updated: [SPARK-31515][SQL] Canonicalize Cast should consider the value of needTimeZone

This is an automated email from the ASF dual-hosted git repository.

yamamuro pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.0 by this push:
     new 2ebef75  [SPARK-31515][SQL] Canonicalize Cast should consider the value of needTimeZone
2ebef75 is described below

commit 2ebef75ec654bdbb01a4fa6a85225a7503de84b7
Author: Yuanjian Li <xy...@gmail.com>
AuthorDate: Thu Apr 23 14:32:10 2020 +0900

    [SPARK-31515][SQL] Canonicalize Cast should consider the value of needTimeZone
    
    ### What changes were proposed in this pull request?
    Override the canonicalized fields with respect to the result of `needsTimeZone`.
    
    ### Why are the changes needed?
    The current approach breaks sematic equal of two cast expressions that don't relate with datetime type. If we don't need to use `timeZone` information casting `from` type to `to` type, then the timeZoneId should not influence the canonicalize result.
    
    ### Does this PR introduce any user-facing change?
    No.
    
    ### How was this patch tested?
    New UT added.
    
    Closes #28288 from xuanyuanking/SPARK-31515.
    
    Authored-by: Yuanjian Li <xy...@gmail.com>
    Signed-off-by: Takeshi Yamamuro <ya...@apache.org>
    (cherry picked from commit ca90e1932dcdc43748297c627ec857b6ea97dff7)
    Signed-off-by: Takeshi Yamamuro <ya...@apache.org>
---
 .../apache/spark/sql/catalyst/expressions/Canonicalize.scala  | 10 +++++++++-
 .../org/apache/spark/sql/catalyst/expressions/Cast.scala      |  4 +++-
 .../spark/sql/catalyst/expressions/CanonicalizeSuite.scala    | 11 ++++++++++-
 3 files changed, 22 insertions(+), 3 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Canonicalize.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Canonicalize.scala
index 4d218b9..a803108 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Canonicalize.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Canonicalize.scala
@@ -27,6 +27,7 @@ package org.apache.spark.sql.catalyst.expressions
  * The following rules are applied:
  *  - Names and nullability hints for [[org.apache.spark.sql.types.DataType]]s are stripped.
  *  - Names for [[GetStructField]] are stripped.
+ *  - TimeZoneId for [[Cast]] and [[AnsiCast]] are stripped if `needsTimeZone` is false.
  *  - Commutative and associative operations ([[Add]] and [[Multiply]]) have their children ordered
  *    by `hashCode`.
  *  - [[EqualTo]] and [[EqualNullSafe]] are reordered by `hashCode`.
@@ -35,7 +36,7 @@ package org.apache.spark.sql.catalyst.expressions
  */
 object Canonicalize {
   def execute(e: Expression): Expression = {
-    expressionReorder(ignoreNamesTypes(e))
+    expressionReorder(ignoreTimeZone(ignoreNamesTypes(e)))
   }
 
   /** Remove names and nullability from types, and names from `GetStructField`. */
@@ -46,6 +47,13 @@ object Canonicalize {
     case _ => e
   }
 
+  /** Remove TimeZoneId for Cast if needsTimeZone return false. */
+  private[expressions] def ignoreTimeZone(e: Expression): Expression = e match {
+    case c: CastBase if c.timeZoneId.nonEmpty && !c.needsTimeZone =>
+      c.withTimeZone(null)
+    case _ => e
+  }
+
   /** Collects adjacent commutative operations. */
   private def gatherCommutative(
       e: Expression,
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
index 8d82956..fa615d7 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
@@ -279,7 +279,7 @@ abstract class CastBase extends UnaryExpression with TimeZoneAwareExpression wit
   override lazy val resolved: Boolean =
     childrenResolved && checkInputDataTypes().isSuccess && (!needsTimeZone || timeZoneId.isDefined)
 
-  private[this] def needsTimeZone: Boolean = Cast.needsTimeZone(child.dataType, dataType)
+  def needsTimeZone: Boolean = Cast.needsTimeZone(child.dataType, dataType)
 
   // [[func]] assumes the input is no longer null because eval already does the null check.
   @inline private[this] def buildCast[T](a: Any, func: T => Any): Any = func(a.asInstanceOf[T])
@@ -1708,6 +1708,7 @@ abstract class CastBase extends UnaryExpression with TimeZoneAwareExpression wit
   """)
 case class Cast(child: Expression, dataType: DataType, timeZoneId: Option[String] = None)
   extends CastBase {
+
   override def withTimeZone(timeZoneId: String): TimeZoneAwareExpression =
     copy(timeZoneId = Option(timeZoneId))
 
@@ -1724,6 +1725,7 @@ case class Cast(child: Expression, dataType: DataType, timeZoneId: Option[String
  */
 case class AnsiCast(child: Expression, dataType: DataType, timeZoneId: Option[String] = None)
   extends CastBase {
+
   override def withTimeZone(timeZoneId: String): TimeZoneAwareExpression =
     copy(timeZoneId = Option(timeZoneId))
 
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CanonicalizeSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CanonicalizeSuite.scala
index dd71943..a043b4c 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CanonicalizeSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CanonicalizeSuite.scala
@@ -17,10 +17,12 @@
 
 package org.apache.spark.sql.catalyst.expressions
 
+import java.util.TimeZone
+
 import org.apache.spark.SparkFunSuite
 import org.apache.spark.sql.catalyst.dsl.plans._
 import org.apache.spark.sql.catalyst.plans.logical.Range
-import org.apache.spark.sql.types.{IntegerType, StructField, StructType}
+import org.apache.spark.sql.types.{IntegerType, LongType, StructField, StructType}
 
 class CanonicalizeSuite extends SparkFunSuite {
 
@@ -86,4 +88,11 @@ class CanonicalizeSuite extends SparkFunSuite {
     val subExpr = Subtract(range.output.head, Literal(1))
     assert(addExpr.canonicalized.hashCode() != subExpr.canonicalized.hashCode())
   }
+
+  test("SPARK-31515: Canonicalize Cast should consider the value of needTimeZone") {
+    val literal = Literal(1)
+    val cast = Cast(literal, LongType)
+    val castWithTimeZoneId = Cast(literal, LongType, Some(TimeZone.getDefault.getID))
+    assert(castWithTimeZoneId.semanticEquals(cast))
+  }
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org