You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by ma...@apache.org on 2014/09/20 00:29:44 UTC
git commit: [SPARK-3501] [SQL] Fix the bug of Hive SimpleUDF creates unnecessary type cast

Repository: spark
Updated Branches:
  refs/heads/master fce5e251d -> 2c3cc7641


[SPARK-3501] [SQL] Fix the bug of Hive SimpleUDF creates unnecessary type cast

When running a query like:
```
select datediff(cast(value as timestamp), cast('2002-03-21 00:00:00' as timestamp)) from src;
```
Spark SQL raises the following exception:
```
[info] scala.MatchError: TimestampType (of class org.apache.spark.sql.catalyst.types.TimestampType$)
[info] at org.apache.spark.sql.catalyst.expressions.Cast.castToTimestamp(Cast.scala:77)
[info] at org.apache.spark.sql.catalyst.expressions.Cast.cast$lzycompute(Cast.scala:251)
[info] at org.apache.spark.sql.catalyst.expressions.Cast.cast(Cast.scala:247)
[info] at org.apache.spark.sql.catalyst.expressions.Cast.eval(Cast.scala:263)
[info] at org.apache.spark.sql.catalyst.optimizer.ConstantFolding$$anonfun$apply$5$$anonfun$applyOrElse$2.applyOrElse(Optimizer.scala:217)
[info] at org.apache.spark.sql.catalyst.optimizer.ConstantFolding$$anonfun$apply$5$$anonfun$applyOrElse$2.applyOrElse(Optimizer.scala:210)
[info] at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:144)
[info] at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$4$$anonfun$apply$2.apply(TreeNode.scala:180)
[info] at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:244)
[info] at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:244)
```

Author: Cheng Hao <ha...@intel.com>

Closes #2368 from chenghao-intel/cast_exception and squashes the following commits:

5c9c3a5 [Cheng Hao] make more clear code
49dfc50 [Cheng Hao] Add no-op for Cast and revert the position of SimplifyCasts
b804abd [Cheng Hao] Add unit test to show the failure in identical data type casting
330a5c8 [Cheng Hao] Update Code based on comments
b834ed4 [Cheng Hao] Fix bug of HiveSimpleUDF with unnecessary type cast which cause exception in constant folding


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/2c3cc764
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/2c3cc764
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/2c3cc764

Branch: refs/heads/master
Commit: 2c3cc7641d86fa5196406955325a042890f77563
Parents: fce5e25
Author: Cheng Hao <ha...@intel.com>
Authored: Fri Sep 19 15:29:22 2014 -0700
Committer: Michael Armbrust <mi...@databricks.com>
Committed: Fri Sep 19 15:29:22 2014 -0700

----------------------------------------------------------------------
 .../org/apache/spark/sql/catalyst/expressions/Cast.scala |  1 +
 .../main/scala/org/apache/spark/sql/hive/hiveUdfs.scala  |  3 ++-
 ...o Timestamp in UDF-0-66952a3949d7544716fd1a675498b1fa |  1 +
 .../apache/spark/sql/hive/execution/HiveQuerySuite.scala | 11 ++++++++++-
 4 files changed, 14 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/2c3cc764/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
----------------------------------------------------------------------
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
index 0ad2b30..0379275 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
@@ -245,6 +245,7 @@ case class Cast(child: Expression, dataType: DataType) extends UnaryExpression {
   }
 
   private[this] lazy val cast: Any => Any = dataType match {
+    case dt if dt == child.dataType => identity[Any]
     case StringType => castToString
     case BinaryType => castToBinary
     case DecimalType => castToDecimal

http://git-wip-us.apache.org/repos/asf/spark/blob/2c3cc764/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUdfs.scala
----------------------------------------------------------------------
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUdfs.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUdfs.scala
index 7d1ad53..7cda0dd 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUdfs.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/hiveUdfs.scala
@@ -51,12 +51,13 @@ private[hive] abstract class HiveFunctionRegistry
       val function = functionInfo.getFunctionClass.newInstance().asInstanceOf[UDF]
       val method = function.getResolver.getEvalMethod(children.map(_.dataType.toTypeInfo))
 
-      lazy val expectedDataTypes = method.getParameterTypes.map(javaClassToDataType)
+      val expectedDataTypes = method.getParameterTypes.map(javaClassToDataType)
 
       HiveSimpleUdf(
         functionClassName,
         children.zip(expectedDataTypes).map {
           case (e, NullType) => e
+          case (e, t) if (e.dataType == t) => e
           case (e, t) => Cast(e, t)
         }
       )

http://git-wip-us.apache.org/repos/asf/spark/blob/2c3cc764/sql/hive/src/test/resources/golden/Cast Timestamp to Timestamp in UDF-0-66952a3949d7544716fd1a675498b1fa
----------------------------------------------------------------------
diff --git a/sql/hive/src/test/resources/golden/Cast Timestamp to Timestamp in UDF-0-66952a3949d7544716fd1a675498b1fa b/sql/hive/src/test/resources/golden/Cast Timestamp to Timestamp in UDF-0-66952a3949d7544716fd1a675498b1fa
new file mode 100644
index 0000000..7951def
--- /dev/null
+++ b/sql/hive/src/test/resources/golden/Cast Timestamp to Timestamp in UDF-0-66952a3949d7544716fd1a675498b1fa	
@@ -0,0 +1 @@
+NULL

http://git-wip-us.apache.org/repos/asf/spark/blob/2c3cc764/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala
----------------------------------------------------------------------
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala
index 8c8a8b1..56bcd95 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveQuerySuite.scala
@@ -142,16 +142,25 @@ class HiveQuerySuite extends HiveComparisonTest {
     setConf("spark.sql.dialect", "sql")
     assert(sql("SELECT 1").collect() === Array(Seq(1)))
     setConf("spark.sql.dialect", "hiveql")
-
   }
 
   test("Query expressed in HiveQL") {
     sql("FROM src SELECT key").collect()
   }
 
+  test("Query with constant folding the CAST") {
+    sql("SELECT CAST(CAST('123' AS binary) AS binary) FROM src LIMIT 1").collect()
+  }
+
   createQueryTest("Constant Folding Optimization for AVG_SUM_COUNT",
     "SELECT AVG(0), SUM(0), COUNT(null), COUNT(value) FROM src GROUP BY key")
 
+  createQueryTest("Cast Timestamp to Timestamp in UDF",
+    """
+       | SELECT DATEDIFF(CAST(value AS timestamp), CAST('2002-03-21 00:00:00' AS timestamp)) 
+       | FROM src LIMIT 1
+    """.stripMargin)
+
   createQueryTest("Simple Average",
     "SELECT AVG(key) FROM src")
 


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org