Posted to commits@spark.apache.org by hv...@apache.org on 2017/02/13 11:25:17 UTC

spark git commit: [SPARK-19496][SQL] to_date udf to return null when input date is invalid

Repository: spark
Updated Branches:
  refs/heads/master 8f03ad547 -> 04ad82253


[SPARK-19496][SQL] to_date udf to return null when input date is invalid

## What changes were proposed in this pull request?

Currently the `to_date` UDF returns inconsistent results for invalid date input:

```
SELECT to_date('2015-07-22', 'yyyy-dd-MM')  -> returns 2016-10-07
SELECT to_date('2014-31-12')                -> returns null
```

As discussed in JIRA [SPARK-19496](https://issues.apache.org/jira/browse/SPARK-19496), `to_date` should return null in both cases when the input date is invalid.
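The root cause is that `java.text.SimpleDateFormat` is lenient by default, so out-of-range fields are rolled forward instead of rejected. A minimal standalone Scala sketch (not part of this patch) showing the rollover behind the `2016-10-07` result above:

```
import java.text.SimpleDateFormat
import java.util.Locale

val sdf = new SimpleDateFormat("yyyy-dd-MM", Locale.US)
// Lenient (the default): day = 07, month = 22 rolls 21 months past
// January 2015, so "2015-07-22" parses as Oct 07 2016 instead of failing.
println(sdf.parse("2015-07-22"))

// Strict: the same input is now rejected with java.text.ParseException.
sdf.setLenient(false)
println(scala.util.Try(sdf.parse("2015-07-22")))  // Failure(ParseException: ...)
```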

## How was this patch tested?
Unit tests added in `DateFunctionsSuite`.

Author: windpiger <so...@outlook.com>

Closes #16870 from windpiger/to_date.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/04ad8225
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/04ad8225
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/04ad8225

Branch: refs/heads/master
Commit: 04ad822534e8ded96a9ba4b7d43320e53c6d2808
Parents: 8f03ad5
Author: windpiger <so...@outlook.com>
Authored: Mon Feb 13 12:25:13 2017 +0100
Committer: Herman van Hovell <hv...@databricks.com>
Committed: Mon Feb 13 12:25:13 2017 +0100

----------------------------------------------------------------------
 .../spark/sql/catalyst/util/DateTimeUtils.scala |  4 ++
 .../apache/spark/sql/DateFunctionsSuite.scala   | 75 ++++++++++++++++++--
 2 files changed, 75 insertions(+), 4 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/04ad8225/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
----------------------------------------------------------------------
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
index af70efb..9e1de0f 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
@@ -98,6 +98,10 @@ object DateTimeUtils {
   def newDateFormat(formatString: String, timeZone: TimeZone): DateFormat = {
     val sdf = new SimpleDateFormat(formatString, Locale.US)
     sdf.setTimeZone(timeZone)
+    // Enable strict parsing: if the input date or format is invalid, an exception is thrown.
+    // e.g. parsing the invalid date '2016-13-12', or '2016-01-12' with the invalid format
+    // 'yyyy-aa-dd', throws an exception.
+    sdf.setLenient(false)
     sdf
   }
 
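With `setLenient(false)`, every caller of `newDateFormat` now sees an exception on invalid input rather than a rolled-over date, and maps it to null. A rough standalone sketch of that pattern (the helper name is illustrative, not Spark API):

```
import java.text.SimpleDateFormat
import java.util.{Locale, TimeZone}
import scala.util.Try

// Illustrative helper, not Spark API: strict parsing turns any invalid
// date (e.g. '2016-13-12') or mismatched format into a failure, which
// the caller surfaces as None/null instead of a rolled-over date.
def parseOrNone(s: String, format: String): Option[java.util.Date] = {
  Try {
    val sdf = new SimpleDateFormat(format, Locale.US)
    sdf.setTimeZone(TimeZone.getTimeZone("UTC"))
    sdf.setLenient(false)
    sdf.parse(s)
  }.toOption
}

parseOrNone("2016-02-29", "yyyy-MM-dd")  // Some(...) -- 2016 is a leap year
parseOrNone("2017-02-29", "yyyy-MM-dd")  // None -- Feb 29 does not exist in 2017
parseOrNone("2016-13-12", "yyyy-MM-dd")  // None -- month 13 rejected
```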

http://git-wip-us.apache.org/repos/asf/spark/blob/04ad8225/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala
index 618db43..2acda3f 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala
@@ -398,14 +398,27 @@ class DateFunctionsSuite extends QueryTest with SharedSQLContext {
         Row(Date.valueOf("2014-12-31"))))
     checkAnswer(
       df.select(to_date(col("s"), "yyyy-MM-dd")),
-      Seq(Row(Date.valueOf("2015-07-22")), Row(Date.valueOf("2014-12-31")),
-        Row(Date.valueOf("2016-07-12"))))
+      Seq(Row(Date.valueOf("2015-07-22")), Row(Date.valueOf("2014-12-31")), Row(null)))
 
     //  now switch format
     checkAnswer(
       df.select(to_date(col("s"), "yyyy-dd-MM")),
-      Seq(Row(Date.valueOf("2016-10-07")), Row(Date.valueOf("2016-07-12")),
-        Row(Date.valueOf("2014-12-31"))))
+      Seq(Row(null), Row(null), Row(Date.valueOf("2014-12-31"))))
+
+    // invalid format
+    checkAnswer(
+      df.select(to_date(col("s"), "yyyy-hh-MM")),
+      Seq(Row(null), Row(null), Row(null)))
+    checkAnswer(
+      df.select(to_date(col("s"), "yyyy-dd-aa")),
+      Seq(Row(null), Row(null), Row(null)))
+
+    // february
+    val x1 = "2016-02-29"
+    val x2 = "2017-02-29"
+    val df1 = Seq(x1, x2).toDF("x")
+    checkAnswer(
+      df1.select(to_date(col("x"))), Row(Date.valueOf("2016-02-29")) :: Row(null) :: Nil)
   }
 
   test("function trunc") {
@@ -477,6 +490,35 @@ class DateFunctionsSuite extends QueryTest with SharedSQLContext {
     checkAnswer(df.selectExpr(s"unix_timestamp(s, '$fmt')"), Seq(
       Row(ts1.getTime / 1000L), Row(ts2.getTime / 1000L)))
 
+    val x1 = "2015-07-24 10:00:00"
+    val x2 = "2015-25-07 02:02:02"
+    val x3 = "2015-07-24 25:02:02"
+    val x4 = "2015-24-07 26:02:02"
+    val ts3 = Timestamp.valueOf("2015-07-24 02:25:02")
+    val ts4 = Timestamp.valueOf("2015-07-24 00:10:00")
+
+    val df1 = Seq(x1, x2, x3, x4).toDF("x")
+    checkAnswer(df1.select(unix_timestamp(col("x"))), Seq(
+      Row(ts1.getTime / 1000L), Row(null), Row(null), Row(null)))
+    checkAnswer(df1.selectExpr("unix_timestamp(x)"), Seq(
+      Row(ts1.getTime / 1000L), Row(null), Row(null), Row(null)))
+    checkAnswer(df1.select(unix_timestamp(col("x"), "yyyy-dd-MM HH:mm:ss")), Seq(
+      Row(null), Row(ts2.getTime / 1000L), Row(null), Row(null)))
+    checkAnswer(df1.selectExpr(s"unix_timestamp(x, 'yyyy-MM-dd mm:HH:ss')"), Seq(
+      Row(ts4.getTime / 1000L), Row(null), Row(ts3.getTime / 1000L), Row(null)))
+
+    // invalid format
+    checkAnswer(df1.selectExpr(s"unix_timestamp(x, 'yyyy-MM-dd aa:HH:ss')"), Seq(
+      Row(null), Row(null), Row(null), Row(null)))
+
+    // february
+    val y1 = "2016-02-29"
+    val y2 = "2017-02-29"
+    val ts5 = Timestamp.valueOf("2016-02-29 00:00:00")
+    val df2 = Seq(y1, y2).toDF("y")
+    checkAnswer(df2.select(unix_timestamp(col("y"), "yyyy-MM-dd")), Seq(
+      Row(ts5.getTime / 1000L), Row(null)))
+
     val now = sql("select unix_timestamp()").collect().head.getLong(0)
     checkAnswer(sql(s"select cast ($now as timestamp)"), Row(new java.util.Date(now * 1000)))
   }
@@ -500,6 +542,31 @@ class DateFunctionsSuite extends QueryTest with SharedSQLContext {
       Row(date1.getTime / 1000L), Row(date2.getTime / 1000L)))
     checkAnswer(df.selectExpr(s"to_unix_timestamp(s, '$fmt')"), Seq(
       Row(ts1.getTime / 1000L), Row(ts2.getTime / 1000L)))
+
+    val x1 = "2015-07-24 10:00:00"
+    val x2 = "2015-25-07 02:02:02"
+    val x3 = "2015-07-24 25:02:02"
+    val x4 = "2015-24-07 26:02:02"
+    val ts3 = Timestamp.valueOf("2015-07-24 02:25:02")
+    val ts4 = Timestamp.valueOf("2015-07-24 00:10:00")
+
+    val df1 = Seq(x1, x2, x3, x4).toDF("x")
+    checkAnswer(df1.selectExpr("to_unix_timestamp(x)"), Seq(
+      Row(ts1.getTime / 1000L), Row(null), Row(null), Row(null)))
+    checkAnswer(df1.selectExpr(s"to_unix_timestamp(x, 'yyyy-MM-dd mm:HH:ss')"), Seq(
+      Row(ts4.getTime / 1000L), Row(null), Row(ts3.getTime / 1000L), Row(null)))
+
+    // february
+    val y1 = "2016-02-29"
+    val y2 = "2017-02-29"
+    val ts5 = Timestamp.valueOf("2016-02-29 00:00:00")
+    val df2 = Seq(y1, y2).toDF("y")
+    checkAnswer(df2.select(unix_timestamp(col("y"), "yyyy-MM-dd")), Seq(
+      Row(ts5.getTime / 1000L), Row(null)))
+
+    // invalid format
+    checkAnswer(df1.selectExpr(s"to_unix_timestamp(x, 'yyyy-MM-dd bb:HH:ss')"), Seq(
+      Row(null), Row(null), Row(null), Row(null)))
   }
 
 

