You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by do...@apache.org on 2019/07/11 01:12:23 UTC

[spark] branch master updated: [SPARK-28015][SQL] Check stringToDate() consumes entire input for the yyyy and yyyy-[m]m formats

This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new 6532153  [SPARK-28015][SQL] Check stringToDate() consumes entire input for the yyyy and yyyy-[m]m formats
6532153 is described below

commit 653215377adfd1ff873a15175b2c44cbbf6df7a9
Author: Maxim Gekk <ma...@databricks.com>
AuthorDate: Wed Jul 10 18:12:03 2019 -0700

    [SPARK-28015][SQL] Check stringToDate() consumes entire input for the yyyy and yyyy-[m]m formats
    
    ## What changes were proposed in this pull request?
    
    Fix `stringToDate()` for the formats `yyyy` and `yyyy-[m]m`, which assume there are no additional chars after the last components `yyyy` and `[m]m`. In the PR, I propose to check that the entire input was consumed for these formats.
    
    After the fix, the input `1999 08 01` will be invalid because it matches the pattern `yyyy` but the string contains additional chars ` 08 01`.
    
    In Spark 1.6.3 through 2.4.3, the behavior is the same:
    ```
    spark-sql> SELECT CAST('1999 08 01' AS DATE);
    1999-01-01
    ```
    
    This PR makes it return NULL like Hive.
    ```
    spark-sql> SELECT CAST('1999 08 01' AS DATE);
    NULL
    ```
    
    ## How was this patch tested?
    
    Added new checks to `DateTimeUtilsSuite` for the `1999 08 01` and `1999 08` inputs.
    
    Closes #25097 from MaxGekk/spark-28015-invalid-date-format.
    
    Authored-by: Maxim Gekk <ma...@databricks.com>
    Signed-off-by: Dongjoon Hyun <dh...@apple.com>
---
 .../spark/sql/catalyst/util/DateTimeUtils.scala    |   4 +
 .../sql/catalyst/util/DateTimeUtilsSuite.scala     |   6 ++
 .../resources/sql-tests/results/pgSQL/date.sql.out | 120 ++++++++++++++++-----
 3 files changed, 106 insertions(+), 24 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
index 50fa6fb..63e778a 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
@@ -406,6 +406,10 @@ object DateTimeUtils {
       // year should have exact four digits
       return None
     }
+    if (i < 2 && j < bytes.length) {
+      // For the `yyyy` and `yyyy-[m]m` formats, entire input must be consumed.
+      return None
+    }
     segments(i) = currentSegmentValue
     try {
       val localDate = LocalDate.of(segments(0), segments(1), segments(2))
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala
index 2de9c41..c77c9ae 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala
@@ -138,6 +138,9 @@ class DateTimeUtilsSuite extends SparkFunSuite {
     assert(stringToDate(UTF8String.fromString("015-03-18")).isEmpty)
     assert(stringToDate(UTF8String.fromString("015")).isEmpty)
     assert(stringToDate(UTF8String.fromString("02015")).isEmpty)
+    assert(stringToDate(UTF8String.fromString("1999 08 01")).isEmpty)
+    assert(stringToDate(UTF8String.fromString("1999-08 01")).isEmpty)
+    assert(stringToDate(UTF8String.fromString("1999 08")).isEmpty)
   }
 
   test("string to timestamp") {
@@ -242,6 +245,9 @@ class DateTimeUtilsSuite extends SparkFunSuite {
       checkStringToTimestamp("2015-03-18T12:03.17-20:0", None)
       checkStringToTimestamp("2015-03-18T12:03.17-0:70", None)
       checkStringToTimestamp("2015-03-18T12:03.17-1:0:0", None)
+      checkStringToTimestamp("1999 08 01", None)
+      checkStringToTimestamp("1999-08 01", None)
+      checkStringToTimestamp("1999 08", None)
 
       // Truncating the fractional seconds
       timeZone = TimeZone.getTimeZone("GMT+00:00")
diff --git a/sql/core/src/test/resources/sql-tests/results/pgSQL/date.sql.out b/sql/core/src/test/resources/sql-tests/results/pgSQL/date.sql.out
index a0630b9..0d669ae 100644
--- a/sql/core/src/test/resources/sql-tests/results/pgSQL/date.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/pgSQL/date.sql.out
@@ -198,17 +198,29 @@ struct<DATE '1999-01-18':date>
 -- !query 21
 SELECT date '1999 Jan 08'
 -- !query 21 schema
-struct<DATE '1999-01-01':date>
+struct<>
 -- !query 21 output
-1999-01-01
+org.apache.spark.sql.catalyst.parser.ParseException
+
+Cannot parse the DATE value: 1999 Jan 08(line 1, pos 7)
+
+== SQL ==
+SELECT date '1999 Jan 08'
+-------^^^
 
 
 -- !query 22
 SELECT date '1999 08 Jan'
 -- !query 22 schema
-struct<DATE '1999-01-01':date>
+struct<>
 -- !query 22 output
-1999-01-01
+org.apache.spark.sql.catalyst.parser.ParseException
+
+Cannot parse the DATE value: 1999 08 Jan(line 1, pos 7)
+
+== SQL ==
+SELECT date '1999 08 Jan'
+-------^^^
 
 
 -- !query 23
@@ -230,17 +242,29 @@ struct<DATE '1999-08-01':date>
 -- !query 25
 SELECT date '1999 01 08'
 -- !query 25 schema
-struct<DATE '1999-01-01':date>
+struct<>
 -- !query 25 output
-1999-01-01
+org.apache.spark.sql.catalyst.parser.ParseException
+
+Cannot parse the DATE value: 1999 01 08(line 1, pos 7)
+
+== SQL ==
+SELECT date '1999 01 08'
+-------^^^
 
 
 -- !query 26
 SELECT date '1999 08 01'
 -- !query 26 schema
-struct<DATE '1999-01-01':date>
+struct<>
 -- !query 26 output
-1999-01-01
+org.apache.spark.sql.catalyst.parser.ParseException
+
+Cannot parse the DATE value: 1999 08 01(line 1, pos 7)
+
+== SQL ==
+SELECT date '1999 08 01'
+-------^^^
 
 
 -- !query 27
@@ -254,17 +278,29 @@ struct<DATE '1999-01-08':date>
 -- !query 28
 SELECT date '1999 Jan 08'
 -- !query 28 schema
-struct<DATE '1999-01-01':date>
+struct<>
 -- !query 28 output
-1999-01-01
+org.apache.spark.sql.catalyst.parser.ParseException
+
+Cannot parse the DATE value: 1999 Jan 08(line 1, pos 7)
+
+== SQL ==
+SELECT date '1999 Jan 08'
+-------^^^
 
 
 -- !query 29
 SELECT date '1999 08 Jan'
 -- !query 29 schema
-struct<DATE '1999-01-01':date>
+struct<>
 -- !query 29 output
-1999-01-01
+org.apache.spark.sql.catalyst.parser.ParseException
+
+Cannot parse the DATE value: 1999 08 Jan(line 1, pos 7)
+
+== SQL ==
+SELECT date '1999 08 Jan'
+-------^^^
 
 
 -- !query 30
@@ -286,17 +322,29 @@ struct<DATE '1999-08-01':date>
 -- !query 32
 SELECT date '1999 01 08'
 -- !query 32 schema
-struct<DATE '1999-01-01':date>
+struct<>
 -- !query 32 output
-1999-01-01
+org.apache.spark.sql.catalyst.parser.ParseException
+
+Cannot parse the DATE value: 1999 01 08(line 1, pos 7)
+
+== SQL ==
+SELECT date '1999 01 08'
+-------^^^
 
 
 -- !query 33
 SELECT date '1999 08 01'
 -- !query 33 schema
-struct<DATE '1999-01-01':date>
+struct<>
 -- !query 33 output
-1999-01-01
+org.apache.spark.sql.catalyst.parser.ParseException
+
+Cannot parse the DATE value: 1999 08 01(line 1, pos 7)
+
+== SQL ==
+SELECT date '1999 08 01'
+-------^^^
 
 
 -- !query 34
@@ -318,17 +366,29 @@ struct<DATE '1999-01-18':date>
 -- !query 36
 SELECT date '1999 Jan 08'
 -- !query 36 schema
-struct<DATE '1999-01-01':date>
+struct<>
 -- !query 36 output
-1999-01-01
+org.apache.spark.sql.catalyst.parser.ParseException
+
+Cannot parse the DATE value: 1999 Jan 08(line 1, pos 7)
+
+== SQL ==
+SELECT date '1999 Jan 08'
+-------^^^
 
 
 -- !query 37
 SELECT date '1999 08 Jan'
 -- !query 37 schema
-struct<DATE '1999-01-01':date>
+struct<>
 -- !query 37 output
-1999-01-01
+org.apache.spark.sql.catalyst.parser.ParseException
+
+Cannot parse the DATE value: 1999 08 Jan(line 1, pos 7)
+
+== SQL ==
+SELECT date '1999 08 Jan'
+-------^^^
 
 
 -- !query 38
@@ -350,17 +410,29 @@ struct<DATE '1999-08-01':date>
 -- !query 40
 SELECT date '1999 01 08'
 -- !query 40 schema
-struct<DATE '1999-01-01':date>
+struct<>
 -- !query 40 output
-1999-01-01
+org.apache.spark.sql.catalyst.parser.ParseException
+
+Cannot parse the DATE value: 1999 01 08(line 1, pos 7)
+
+== SQL ==
+SELECT date '1999 01 08'
+-------^^^
 
 
 -- !query 41
 SELECT date '1999 08 01'
 -- !query 41 schema
-struct<DATE '1999-01-01':date>
+struct<>
 -- !query 41 output
-1999-01-01
+org.apache.spark.sql.catalyst.parser.ParseException
+
+Cannot parse the DATE value: 1999 08 01(line 1, pos 7)
+
+== SQL ==
+SELECT date '1999 08 01'
+-------^^^
 
 
 -- !query 42


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org