You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by do...@apache.org on 2019/07/11 01:12:23 UTC
[spark] branch master updated: [SPARK-28015][SQL] Check
stringToDate() consumes entire input for the yyyy and yyyy-[m]m formats
This is an automated email from the ASF dual-hosted git repository.
dongjoon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 6532153 [SPARK-28015][SQL] Check stringToDate() consumes entire input for the yyyy and yyyy-[m]m formats
6532153 is described below
commit 653215377adfd1ff873a15175b2c44cbbf6df7a9
Author: Maxim Gekk <ma...@databricks.com>
AuthorDate: Wed Jul 10 18:12:03 2019 -0700
[SPARK-28015][SQL] Check stringToDate() consumes entire input for the yyyy and yyyy-[m]m formats
## What changes were proposed in this pull request?
Fix `stringToDate()` for the formats `yyyy` and `yyyy-[m]m`, which assumed there are no additional chars after the last components `yyyy` and `[m]m`. In the PR, I propose to check that the entire input was consumed for these formats.
After the fix, the input `1999 08 01` will be invalid because it matches the pattern `yyyy` but the string contains additional chars ` 08 01`.
In Spark 1.6.3 through 2.4.3, the behavior is the same:
```
spark-sql> SELECT CAST('1999 08 01' AS DATE);
1999-01-01
```
This PR makes it return NULL like Hive.
```
spark-sql> SELECT CAST('1999 08 01' AS DATE);
NULL
```
## How was this patch tested?
Added new checks to `DateTimeUtilsSuite` for the `1999 08 01` and `1999 08` inputs.
Closes #25097 from MaxGekk/spark-28015-invalid-date-format.
Authored-by: Maxim Gekk <ma...@databricks.com>
Signed-off-by: Dongjoon Hyun <dh...@apple.com>
---
.../spark/sql/catalyst/util/DateTimeUtils.scala | 4 +
.../sql/catalyst/util/DateTimeUtilsSuite.scala | 6 ++
.../resources/sql-tests/results/pgSQL/date.sql.out | 120 ++++++++++++++++-----
3 files changed, 106 insertions(+), 24 deletions(-)
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
index 50fa6fb..63e778a 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
@@ -406,6 +406,10 @@ object DateTimeUtils {
// year should have exact four digits
return None
}
+ if (i < 2 && j < bytes.length) {
+ // For the `yyyy` and `yyyy-[m]m` formats, entire input must be consumed.
+ return None
+ }
segments(i) = currentSegmentValue
try {
val localDate = LocalDate.of(segments(0), segments(1), segments(2))
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala
index 2de9c41..c77c9ae 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala
@@ -138,6 +138,9 @@ class DateTimeUtilsSuite extends SparkFunSuite {
assert(stringToDate(UTF8String.fromString("015-03-18")).isEmpty)
assert(stringToDate(UTF8String.fromString("015")).isEmpty)
assert(stringToDate(UTF8String.fromString("02015")).isEmpty)
+ assert(stringToDate(UTF8String.fromString("1999 08 01")).isEmpty)
+ assert(stringToDate(UTF8String.fromString("1999-08 01")).isEmpty)
+ assert(stringToDate(UTF8String.fromString("1999 08")).isEmpty)
}
test("string to timestamp") {
@@ -242,6 +245,9 @@ class DateTimeUtilsSuite extends SparkFunSuite {
checkStringToTimestamp("2015-03-18T12:03.17-20:0", None)
checkStringToTimestamp("2015-03-18T12:03.17-0:70", None)
checkStringToTimestamp("2015-03-18T12:03.17-1:0:0", None)
+ checkStringToTimestamp("1999 08 01", None)
+ checkStringToTimestamp("1999-08 01", None)
+ checkStringToTimestamp("1999 08", None)
// Truncating the fractional seconds
timeZone = TimeZone.getTimeZone("GMT+00:00")
diff --git a/sql/core/src/test/resources/sql-tests/results/pgSQL/date.sql.out b/sql/core/src/test/resources/sql-tests/results/pgSQL/date.sql.out
index a0630b9..0d669ae 100644
--- a/sql/core/src/test/resources/sql-tests/results/pgSQL/date.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/pgSQL/date.sql.out
@@ -198,17 +198,29 @@ struct<DATE '1999-01-18':date>
-- !query 21
SELECT date '1999 Jan 08'
-- !query 21 schema
-struct<DATE '1999-01-01':date>
+struct<>
-- !query 21 output
-1999-01-01
+org.apache.spark.sql.catalyst.parser.ParseException
+
+Cannot parse the DATE value: 1999 Jan 08(line 1, pos 7)
+
+== SQL ==
+SELECT date '1999 Jan 08'
+-------^^^
-- !query 22
SELECT date '1999 08 Jan'
-- !query 22 schema
-struct<DATE '1999-01-01':date>
+struct<>
-- !query 22 output
-1999-01-01
+org.apache.spark.sql.catalyst.parser.ParseException
+
+Cannot parse the DATE value: 1999 08 Jan(line 1, pos 7)
+
+== SQL ==
+SELECT date '1999 08 Jan'
+-------^^^
-- !query 23
@@ -230,17 +242,29 @@ struct<DATE '1999-08-01':date>
-- !query 25
SELECT date '1999 01 08'
-- !query 25 schema
-struct<DATE '1999-01-01':date>
+struct<>
-- !query 25 output
-1999-01-01
+org.apache.spark.sql.catalyst.parser.ParseException
+
+Cannot parse the DATE value: 1999 01 08(line 1, pos 7)
+
+== SQL ==
+SELECT date '1999 01 08'
+-------^^^
-- !query 26
SELECT date '1999 08 01'
-- !query 26 schema
-struct<DATE '1999-01-01':date>
+struct<>
-- !query 26 output
-1999-01-01
+org.apache.spark.sql.catalyst.parser.ParseException
+
+Cannot parse the DATE value: 1999 08 01(line 1, pos 7)
+
+== SQL ==
+SELECT date '1999 08 01'
+-------^^^
-- !query 27
@@ -254,17 +278,29 @@ struct<DATE '1999-01-08':date>
-- !query 28
SELECT date '1999 Jan 08'
-- !query 28 schema
-struct<DATE '1999-01-01':date>
+struct<>
-- !query 28 output
-1999-01-01
+org.apache.spark.sql.catalyst.parser.ParseException
+
+Cannot parse the DATE value: 1999 Jan 08(line 1, pos 7)
+
+== SQL ==
+SELECT date '1999 Jan 08'
+-------^^^
-- !query 29
SELECT date '1999 08 Jan'
-- !query 29 schema
-struct<DATE '1999-01-01':date>
+struct<>
-- !query 29 output
-1999-01-01
+org.apache.spark.sql.catalyst.parser.ParseException
+
+Cannot parse the DATE value: 1999 08 Jan(line 1, pos 7)
+
+== SQL ==
+SELECT date '1999 08 Jan'
+-------^^^
-- !query 30
@@ -286,17 +322,29 @@ struct<DATE '1999-08-01':date>
-- !query 32
SELECT date '1999 01 08'
-- !query 32 schema
-struct<DATE '1999-01-01':date>
+struct<>
-- !query 32 output
-1999-01-01
+org.apache.spark.sql.catalyst.parser.ParseException
+
+Cannot parse the DATE value: 1999 01 08(line 1, pos 7)
+
+== SQL ==
+SELECT date '1999 01 08'
+-------^^^
-- !query 33
SELECT date '1999 08 01'
-- !query 33 schema
-struct<DATE '1999-01-01':date>
+struct<>
-- !query 33 output
-1999-01-01
+org.apache.spark.sql.catalyst.parser.ParseException
+
+Cannot parse the DATE value: 1999 08 01(line 1, pos 7)
+
+== SQL ==
+SELECT date '1999 08 01'
+-------^^^
-- !query 34
@@ -318,17 +366,29 @@ struct<DATE '1999-01-18':date>
-- !query 36
SELECT date '1999 Jan 08'
-- !query 36 schema
-struct<DATE '1999-01-01':date>
+struct<>
-- !query 36 output
-1999-01-01
+org.apache.spark.sql.catalyst.parser.ParseException
+
+Cannot parse the DATE value: 1999 Jan 08(line 1, pos 7)
+
+== SQL ==
+SELECT date '1999 Jan 08'
+-------^^^
-- !query 37
SELECT date '1999 08 Jan'
-- !query 37 schema
-struct<DATE '1999-01-01':date>
+struct<>
-- !query 37 output
-1999-01-01
+org.apache.spark.sql.catalyst.parser.ParseException
+
+Cannot parse the DATE value: 1999 08 Jan(line 1, pos 7)
+
+== SQL ==
+SELECT date '1999 08 Jan'
+-------^^^
-- !query 38
@@ -350,17 +410,29 @@ struct<DATE '1999-08-01':date>
-- !query 40
SELECT date '1999 01 08'
-- !query 40 schema
-struct<DATE '1999-01-01':date>
+struct<>
-- !query 40 output
-1999-01-01
+org.apache.spark.sql.catalyst.parser.ParseException
+
+Cannot parse the DATE value: 1999 01 08(line 1, pos 7)
+
+== SQL ==
+SELECT date '1999 01 08'
+-------^^^
-- !query 41
SELECT date '1999 08 01'
-- !query 41 schema
-struct<DATE '1999-01-01':date>
+struct<>
-- !query 41 output
-1999-01-01
+org.apache.spark.sql.catalyst.parser.ParseException
+
+Cannot parse the DATE value: 1999 08 01(line 1, pos 7)
+
+== SQL ==
+SELECT date '1999 08 01'
+-------^^^
-- !query 42
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org