You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by gu...@apache.org on 2020/05/24 05:29:23 UTC
[spark] branch branch-3.0 updated: [SPARK-31755][SQL][3.0] allow
missing year/hour when parsing date/timestamp string
This is an automated email from the ASF dual-hosted git repository.
gurwls223 pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.0 by this push:
new 576c224 [SPARK-31755][SQL][3.0] allow missing year/hour when parsing date/timestamp string
576c224 is described below
commit 576c22442c48d26780ca4e335a981c3215e09103
Author: Wenchen Fan <we...@databricks.com>
AuthorDate: Sun May 24 14:27:16 2020 +0900
[SPARK-31755][SQL][3.0] allow missing year/hour when parsing date/timestamp string
### What changes were proposed in this pull request?
This PR allows the year and hour fields to be missing when parsing date/timestamp strings, using 1970 as the default year and 0 as the default hour.
### Why are the changes needed?
To keep backward compatibility with Spark 2.4.
### Does this PR introduce _any_ user-facing change?
Yes.
Spark 2.4:
```
scala> sql("select to_timestamp('16', 'dd')").show
+------------------------+
|to_timestamp('16', 'dd')|
+------------------------+
| 1970-01-16 00:00:00|
+------------------------+
scala> sql("select to_date('16', 'dd')").show
+-------------------+
|to_date('16', 'dd')|
+-------------------+
| 1970-01-16|
+-------------------+
scala> sql("select to_timestamp('2019 40', 'yyyy mm')").show
+----------------------------------+
|to_timestamp('2019 40', 'yyyy mm')|
+----------------------------------+
| 2019-01-01 00:40:00|
+----------------------------------+
scala> sql("select to_timestamp('2019 10:10:10', 'yyyy hh:mm:ss')").show
+----------------------------------------------+
|to_timestamp('2019 10:10:10', 'yyyy hh:mm:ss')|
+----------------------------------------------+
| 2019-01-01 10:10:10|
+----------------------------------------------+
```
In branch-3.0 (before this PR):
```
scala> sql("select to_timestamp('16', 'dd')").show
+--------------------+
|to_timestamp(16, dd)|
+--------------------+
| null|
+--------------------+
scala> sql("select to_date('16', 'dd')").show
+---------------+
|to_date(16, dd)|
+---------------+
| null|
+---------------+
scala> sql("select to_timestamp('2019 40', 'yyyy mm')").show
+------------------------------+
|to_timestamp(2019 40, yyyy mm)|
+------------------------------+
| 2019-01-01 00:00:00|
+------------------------------+
scala> sql("select to_timestamp('2019 10:10:10', 'yyyy hh:mm:ss')").show
+------------------------------------------+
|to_timestamp(2019 10:10:10, yyyy hh:mm:ss)|
+------------------------------------------+
| 2019-01-01 00:00:00|
+------------------------------------------+
```
After this PR, the behavior becomes the same as 2.4.
### How was this patch tested?
new tests
Closes #28612 from cloud-fan/backport.
Authored-by: Wenchen Fan <we...@databricks.com>
Signed-off-by: HyukjinKwon <gu...@apache.org>
---
.../spark/sql/catalyst/util/DateFormatter.scala | 2 +-
.../catalyst/util/DateTimeFormatterHelper.scala | 77 ++++++++----
.../sql/catalyst/csv/UnivocityParserSuite.scala | 14 +--
.../sql/catalyst/util/DateTimeTestUtils.scala | 8 +-
.../sql/catalyst/util/DateTimeUtilsSuite.scala | 4 +-
.../apache/spark/sql/util/DateFormatterSuite.scala | 46 ++++---
.../spark/sql/util/TimestampFormatterSuite.scala | 139 ++++++++++++++++-----
.../test/resources/sql-tests/inputs/datetime.sql | 12 +-
.../resources/sql-tests/inputs/json-functions.sql | 15 +++
.../sql-tests/results/ansi/datetime.sql.out | 50 +++++++-
.../resources/sql-tests/results/datetime.sql.out | 50 +++++++-
.../sql-tests/results/json-functions.sql.out | 45 ++++++-
12 files changed, 370 insertions(+), 92 deletions(-)
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateFormatter.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateFormatter.scala
index 7d94955..8261f57 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateFormatter.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateFormatter.scala
@@ -53,7 +53,7 @@ class Iso8601DateFormatter(
val specialDate = convertSpecialDate(s.trim, zoneId)
specialDate.getOrElse {
try {
- val localDate = LocalDate.parse(s, formatter)
+ val localDate = toLocalDate(formatter.parse(s))
localDateToDays(localDate)
} catch checkDiffResult(s, legacyFormatter.parse)
}
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeFormatterHelper.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeFormatterHelper.scala
index 05ec23f..35f95db 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeFormatterHelper.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeFormatterHelper.scala
@@ -19,7 +19,7 @@ package org.apache.spark.sql.catalyst.util
import java.time._
import java.time.chrono.IsoChronology
-import java.time.format.{DateTimeFormatter, DateTimeFormatterBuilder, DateTimeParseException, ResolverStyle}
+import java.time.format.{DateTimeFormatter, DateTimeFormatterBuilder, ResolverStyle}
import java.time.temporal.{ChronoField, TemporalAccessor, TemporalQueries}
import java.util.Locale
@@ -31,17 +31,52 @@ import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.internal.SQLConf.LegacyBehaviorPolicy._
trait DateTimeFormatterHelper {
+ private def getOrDefault(accessor: TemporalAccessor, field: ChronoField, default: Int): Int = {
+ if (accessor.isSupported(field)) {
+ accessor.get(field)
+ } else {
+ default
+ }
+ }
+
+ protected def toLocalDate(accessor: TemporalAccessor): LocalDate = {
+ val localDate = accessor.query(TemporalQueries.localDate())
+ // If all the date fields are specified, return the local date directly.
+ if (localDate != null) return localDate
+
+ // Users may want to parse only a few datetime fields from a string and extract these fields
+ // later, and we should provide default values for missing fields.
+ // To be compatible with Spark 2.4, we pick 1970 as the default value of year.
+ val year = getOrDefault(accessor, ChronoField.YEAR, 1970)
+ val month = getOrDefault(accessor, ChronoField.MONTH_OF_YEAR, 1)
+ val day = getOrDefault(accessor, ChronoField.DAY_OF_MONTH, 1)
+ LocalDate.of(year, month, day)
+ }
+
+ private def toLocalTime(accessor: TemporalAccessor): LocalTime = {
+ val localTime = accessor.query(TemporalQueries.localTime())
+ // If all the time fields are specified, return the local time directly.
+ if (localTime != null) return localTime
+
+ val hour = if (accessor.isSupported(ChronoField.HOUR_OF_DAY)) {
+ accessor.get(ChronoField.HOUR_OF_DAY)
+ } else if (accessor.isSupported(ChronoField.HOUR_OF_AMPM)) {
+ // When we reach here, it means am/pm is not specified. Here we assume it's am.
+ accessor.get(ChronoField.HOUR_OF_AMPM)
+ } else {
+ 0
+ }
+ val minute = getOrDefault(accessor, ChronoField.MINUTE_OF_HOUR, 0)
+ val second = getOrDefault(accessor, ChronoField.SECOND_OF_MINUTE, 0)
+ val nanoSecond = getOrDefault(accessor, ChronoField.NANO_OF_SECOND, 0)
+ LocalTime.of(hour, minute, second, nanoSecond)
+ }
+
// Converts the parsed temporal object to ZonedDateTime. It sets time components to zeros
// if they does not exist in the parsed object.
- protected def toZonedDateTime(
- temporalAccessor: TemporalAccessor,
- zoneId: ZoneId): ZonedDateTime = {
- // Parsed input might not have time related part. In that case, time component is set to zeros.
- val parsedLocalTime = temporalAccessor.query(TemporalQueries.localTime)
- val localTime = if (parsedLocalTime == null) LocalTime.MIDNIGHT else parsedLocalTime
- // Parsed input must have date component. At least, year must present in temporalAccessor.
- val localDate = temporalAccessor.query(TemporalQueries.localDate)
-
+ protected def toZonedDateTime(accessor: TemporalAccessor, zoneId: ZoneId): ZonedDateTime = {
+ val localDate = toLocalDate(accessor)
+ val localTime = toLocalTime(accessor)
ZonedDateTime.of(localDate, localTime, zoneId)
}
@@ -72,19 +107,15 @@ trait DateTimeFormatterHelper {
// DateTimeParseException will address by the caller side.
protected def checkDiffResult[T](
s: String, legacyParseFunc: String => T): PartialFunction[Throwable, T] = {
- case e: DateTimeParseException if SQLConf.get.legacyTimeParserPolicy == EXCEPTION =>
- val res = try {
- Some(legacyParseFunc(s))
+ case e: DateTimeException if SQLConf.get.legacyTimeParserPolicy == EXCEPTION =>
+ try {
+ legacyParseFunc(s)
} catch {
- case _: Throwable => None
- }
- if (res.nonEmpty) {
- throw new SparkUpgradeException("3.0", s"Fail to parse '$s' in the new parser. You can " +
- s"set ${SQLConf.LEGACY_TIME_PARSER_POLICY.key} to LEGACY to restore the behavior " +
- s"before Spark 3.0, or set to CORRECTED and treat it as an invalid datetime string.", e)
- } else {
- throw e
+ case _: Throwable => throw e
}
+ throw new SparkUpgradeException("3.0", s"Fail to parse '$s' in the new parser. You can " +
+ s"set ${SQLConf.LEGACY_TIME_PARSER_POLICY.key} to LEGACY to restore the behavior " +
+ s"before Spark 3.0, or set to CORRECTED and treat it as an invalid datetime string.", e)
}
}
@@ -101,10 +132,6 @@ private object DateTimeFormatterHelper {
def toFormatter(builder: DateTimeFormatterBuilder, locale: Locale): DateTimeFormatter = {
builder
- .parseDefaulting(ChronoField.MONTH_OF_YEAR, 1)
- .parseDefaulting(ChronoField.DAY_OF_MONTH, 1)
- .parseDefaulting(ChronoField.MINUTE_OF_HOUR, 0)
- .parseDefaulting(ChronoField.SECOND_OF_MINUTE, 0)
.toFormatter(locale)
.withChronology(IsoChronology.INSTANCE)
.withResolverStyle(ResolverStyle.STRICT)
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/csv/UnivocityParserSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/csv/UnivocityParserSuite.scala
index 4853b4f..474bb53 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/csv/UnivocityParserSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/csv/UnivocityParserSuite.scala
@@ -325,7 +325,7 @@ class UnivocityParserSuite extends SparkFunSuite with SQLHelper {
assert(parser.makeConverter("t", TimestampType).apply("2020-1-12 12:3:45") ==
date(2020, 1, 12, 12, 3, 45, 0))
assert(parser.makeConverter("t", DateType).apply("2020-1-12") ==
- days(2020, 1, 12, 0, 0, 0))
+ days(2020, 1, 12))
// The legacy format allows arbitrary length of second fraction.
assert(parser.makeConverter("t", TimestampType).apply("2020-1-12 12:3:45.1") ==
date(2020, 1, 12, 12, 3, 45, 100000))
@@ -333,22 +333,22 @@ class UnivocityParserSuite extends SparkFunSuite with SQLHelper {
date(2020, 1, 12, 12, 3, 45, 123400))
// The legacy format allow date string to end with T or space, with arbitrary string
assert(parser.makeConverter("t", DateType).apply("2020-1-12T") ==
- days(2020, 1, 12, 0, 0, 0))
+ days(2020, 1, 12))
assert(parser.makeConverter("t", DateType).apply("2020-1-12Txyz") ==
- days(2020, 1, 12, 0, 0, 0))
+ days(2020, 1, 12))
assert(parser.makeConverter("t", DateType).apply("2020-1-12 ") ==
- days(2020, 1, 12, 0, 0, 0))
+ days(2020, 1, 12))
assert(parser.makeConverter("t", DateType).apply("2020-1-12 xyz") ==
- days(2020, 1, 12, 0, 0, 0))
+ days(2020, 1, 12))
// The legacy format ignores the "GMT" from the string
assert(parser.makeConverter("t", TimestampType).apply("2020-1-12 12:3:45GMT") ==
date(2020, 1, 12, 12, 3, 45, 0))
assert(parser.makeConverter("t", TimestampType).apply("GMT2020-1-12 12:3:45") ==
date(2020, 1, 12, 12, 3, 45, 0))
assert(parser.makeConverter("t", DateType).apply("2020-1-12GMT") ==
- days(2020, 1, 12, 0, 0, 0))
+ days(2020, 1, 12))
assert(parser.makeConverter("t", DateType).apply("GMT2020-1-12") ==
- days(2020, 1, 12, 0, 0, 0))
+ days(2020, 1, 12))
}
val options = new CSVOptions(Map.empty[String, String], false, "UTC")
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeTestUtils.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeTestUtils.scala
index bf9e8f7..66aef1b 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeTestUtils.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeTestUtils.scala
@@ -88,12 +88,8 @@ object DateTimeTestUtils {
def days(
year: Int,
month: Byte = 1,
- day: Byte = 1,
- hour: Byte = 0,
- minute: Byte = 0,
- sec: Byte = 0): Int = {
- val micros = date(year, month, day, hour, minute, sec)
- TimeUnit.MICROSECONDS.toDays(micros).toInt
+ day: Byte = 1): Int = {
+ LocalDate.of(year, month, day).toEpochDay.toInt
}
// Returns microseconds since epoch for current date and give time
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala
index b547c44..d526ae1 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala
@@ -386,13 +386,13 @@ class DateTimeUtilsSuite extends SparkFunSuite with Matchers with SQLHelper {
}
test("date add months") {
- val input = days(1997, 2, 28, 10, 30)
+ val input = days(1997, 2, 28)
assert(dateAddMonths(input, 36) === days(2000, 2, 28))
assert(dateAddMonths(input, -13) === days(1996, 1, 28))
}
test("date add interval with day precision") {
- val input = days(1997, 2, 28, 10, 30)
+ val input = days(1997, 2, 28)
assert(dateAddInterval(input, new CalendarInterval(36, 0, 0)) === days(2000, 2, 28))
assert(dateAddInterval(input, new CalendarInterval(36, 47, 0)) === days(2000, 4, 15))
assert(dateAddInterval(input, new CalendarInterval(-13, 0, 0)) === days(1996, 1, 28))
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/util/DateFormatterSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/util/DateFormatterSuite.scala
index 3954b9b..7d503cc 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/util/DateFormatterSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/util/DateFormatterSuite.scala
@@ -17,18 +17,19 @@
package org.apache.spark.sql.util
-import java.time.{DateTimeException, LocalDate, ZoneOffset}
+import java.time.{DateTimeException, LocalDate}
import org.apache.spark.{SparkFunSuite, SparkUpgradeException}
import org.apache.spark.sql.catalyst.plans.SQLHelper
-import org.apache.spark.sql.catalyst.util._
+import org.apache.spark.sql.catalyst.util.{DateFormatter, LegacyDateFormats}
+import org.apache.spark.sql.catalyst.util.DateTimeTestUtils._
import org.apache.spark.sql.catalyst.util.DateTimeUtils._
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.internal.SQLConf.LegacyBehaviorPolicy
class DateFormatterSuite extends SparkFunSuite with SQLHelper {
test("parsing dates") {
- DateTimeTestUtils.outstandingTimezonesIds.foreach { timeZone =>
+ outstandingTimezonesIds.foreach { timeZone =>
withSQLConf(SQLConf.SESSION_LOCAL_TIMEZONE.key -> timeZone) {
val formatter = DateFormatter(getZoneId(timeZone))
val daysSinceEpoch = formatter.parse("2018-12-02")
@@ -38,7 +39,7 @@ class DateFormatterSuite extends SparkFunSuite with SQLHelper {
}
test("format dates") {
- DateTimeTestUtils.outstandingTimezonesIds.foreach { timeZone =>
+ outstandingTimezonesIds.foreach { timeZone =>
withSQLConf(SQLConf.SESSION_LOCAL_TIMEZONE.key -> timeZone) {
val formatter = DateFormatter(getZoneId(timeZone))
val (days, expected) = (17867, "2018-12-02")
@@ -65,7 +66,7 @@ class DateFormatterSuite extends SparkFunSuite with SQLHelper {
"2018-12-12",
"2038-01-01",
"5010-11-17").foreach { date =>
- DateTimeTestUtils.outstandingTimezonesIds.foreach { timeZone =>
+ outstandingTimezonesIds.foreach { timeZone =>
withSQLConf(SQLConf.SESSION_LOCAL_TIMEZONE.key -> timeZone) {
val formatter = DateFormatter(
DateFormatter.defaultPattern,
@@ -99,7 +100,7 @@ class DateFormatterSuite extends SparkFunSuite with SQLHelper {
17877,
24837,
1110657).foreach { days =>
- DateTimeTestUtils.outstandingTimezonesIds.foreach { timeZone =>
+ outstandingTimezonesIds.foreach { timeZone =>
withSQLConf(SQLConf.SESSION_LOCAL_TIMEZONE.key -> timeZone) {
val formatter = DateFormatter(
DateFormatter.defaultPattern,
@@ -118,14 +119,14 @@ class DateFormatterSuite extends SparkFunSuite with SQLHelper {
}
test("parsing date without explicit day") {
- val formatter = DateFormatter("yyyy MMM", ZoneOffset.UTC)
+ val formatter = DateFormatter("yyyy MMM", UTC)
val daysSinceEpoch = formatter.parse("2018 Dec")
- assert(daysSinceEpoch === LocalDate.of(2018, 12, 1).toEpochDay)
+ assert(daysSinceEpoch === days(2018, 12, 1))
}
test("formatting negative years with default pattern") {
- val epochDays = LocalDate.of(-99, 1, 1).toEpochDay.toInt
- assert(DateFormatter(ZoneOffset.UTC).format(epochDays) === "-0099-01-01")
+ val epochDays = days(-99, 1, 1)
+ assert(DateFormatter(UTC).format(epochDays) === "-0099-01-01")
}
test("special date values") {
@@ -142,8 +143,8 @@ class DateFormatterSuite extends SparkFunSuite with SQLHelper {
}
test("SPARK-30958: parse date with negative year") {
- val formatter1 = DateFormatter("yyyy-MM-dd", ZoneOffset.UTC)
- assert(formatter1.parse("-1234-02-22") === localDateToDays(LocalDate.of(-1234, 2, 22)))
+ val formatter1 = DateFormatter("yyyy-MM-dd", UTC)
+ assert(formatter1.parse("-1234-02-22") === days(-1234, 2, 22))
def assertParsingError(f: => Unit): Unit = {
intercept[Exception](f) match {
@@ -155,18 +156,18 @@ class DateFormatterSuite extends SparkFunSuite with SQLHelper {
}
// "yyyy" with "G" can't parse negative year or year 0000.
- val formatter2 = DateFormatter("G yyyy-MM-dd", ZoneOffset.UTC)
+ val formatter2 = DateFormatter("G yyyy-MM-dd", UTC)
assertParsingError(formatter2.parse("BC -1234-02-22"))
assertParsingError(formatter2.parse("AD 0000-02-22"))
- assert(formatter2.parse("BC 1234-02-22") === localDateToDays(LocalDate.of(-1233, 2, 22)))
- assert(formatter2.parse("AD 1234-02-22") === localDateToDays(LocalDate.of(1234, 2, 22)))
+ assert(formatter2.parse("BC 1234-02-22") === days(-1233, 2, 22))
+ assert(formatter2.parse("AD 1234-02-22") === days(1234, 2, 22))
}
test("SPARK-31557: rebasing in legacy formatters/parsers") {
withSQLConf(SQLConf.LEGACY_TIME_PARSER_POLICY.key -> LegacyBehaviorPolicy.LEGACY.toString) {
LegacyDateFormats.values.foreach { legacyFormat =>
- DateTimeTestUtils.outstandingTimezonesIds.foreach { timeZone =>
+ outstandingTimezonesIds.foreach { timeZone =>
withSQLConf(SQLConf.SESSION_LOCAL_TIMEZONE.key -> timeZone) {
val formatter = DateFormatter(
DateFormatter.defaultPattern,
@@ -182,4 +183,17 @@ class DateFormatterSuite extends SparkFunSuite with SQLHelper {
}
}
}
+
+ test("missing date fields") {
+ val formatter = DateFormatter("HH", UTC)
+ val daysSinceEpoch = formatter.parse("20")
+ assert(daysSinceEpoch === days(1970, 1, 1))
+ }
+
+ test("missing year field with invalid date") {
+ val formatter = DateFormatter("MM-dd", UTC)
+ // The date parser in 2.4 accepts 1970-02-29 and turn it into 1970-03-01, so we should get a
+ // SparkUpgradeException here.
+ intercept[SparkUpgradeException](formatter.parse("02-29"))
+ }
}
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/util/TimestampFormatterSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/util/TimestampFormatterSuite.scala
index b467e24..dccb3de 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/util/TimestampFormatterSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/util/TimestampFormatterSuite.scala
@@ -17,15 +17,15 @@
package org.apache.spark.sql.util
-import java.time.{DateTimeException, Instant, LocalDateTime, LocalTime, ZoneOffset}
+import java.time.{DateTimeException, Instant, LocalDateTime, LocalTime}
import java.util.concurrent.TimeUnit
import org.scalatest.Matchers
import org.apache.spark.{SparkFunSuite, SparkUpgradeException}
import org.apache.spark.sql.catalyst.plans.SQLHelper
-import org.apache.spark.sql.catalyst.util.{DateTimeTestUtils, DateTimeUtils, LegacyDateFormats, TimestampFormatter}
-import org.apache.spark.sql.catalyst.util.DateTimeTestUtils.{CET, PST, UTC}
+import org.apache.spark.sql.catalyst.util.{LegacyDateFormats, TimestampFormatter}
+import org.apache.spark.sql.catalyst.util.DateTimeTestUtils._
import org.apache.spark.sql.catalyst.util.DateTimeUtils._
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.internal.SQLConf.LegacyBehaviorPolicy
@@ -44,10 +44,10 @@ class TimestampFormatterSuite extends SparkFunSuite with SQLHelper with Matchers
"Antarctica/Vostok" -> 1543723872001234L,
"Asia/Hong_Kong" -> 1543716672001234L,
"Europe/Amsterdam" -> 1543741872001234L)
- DateTimeTestUtils.outstandingTimezonesIds.foreach { zoneId =>
+ outstandingTimezonesIds.foreach { zoneId =>
val formatter = TimestampFormatter(
"yyyy-MM-dd'T'HH:mm:ss.SSSSSS",
- DateTimeUtils.getZoneId(zoneId),
+ getZoneId(zoneId),
needVarLengthSecondFraction = true)
val microsSinceEpoch = formatter.parse(localDate)
assert(microsSinceEpoch === expectedMicros(zoneId))
@@ -65,7 +65,7 @@ class TimestampFormatterSuite extends SparkFunSuite with SQLHelper with Matchers
"Antarctica/Vostok" -> "2018-12-02 16:11:12.001234",
"Asia/Hong_Kong" -> "2018-12-02 18:11:12.001234",
"Europe/Amsterdam" -> "2018-12-02 11:11:12.001234")
- DateTimeTestUtils.outstandingTimezonesIds.foreach { zoneId =>
+ outstandingTimezonesIds.foreach { zoneId =>
Seq(
TimestampFormatter(
"yyyy-MM-dd HH:mm:ss.SSSSSS",
@@ -95,7 +95,7 @@ class TimestampFormatterSuite extends SparkFunSuite with SQLHelper with Matchers
1543749753123456L,
2177456523456789L,
11858049903010203L).foreach { micros =>
- DateTimeTestUtils.outstandingZoneIds.foreach { zoneId =>
+ outstandingZoneIds.foreach { zoneId =>
val timestamp = TimestampFormatter(pattern, zoneId).format(micros)
val parsed = TimestampFormatter(
pattern, zoneId, needVarLengthSecondFraction = true).parse(timestamp)
@@ -116,7 +116,7 @@ class TimestampFormatterSuite extends SparkFunSuite with SQLHelper with Matchers
"2018-12-02T11:22:33.123456",
"2039-01-01T01:02:03.456789",
"2345-10-07T22:45:03.010203").foreach { timestamp =>
- DateTimeTestUtils.outstandingZoneIds.foreach { zoneId =>
+ outstandingZoneIds.foreach { zoneId =>
val pattern = "yyyy-MM-dd'T'HH:mm:ss.SSSSSS"
val micros = TimestampFormatter(
pattern, zoneId, needVarLengthSecondFraction = true).parse(timestamp)
@@ -127,10 +127,9 @@ class TimestampFormatterSuite extends SparkFunSuite with SQLHelper with Matchers
}
test("case insensitive parsing of am and pm") {
- val formatter = TimestampFormatter("yyyy MMM dd hh:mm:ss a", ZoneOffset.UTC)
+ val formatter = TimestampFormatter("yyyy MMM dd hh:mm:ss a", UTC)
val micros = formatter.parse("2009 Mar 20 11:30:01 am")
- assert(micros === TimeUnit.SECONDS.toMicros(
- LocalDateTime.of(2009, 3, 20, 11, 30, 1).toEpochSecond(ZoneOffset.UTC)))
+ assert(micros === date(2009, 3, 20, 11, 30, 1))
}
test("format fraction of second") {
@@ -143,7 +142,7 @@ class TimestampFormatterSuite extends SparkFunSuite with SQLHelper with Matchers
1000000 -> "1970-01-01 00:00:01").foreach { case (micros, tsStr) =>
assert(formatter.format(micros) === tsStr)
assert(formatter.format(microsToInstant(micros)) === tsStr)
- DateTimeTestUtils.withDefaultTimeZone(UTC) {
+ withDefaultTimeZone(UTC) {
assert(formatter.format(toJavaTimestamp(micros)) === tsStr)
}
}
@@ -151,10 +150,10 @@ class TimestampFormatterSuite extends SparkFunSuite with SQLHelper with Matchers
test("formatting negative years with default pattern") {
val instant = LocalDateTime.of(-99, 1, 1, 0, 0, 0).atZone(UTC).toInstant
- val micros = DateTimeUtils.instantToMicros(instant)
+ val micros = instantToMicros(instant)
assert(TimestampFormatter(UTC).format(micros) === "-0099-01-01 00:00:00")
assert(TimestampFormatter(UTC).format(instant) === "-0099-01-01 00:00:00")
- DateTimeTestUtils.withDefaultTimeZone(UTC) { // toJavaTimestamp depends on the default time zone
+ withDefaultTimeZone(UTC) { // toJavaTimestamp depends on the default time zone
assert(TimestampFormatter("yyyy-MM-dd HH:mm:SS G", UTC).format(toJavaTimestamp(micros))
=== "0100-01-01 00:00:00 BC")
}
@@ -181,11 +180,10 @@ class TimestampFormatterSuite extends SparkFunSuite with SQLHelper with Matchers
}
test("parsing timestamp strings with various seconds fractions") {
- DateTimeTestUtils.outstandingZoneIds.foreach { zoneId =>
+ outstandingZoneIds.foreach { zoneId =>
def check(pattern: String, input: String, reference: String): Unit = {
val formatter = TimestampFormatter(pattern, zoneId, needVarLengthSecondFraction = true)
- val expected = DateTimeUtils.stringToTimestamp(
- UTF8String.fromString(reference), zoneId).get
+ val expected = stringToTimestamp(UTF8String.fromString(reference), zoneId).get
val actual = formatter.parse(input)
assert(actual === expected)
}
@@ -219,11 +217,10 @@ class TimestampFormatterSuite extends SparkFunSuite with SQLHelper with Matchers
}
test("formatting timestamp strings up to microsecond precision") {
- DateTimeTestUtils.outstandingZoneIds.foreach { zoneId =>
+ outstandingZoneIds.foreach { zoneId =>
def check(pattern: String, input: String, expected: String): Unit = {
val formatter = TimestampFormatter(pattern, zoneId)
- val timestamp = DateTimeUtils.stringToTimestamp(
- UTF8String.fromString(input), zoneId).get
+ val timestamp = stringToTimestamp(UTF8String.fromString(input), zoneId).get
val actual = formatter.format(timestamp)
assert(actual === expected)
}
@@ -259,9 +256,8 @@ class TimestampFormatterSuite extends SparkFunSuite with SQLHelper with Matchers
}
test("SPARK-30958: parse timestamp with negative year") {
- val formatter1 = TimestampFormatter("yyyy-MM-dd HH:mm:ss", ZoneOffset.UTC, true)
- assert(formatter1.parse("-1234-02-22 02:22:22") === instantToMicros(
- LocalDateTime.of(-1234, 2, 22, 2, 22, 22).toInstant(ZoneOffset.UTC)))
+ val formatter1 = TimestampFormatter("yyyy-MM-dd HH:mm:ss", UTC, true)
+ assert(formatter1.parse("-1234-02-22 02:22:22") === date(-1234, 2, 22, 2, 22, 22))
def assertParsingError(f: => Unit): Unit = {
intercept[Exception](f) match {
@@ -277,17 +273,15 @@ class TimestampFormatterSuite extends SparkFunSuite with SQLHelper with Matchers
assertParsingError(formatter2.parse("BC -1234-02-22 02:22:22"))
assertParsingError(formatter2.parse("AC 0000-02-22 02:22:22"))
- assert(formatter2.parse("BC 1234-02-22 02:22:22") === instantToMicros(
- LocalDateTime.of(-1233, 2, 22, 2, 22, 22).toInstant(ZoneOffset.UTC)))
- assert(formatter2.parse("AD 1234-02-22 02:22:22") === instantToMicros(
- LocalDateTime.of(1234, 2, 22, 2, 22, 22).toInstant(ZoneOffset.UTC)))
+ assert(formatter2.parse("BC 1234-02-22 02:22:22") === date(-1233, 2, 22, 2, 22, 22))
+ assert(formatter2.parse("AD 1234-02-22 02:22:22") === date(1234, 2, 22, 2, 22, 22))
}
test("SPARK-31557: rebasing in legacy formatters/parsers") {
withSQLConf(SQLConf.LEGACY_TIME_PARSER_POLICY.key -> LegacyBehaviorPolicy.LEGACY.toString) {
- DateTimeTestUtils.outstandingZoneIds.foreach { zoneId =>
+ outstandingZoneIds.foreach { zoneId =>
withSQLConf(SQLConf.SESSION_LOCAL_TIMEZONE.key -> zoneId.getId) {
- DateTimeTestUtils.withDefaultTimeZone(zoneId) {
+ withDefaultTimeZone(zoneId) {
withClue(s"zoneId = ${zoneId.getId}") {
val formatters = LegacyDateFormats.values.map { legacyFormat =>
TimestampFormatter(
@@ -296,7 +290,7 @@ class TimestampFormatterSuite extends SparkFunSuite with SQLHelper with Matchers
TimestampFormatter.defaultLocale,
legacyFormat,
needVarLengthSecondFraction = false)
- }.toSeq :+ TimestampFormatter.getFractionFormatter(zoneId)
+ }.toSeq :+ TimestampFormatter.getFractionFormatter(zoneId)
formatters.foreach { formatter =>
assert(microsToInstant(formatter.parse("1000-01-01 01:02:03"))
.atZone(zoneId)
@@ -317,4 +311,89 @@ class TimestampFormatterSuite extends SparkFunSuite with SQLHelper with Matchers
}
}
}
+
+ test("parsing hour with various patterns") {
+ def createFormatter(pattern: String): TimestampFormatter = {
+ // Use `SIMPLE_DATE_FORMAT`, so that the legacy parser also fails with invalid value range.
+ TimestampFormatter(pattern, UTC, LegacyDateFormats.SIMPLE_DATE_FORMAT, false)
+ }
+
+ withClue("HH") {
+ val formatter = createFormatter("yyyy-MM-dd HH")
+
+ val micros1 = formatter.parse("2009-12-12 00")
+ assert(micros1 === date(2009, 12, 12))
+
+ val micros2 = formatter.parse("2009-12-12 15")
+ assert(micros2 === date(2009, 12, 12, 15))
+
+ intercept[DateTimeException](formatter.parse("2009-12-12 24"))
+ }
+
+ withClue("kk") {
+ val formatter = createFormatter("yyyy-MM-dd kk")
+
+ intercept[DateTimeException](formatter.parse("2009-12-12 00"))
+
+ val micros1 = formatter.parse("2009-12-12 15")
+ assert(micros1 === date(2009, 12, 12, 15))
+
+ val micros2 = formatter.parse("2009-12-12 24")
+ assert(micros2 === date(2009, 12, 12))
+ }
+
+ withClue("KK") {
+ val formatter = createFormatter("yyyy-MM-dd KK a")
+
+ val micros1 = formatter.parse("2009-12-12 00 am")
+ assert(micros1 === date(2009, 12, 12))
+
+ // For `KK`, "12:00:00 am" is the same as "00:00:00 pm".
+ val micros2 = formatter.parse("2009-12-12 12 am")
+ assert(micros2 === date(2009, 12, 12, 12))
+
+ val micros3 = formatter.parse("2009-12-12 00 pm")
+ assert(micros3 === date(2009, 12, 12, 12))
+
+ intercept[DateTimeException](formatter.parse("2009-12-12 12 pm"))
+ }
+
+ withClue("hh") {
+ val formatter = createFormatter("yyyy-MM-dd hh a")
+
+ intercept[DateTimeException](formatter.parse("2009-12-12 00 am"))
+
+ val micros1 = formatter.parse("2009-12-12 12 am")
+ assert(micros1 === date(2009, 12, 12))
+
+ intercept[DateTimeException](formatter.parse("2009-12-12 00 pm"))
+
+ val micros2 = formatter.parse("2009-12-12 12 pm")
+ assert(micros2 === date(2009, 12, 12, 12))
+ }
+ }
+
+ test("missing date fields") {
+ val formatter = TimestampFormatter("HH:mm:ss", UTC)
+ val micros = formatter.parse("11:30:01")
+ assert(micros === date(1970, 1, 1, 11, 30, 1))
+ }
+
+ test("missing year field with invalid date") {
+ // Use `SIMPLE_DATE_FORMAT`, so that the legacy parser also fails with invalid date.
+ val formatter = TimestampFormatter("MM-dd", UTC, LegacyDateFormats.SIMPLE_DATE_FORMAT, false)
+ withDefaultTimeZone(UTC)(intercept[DateTimeException](formatter.parse("02-29")))
+ }
+
+ test("missing am/pm field") {
+ val formatter = TimestampFormatter("yyyy hh:mm:ss", UTC)
+ val micros = formatter.parse("2009 11:30:01")
+ assert(micros === date(2009, 1, 1, 11, 30, 1))
+ }
+
+ test("missing time fields") {
+ val formatter = TimestampFormatter("yyyy HH", UTC)
+ val micros = formatter.parse("2009 11")
+ assert(micros === date(2009, 1, 1, 11))
+ }
}
diff --git a/sql/core/src/test/resources/sql-tests/inputs/datetime.sql b/sql/core/src/test/resources/sql-tests/inputs/datetime.sql
index fd33250..9be857e 100644
--- a/sql/core/src/test/resources/sql-tests/inputs/datetime.sql
+++ b/sql/core/src/test/resources/sql-tests/inputs/datetime.sql
@@ -86,7 +86,7 @@ select date_sub('2011-11-11', str) from v;
select null - date '2019-10-06';
select date '2001-10-01' - date '2001-09-28';
--- variable-length tests
+-- variable-length second fraction tests
select to_timestamp('2019-10-06 10:11:12.', 'yyyy-MM-dd HH:mm:ss.SSSSSS[zzz]');
select to_timestamp('2019-10-06 10:11:12.0', 'yyyy-MM-dd HH:mm:ss.SSSSSS[zzz]');
select to_timestamp('2019-10-06 10:11:12.1', 'yyyy-MM-dd HH:mm:ss.SSSSSS[zzz]');
@@ -95,7 +95,7 @@ select to_timestamp('2019-10-06 10:11:12.123UTC', 'yyyy-MM-dd HH:mm:ss.SSSSSS[zz
select to_timestamp('2019-10-06 10:11:12.1234', 'yyyy-MM-dd HH:mm:ss.SSSSSS[zzz]');
select to_timestamp('2019-10-06 10:11:12.12345CST', 'yyyy-MM-dd HH:mm:ss.SSSSSS[zzz]');
select to_timestamp('2019-10-06 10:11:12.123456PST', 'yyyy-MM-dd HH:mm:ss.SSSSSS[zzz]');
--- exceeded max variable length
+-- second fraction exceeded max variable length
select to_timestamp('2019-10-06 10:11:12.1234567PST', 'yyyy-MM-dd HH:mm:ss.SSSSSS[zzz]');
-- special cases
select to_timestamp('123456 2019-10-06 10:11:12.123456PST', 'SSSSSS yyyy-MM-dd HH:mm:ss.SSSSSS[zzz]');
@@ -122,3 +122,11 @@ select to_timestamp("2019-10-06T10:11:12'12", "yyyy-MM-dd'T'HH:mm:ss''SSSS"); --
select to_timestamp("2019-10-06T10:11:12'", "yyyy-MM-dd'T'HH:mm:ss''"); -- tail
select to_timestamp("'2019-10-06T10:11:12", "''yyyy-MM-dd'T'HH:mm:ss"); -- head
select to_timestamp("P2019-10-06T10:11:12", "'P'yyyy-MM-dd'T'HH:mm:ss"); -- head but as single quote
+
+-- missing fields
+select to_timestamp("16", "dd");
+select to_timestamp("02-29", "MM-dd");
+select to_date("16", "dd");
+select to_date("02-29", "MM-dd");
+select to_timestamp("2019 40", "yyyy mm");
+select to_timestamp("2019 10:10:10", "yyyy hh:mm:ss");
diff --git a/sql/core/src/test/resources/sql-tests/inputs/json-functions.sql b/sql/core/src/test/resources/sql-tests/inputs/json-functions.sql
index 6c14eee..5bd78f5 100644
--- a/sql/core/src/test/resources/sql-tests/inputs/json-functions.sql
+++ b/sql/core/src/test/resources/sql-tests/inputs/json-functions.sql
@@ -48,6 +48,21 @@ select from_json('[null, {"a":2}]', 'array<struct<a:int>>');
select from_json('[{"a": 1}, {"b":2}]', 'array<map<string,int>>');
select from_json('[{"a": 1}, 2]', 'array<map<string,int>>');
+-- from_json - datetime type
+select from_json('{"d": "2012-12-15", "t": "2012-12-15 15:15:15"}', 'd date, t timestamp');
+select from_json(
+ '{"d": "12/15 2012", "t": "12/15 2012 15:15:15"}',
+ 'd date, t timestamp',
+ map('dateFormat', 'MM/dd yyyy', 'timestampFormat', 'MM/dd yyyy HH:mm:ss'));
+select from_json(
+ '{"d": "02-29"}',
+ 'd date',
+ map('dateFormat', 'MM-dd'));
+select from_json(
+ '{"t": "02-29"}',
+ 't timestamp',
+ map('timestampFormat', 'MM-dd'));
+
-- to_json - array type
select to_json(array('1', '2', '3'));
select to_json(array(array(1, 2, 3), array(4)));
diff --git a/sql/core/src/test/resources/sql-tests/results/ansi/datetime.sql.out b/sql/core/src/test/resources/sql-tests/results/ansi/datetime.sql.out
index 81a73a6..89596c7 100644
--- a/sql/core/src/test/resources/sql-tests/results/ansi/datetime.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/ansi/datetime.sql.out
@@ -1,5 +1,5 @@
-- Automatically generated by SQLQueryTestSuite
--- Number of queries: 85
+-- Number of queries: 91
-- !query
@@ -730,3 +730,51 @@ select to_timestamp("P2019-10-06T10:11:12", "'P'yyyy-MM-dd'T'HH:mm:ss")
struct<to_timestamp('P2019-10-06T10:11:12', '\'P\'yyyy-MM-dd\'T\'HH:mm:ss'):timestamp>
-- !query output
2019-10-06 10:11:12
+
+
+-- !query
+select to_timestamp("16", "dd")
+-- !query schema
+struct<to_timestamp('16', 'dd'):timestamp>
+-- !query output
+1970-01-16 00:00:00
+
+
+-- !query
+select to_timestamp("02-29", "MM-dd")
+-- !query schema
+struct<to_timestamp('02-29', 'MM-dd'):timestamp>
+-- !query output
+NULL
+
+
+-- !query
+select to_date("16", "dd")
+-- !query schema
+struct<to_date('16', 'dd'):date>
+-- !query output
+1970-01-16
+
+
+-- !query
+select to_date("02-29", "MM-dd")
+-- !query schema
+struct<to_date('02-29', 'MM-dd'):date>
+-- !query output
+NULL
+
+
+-- !query
+select to_timestamp("2019 40", "yyyy mm")
+-- !query schema
+struct<to_timestamp('2019 40', 'yyyy mm'):timestamp>
+-- !query output
+2019-01-01 00:40:00
+
+
+-- !query
+select to_timestamp("2019 10:10:10", "yyyy hh:mm:ss")
+-- !query schema
+struct<to_timestamp('2019 10:10:10', 'yyyy hh:mm:ss'):timestamp>
+-- !query output
+2019-01-01 10:10:10
diff --git a/sql/core/src/test/resources/sql-tests/results/datetime.sql.out b/sql/core/src/test/resources/sql-tests/results/datetime.sql.out
index 2e60085..b26eb2f 100755
--- a/sql/core/src/test/resources/sql-tests/results/datetime.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/datetime.sql.out
@@ -1,5 +1,5 @@
-- Automatically generated by SQLQueryTestSuite
--- Number of queries: 85
+-- Number of queries: 91
-- !query
@@ -702,3 +702,51 @@ select to_timestamp("P2019-10-06T10:11:12", "'P'yyyy-MM-dd'T'HH:mm:ss")
struct<to_timestamp('P2019-10-06T10:11:12', '\'P\'yyyy-MM-dd\'T\'HH:mm:ss'):timestamp>
-- !query output
2019-10-06 10:11:12
+
+
+-- !query
+select to_timestamp("16", "dd")
+-- !query schema
+struct<to_timestamp('16', 'dd'):timestamp>
+-- !query output
+1970-01-16 00:00:00
+
+
+-- !query
+select to_timestamp("02-29", "MM-dd")
+-- !query schema
+struct<to_timestamp('02-29', 'MM-dd'):timestamp>
+-- !query output
+NULL
+
+
+-- !query
+select to_date("16", "dd")
+-- !query schema
+struct<to_date('16', 'dd'):date>
+-- !query output
+1970-01-16
+
+
+-- !query
+select to_date("02-29", "MM-dd")
+-- !query schema
+struct<to_date('02-29', 'MM-dd'):date>
+-- !query output
+NULL
+
+
+-- !query
+select to_timestamp("2019 40", "yyyy mm")
+-- !query schema
+struct<to_timestamp('2019 40', 'yyyy mm'):timestamp>
+-- !query output
+2019-01-01 00:40:00
+
+
+-- !query
+select to_timestamp("2019 10:10:10", "yyyy hh:mm:ss")
+-- !query schema
+struct<to_timestamp('2019 10:10:10', 'yyyy hh:mm:ss'):timestamp>
+-- !query output
+2019-01-01 10:10:10
diff --git a/sql/core/src/test/resources/sql-tests/results/json-functions.sql.out b/sql/core/src/test/resources/sql-tests/results/json-functions.sql.out
index 21a3531..665c79c 100644
--- a/sql/core/src/test/resources/sql-tests/results/json-functions.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/json-functions.sql.out
@@ -1,5 +1,5 @@
-- Automatically generated by SQLQueryTestSuite
--- Number of queries: 42
+-- Number of queries: 46
-- !query
@@ -289,6 +289,49 @@ NULL
-- !query
+select from_json('{"d": "2012-12-15", "t": "2012-12-15 15:15:15"}', 'd date, t timestamp')
+-- !query schema
+struct<from_json({"d": "2012-12-15", "t": "2012-12-15 15:15:15"}):struct<d:date,t:timestamp>>
+-- !query output
+{"d":2012-12-15,"t":2012-12-15 15:15:15}
+
+
+-- !query
+select from_json(
+ '{"d": "12/15 2012", "t": "12/15 2012 15:15:15"}',
+ 'd date, t timestamp',
+ map('dateFormat', 'MM/dd yyyy', 'timestampFormat', 'MM/dd yyyy HH:mm:ss'))
+-- !query schema
+struct<from_json({"d": "12/15 2012", "t": "12/15 2012 15:15:15"}):struct<d:date,t:timestamp>>
+-- !query output
+{"d":2012-12-15,"t":2012-12-15 15:15:15}
+
+
+-- !query
+select from_json(
+ '{"d": "02-29"}',
+ 'd date',
+ map('dateFormat', 'MM-dd'))
+-- !query schema
+struct<>
+-- !query output
+org.apache.spark.SparkUpgradeException
+You may get a different result due to the upgrading of Spark 3.0: Fail to parse '02-29' in the new parser. You can set spark.sql.legacy.timeParserPolicy to LEGACY to restore the behavior before Spark 3.0, or set to CORRECTED and treat it as an invalid datetime string.
+
+
+-- !query
+select from_json(
+ '{"t": "02-29"}',
+ 't timestamp',
+ map('timestampFormat', 'MM-dd'))
+-- !query schema
+struct<>
+-- !query output
+org.apache.spark.SparkUpgradeException
+You may get a different result due to the upgrading of Spark 3.0: Fail to parse '02-29' in the new parser. You can set spark.sql.legacy.timeParserPolicy to LEGACY to restore the behavior before Spark 3.0, or set to CORRECTED and treat it as an invalid datetime string.
+
+
+-- !query
select to_json(array('1', '2', '3'))
-- !query schema
struct<to_json(array(1, 2, 3)):string>
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org