You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by we...@apache.org on 2020/02/05 11:02:18 UTC
[spark] branch branch-3.0 updated: [SPARK-30668][SQL] Support
`SimpleDateFormat` patterns in parsing timestamps/dates strings
This is an automated email from the ASF dual-hosted git repository.
wenchen pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.0 by this push:
new 92f5723 [SPARK-30668][SQL] Support `SimpleDateFormat` patterns in parsing timestamps/dates strings
92f5723 is described below
commit 92f57237871400ab9d499e1174af22a867c01988
Author: Maxim Gekk <ma...@gmail.com>
AuthorDate: Wed Feb 5 18:48:45 2020 +0800
[SPARK-30668][SQL] Support `SimpleDateFormat` patterns in parsing timestamps/dates strings
### What changes were proposed in this pull request?
In this PR, I propose to partially revert the commit https://github.com/apache/spark/commit/51a6ba0181a013f2b62b47184785a8b6f6a78f12, and to provide a legacy parser based on `FastDateFormat`, which is compatible with `SimpleDateFormat`.
To enable the legacy parser, set `spark.sql.legacy.timeParser.enabled` to `true`.
### Why are the changes needed?
To allow users to restore the old behavior in parsing timestamps/dates using `SimpleDateFormat` patterns. The main reason for restoring it is that `DateTimeFormatter`'s patterns are not fully compatible with `SimpleDateFormat` patterns; see https://issues.apache.org/jira/browse/SPARK-30668
### Does this PR introduce any user-facing change?
Yes
### How was this patch tested?
- Added a new test to `DateFunctionsSuite`.
- Restored additional test cases in `JsonInferSchemaSuite`.
Closes #27441 from MaxGekk/support-simpledateformat.
Authored-by: Maxim Gekk <ma...@gmail.com>
Signed-off-by: Wenchen Fan <we...@databricks.com>
(cherry picked from commit 459e757ed40fd1cdd37911d3f57b48b54ca2fff7)
Signed-off-by: Wenchen Fan <we...@databricks.com>
---
docs/sql-migration-guide.md | 4 +-
.../spark/sql/catalyst/util/DateFormatter.scala | 35 ++++++++--
.../sql/catalyst/util/TimestampFormatter.scala | 38 +++++++++--
.../org/apache/spark/sql/internal/SQLConf.scala | 10 +++
.../sql/catalyst/json/JsonInferSchemaSuite.scala | 79 +++++++++++++---------
.../org/apache/spark/sql/DateFunctionsSuite.scala | 14 ++++
6 files changed, 136 insertions(+), 44 deletions(-)
diff --git a/docs/sql-migration-guide.md b/docs/sql-migration-guide.md
index 0c47370..5a5e802 100644
--- a/docs/sql-migration-guide.md
+++ b/docs/sql-migration-guide.md
@@ -67,9 +67,7 @@ license: |
- Since Spark 3.0, Proleptic Gregorian calendar is used in parsing, formatting, and converting dates and timestamps as well as in extracting sub-components like years, days and etc. Spark 3.0 uses Java 8 API classes from the java.time packages that based on ISO chronology (https://docs.oracle.com/javase/8/docs/api/java/time/chrono/IsoChronology.html). In Spark version 2.4 and earlier, those operations are performed by using the hybrid calendar (Julian + Gregorian, see https://docs.orac [...]
- - CSV/JSON datasources use java.time API for parsing and generating CSV/JSON content. In Spark version 2.4 and earlier, java.text.SimpleDateFormat is used for the same purpose with fallbacks to the parsing mechanisms of Spark 2.0 and 1.x. For example, `2018-12-08 10:39:21.123` with the pattern `yyyy-MM-dd'T'HH:mm:ss.SSS` cannot be parsed since Spark 3.0 because the timestamp does not match to the pattern but it can be parsed by earlier Spark versions due to a fallback to `Timestamp.v [...]
-
- - The `unix_timestamp`, `date_format`, `to_unix_timestamp`, `from_unixtime`, `to_date`, `to_timestamp` functions. New implementation supports pattern formats as described here https://docs.oracle.com/javase/8/docs/api/java/time/format/DateTimeFormatter.html and performs strict checking of its input. For example, the `2015-07-22 10:00:00` timestamp cannot be parse if pattern is `yyyy-MM-dd` because the parser does not consume whole input. Another example is the `31/01/2015 00:00` inpu [...]
+ - Parsing/formatting of timestamp/date strings. This effects on CSV/JSON datasources and on the `unix_timestamp`, `date_format`, `to_unix_timestamp`, `from_unixtime`, `to_date`, `to_timestamp` functions when patterns specified by users is used for parsing and formatting. Since Spark 3.0, the conversions are based on `java.time.format.DateTimeFormatter`, see https://docs.oracle.com/javase/8/docs/api/java/time/format/DateTimeFormatter.html. New implementation performs strict checking o [...]
- The `weekofyear`, `weekday`, `dayofweek`, `date_trunc`, `from_utc_timestamp`, `to_utc_timestamp`, and `unix_timestamp` functions use java.time API for calculation week number of year, day number of week as well for conversion from/to TimestampType values in UTC time zone.
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateFormatter.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateFormatter.scala
index 7f982b0..28189b6 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateFormatter.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateFormatter.scala
@@ -20,7 +20,10 @@ package org.apache.spark.sql.catalyst.util
import java.time.{LocalDate, ZoneId}
import java.util.Locale
-import DateTimeUtils.{convertSpecialDate, localDateToDays}
+import org.apache.commons.lang3.time.FastDateFormat
+
+import org.apache.spark.sql.catalyst.util.DateTimeUtils.{convertSpecialDate, localDateToDays}
+import org.apache.spark.sql.internal.SQLConf
sealed trait DateFormatter extends Serializable {
def parse(s: String): Int // returns days since epoch
@@ -48,17 +51,41 @@ class Iso8601DateFormatter(
}
}
+class LegacyDateFormatter(pattern: String, locale: Locale) extends DateFormatter {
+ @transient
+ private lazy val format = FastDateFormat.getInstance(pattern, locale)
+
+ override def parse(s: String): Int = {
+ val milliseconds = format.parse(s).getTime
+ DateTimeUtils.millisToDays(milliseconds)
+ }
+
+ override def format(days: Int): String = {
+ val date = DateTimeUtils.toJavaDate(days)
+ format.format(date)
+ }
+}
+
object DateFormatter {
- val defaultPattern: String = "uuuu-MM-dd"
val defaultLocale: Locale = Locale.US
def apply(format: String, zoneId: ZoneId, locale: Locale): DateFormatter = {
- new Iso8601DateFormatter(format, zoneId, locale)
+ if (SQLConf.get.legacyTimeParserEnabled) {
+ new LegacyDateFormatter(format, locale)
+ } else {
+ new Iso8601DateFormatter(format, zoneId, locale)
+ }
}
def apply(format: String, zoneId: ZoneId): DateFormatter = {
apply(format, zoneId, defaultLocale)
}
- def apply(zoneId: ZoneId): DateFormatter = apply(defaultPattern, zoneId)
+ def apply(zoneId: ZoneId): DateFormatter = {
+ if (SQLConf.get.legacyTimeParserEnabled) {
+ new LegacyDateFormatter("yyyy-MM-dd", defaultLocale)
+ } else {
+ new Iso8601DateFormatter("uuuu-MM-dd", zoneId, defaultLocale)
+ }
+ }
}
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/TimestampFormatter.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/TimestampFormatter.scala
index 5be4807..fe1a4fe 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/TimestampFormatter.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/TimestampFormatter.scala
@@ -22,10 +22,14 @@ import java.time._
import java.time.format.DateTimeParseException
import java.time.temporal.ChronoField.MICRO_OF_SECOND
import java.time.temporal.TemporalQueries
-import java.util.Locale
+import java.util.{Locale, TimeZone}
import java.util.concurrent.TimeUnit.SECONDS
-import DateTimeUtils.convertSpecialTimestamp
+import org.apache.commons.lang3.time.FastDateFormat
+
+import org.apache.spark.sql.catalyst.util.DateTimeConstants.MICROS_PER_MILLIS
+import org.apache.spark.sql.catalyst.util.DateTimeUtils.convertSpecialTimestamp
+import org.apache.spark.sql.internal.SQLConf
sealed trait TimestampFormatter extends Serializable {
/**
@@ -86,12 +90,32 @@ class FractionTimestampFormatter(zoneId: ZoneId)
override protected lazy val formatter = DateTimeFormatterHelper.fractionFormatter
}
+class LegacyTimestampFormatter(
+ pattern: String,
+ zoneId: ZoneId,
+ locale: Locale) extends TimestampFormatter {
+
+ @transient private lazy val format =
+ FastDateFormat.getInstance(pattern, TimeZone.getTimeZone(zoneId), locale)
+
+ protected def toMillis(s: String): Long = format.parse(s).getTime
+
+ override def parse(s: String): Long = toMillis(s) * MICROS_PER_MILLIS
+
+ override def format(us: Long): String = {
+ format.format(DateTimeUtils.toJavaTimestamp(us))
+ }
+}
+
object TimestampFormatter {
- val defaultPattern: String = "uuuu-MM-dd HH:mm:ss"
val defaultLocale: Locale = Locale.US
def apply(format: String, zoneId: ZoneId, locale: Locale): TimestampFormatter = {
- new Iso8601TimestampFormatter(format, zoneId, locale)
+ if (SQLConf.get.legacyTimeParserEnabled) {
+ new LegacyTimestampFormatter(format, zoneId, locale)
+ } else {
+ new Iso8601TimestampFormatter(format, zoneId, locale)
+ }
}
def apply(format: String, zoneId: ZoneId): TimestampFormatter = {
@@ -99,7 +123,11 @@ object TimestampFormatter {
}
def apply(zoneId: ZoneId): TimestampFormatter = {
- apply(defaultPattern, zoneId, defaultLocale)
+ if (SQLConf.get.legacyTimeParserEnabled) {
+ new LegacyTimestampFormatter("yyyy-MM-dd HH:mm:ss", zoneId, defaultLocale)
+ } else {
+ new Iso8601TimestampFormatter("uuuu-MM-dd HH:mm:ss", zoneId, defaultLocale)
+ }
}
def getFractionFormatter(zoneId: ZoneId): TimestampFormatter = {
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
index 5ce5692..acc0922 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
@@ -2159,6 +2159,14 @@ object SQLConf {
.checkValue(_ > 0, "The value of spark.sql.addPartitionInBatch.size must be positive")
.createWithDefault(100)
+ val LEGACY_TIME_PARSER_ENABLED = buildConf("spark.sql.legacy.timeParser.enabled")
+ .internal()
+ .doc("When set to true, java.text.SimpleDateFormat is used for formatting and parsing " +
+ "dates/timestamps in a locale-sensitive manner. When set to false, classes from " +
+ "java.time.* packages are used for the same purpose.")
+ .booleanConf
+ .createWithDefault(false)
+
/**
* Holds information about keys that have been deprecated.
*
@@ -2447,6 +2455,8 @@ class SQLConf extends Serializable with Logging {
def legacyMsSqlServerNumericMappingEnabled: Boolean =
getConf(LEGACY_MSSQLSERVER_NUMERIC_MAPPING_ENABLED)
+ def legacyTimeParserEnabled: Boolean = getConf(SQLConf.LEGACY_TIME_PARSER_ENABLED)
+
/**
* Returns the [[Resolver]] for the current configuration, which can be used to determine if two
* identifiers are equal.
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/json/JsonInferSchemaSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/json/JsonInferSchemaSuite.scala
index a48e618..c2e03bd 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/json/JsonInferSchemaSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/json/JsonInferSchemaSuite.scala
@@ -17,10 +17,9 @@
package org.apache.spark.sql.catalyst.json
-import com.fasterxml.jackson.core.JsonFactory
-
import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.catalyst.plans.SQLHelper
+import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.types._
class JsonInferSchemaSuite extends SparkFunSuite with SQLHelper {
@@ -41,45 +40,61 @@ class JsonInferSchemaSuite extends SparkFunSuite with SQLHelper {
}
test("inferring timestamp type") {
- checkTimestampType("yyyy", """{"a": "2018"}""")
- checkTimestampType("yyyy=MM", """{"a": "2018=12"}""")
- checkTimestampType("yyyy MM dd", """{"a": "2018 12 02"}""")
- checkTimestampType(
- "yyyy-MM-dd'T'HH:mm:ss.SSS",
- """{"a": "2018-12-02T21:04:00.123"}""")
- checkTimestampType(
- "yyyy-MM-dd'T'HH:mm:ss.SSSSSSXXX",
- """{"a": "2018-12-02T21:04:00.123567+01:00"}""")
+ Seq(true, false).foreach { legacyParser =>
+ withSQLConf(SQLConf.LEGACY_TIME_PARSER_ENABLED.key -> legacyParser.toString) {
+ checkTimestampType("yyyy", """{"a": "2018"}""")
+ checkTimestampType("yyyy=MM", """{"a": "2018=12"}""")
+ checkTimestampType("yyyy MM dd", """{"a": "2018 12 02"}""")
+ checkTimestampType(
+ "yyyy-MM-dd'T'HH:mm:ss.SSS",
+ """{"a": "2018-12-02T21:04:00.123"}""")
+ checkTimestampType(
+ "yyyy-MM-dd'T'HH:mm:ss.SSSSSSXXX",
+ """{"a": "2018-12-02T21:04:00.123567+01:00"}""")
+ }
+ }
}
test("prefer decimals over timestamps") {
- checkType(
- options = Map(
- "prefersDecimal" -> "true",
- "timestampFormat" -> "yyyyMMdd.HHmmssSSS"
- ),
- json = """{"a": "20181202.210400123"}""",
- dt = DecimalType(17, 9)
- )
+ Seq(true, false).foreach { legacyParser =>
+ withSQLConf(SQLConf.LEGACY_TIME_PARSER_ENABLED.key -> legacyParser.toString) {
+ checkType(
+ options = Map(
+ "prefersDecimal" -> "true",
+ "timestampFormat" -> "yyyyMMdd.HHmmssSSS"
+ ),
+ json = """{"a": "20181202.210400123"}""",
+ dt = DecimalType(17, 9)
+ )
+ }
+ }
}
test("skip decimal type inferring") {
- checkType(
- options = Map(
- "prefersDecimal" -> "false",
- "timestampFormat" -> "yyyyMMdd.HHmmssSSS"
- ),
- json = """{"a": "20181202.210400123"}""",
- dt = TimestampType
- )
+ Seq(true, false).foreach { legacyParser =>
+ withSQLConf(SQLConf.LEGACY_TIME_PARSER_ENABLED.key -> legacyParser.toString) {
+ checkType(
+ options = Map(
+ "prefersDecimal" -> "false",
+ "timestampFormat" -> "yyyyMMdd.HHmmssSSS"
+ ),
+ json = """{"a": "20181202.210400123"}""",
+ dt = TimestampType
+ )
+ }
+ }
}
test("fallback to string type") {
- checkType(
- options = Map("timestampFormat" -> "yyyy,MM,dd.HHmmssSSS"),
- json = """{"a": "20181202.210400123"}""",
- dt = StringType
- )
+ Seq(true, false).foreach { legacyParser =>
+ withSQLConf(SQLConf.LEGACY_TIME_PARSER_ENABLED.key -> legacyParser.toString) {
+ checkType(
+ options = Map("timestampFormat" -> "yyyy,MM,dd.HHmmssSSS"),
+ json = """{"a": "20181202.210400123"}""",
+ dt = StringType
+ )
+ }
+ }
}
test("disable timestamp inferring") {
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala
index d7d8c2c..3b3d3cc 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala
@@ -789,4 +789,18 @@ class DateFunctionsSuite extends QueryTest with SharedSparkSession {
Row(Timestamp.valueOf("2015-07-24 07:00:00")),
Row(Timestamp.valueOf("2015-07-24 22:00:00"))))
}
+
+ test("SPARK-30668: use legacy timestamp parser in to_timestamp") {
+ def checkTimeZoneParsing(expected: Any): Unit = {
+ val df = Seq("2020-01-27T20:06:11.847-0800").toDF("ts")
+ checkAnswer(df.select(to_timestamp(col("ts"), "yyyy-MM-dd'T'HH:mm:ss.SSSz")),
+ Row(expected))
+ }
+ withSQLConf(SQLConf.LEGACY_TIME_PARSER_ENABLED.key -> "true") {
+ checkTimeZoneParsing(Timestamp.valueOf("2020-01-27 20:06:11.847"))
+ }
+ withSQLConf(SQLConf.LEGACY_TIME_PARSER_ENABLED.key -> "false") {
+ checkTimeZoneParsing(null)
+ }
+ }
}
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org