Posted to commits@spark.apache.org by we...@apache.org on 2020/02/05 11:02:18 UTC

[spark] branch branch-3.0 updated: [SPARK-30668][SQL] Support `SimpleDateFormat` patterns in parsing timestamps/dates strings

This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.0 by this push:
     new 92f5723  [SPARK-30668][SQL] Support `SimpleDateFormat` patterns in parsing timestamps/dates strings
92f5723 is described below

commit 92f57237871400ab9d499e1174af22a867c01988
Author: Maxim Gekk <ma...@gmail.com>
AuthorDate: Wed Feb 5 18:48:45 2020 +0800

    [SPARK-30668][SQL] Support `SimpleDateFormat` patterns in parsing timestamps/dates strings
    
    ### What changes were proposed in this pull request?
    In the PR, I propose to partially revert the commit https://github.com/apache/spark/commit/51a6ba0181a013f2b62b47184785a8b6f6a78f12 and provide a legacy parser based on `FastDateFormat`, which is compatible with `SimpleDateFormat`.
    
    To enable the legacy parser, set `spark.sql.legacy.timeParser.enabled` to `true`.
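    
    For example, in a `spark-shell` session (an illustration based on the test added by this patch, not part of the patch itself):
    
    ```scala
    // Assumes a running spark-shell, where `spark` and its implicits are in scope.
    import org.apache.spark.sql.functions.{col, to_timestamp}
    import spark.implicits._
    
    spark.conf.set("spark.sql.legacy.timeParser.enabled", "true")
    
    // SimpleDateFormat's 'z' also accepts RFC 822 offsets such as "-0800"
    // when parsing, which the new java.time-based parser rejects here.
    val df = Seq("2020-01-27T20:06:11.847-0800").toDF("ts")
    df.select(to_timestamp(col("ts"), "yyyy-MM-dd'T'HH:mm:ss.SSSz")).show(false)
    // With the flag on the string parses; with it off the result is null
    // (per the test in the diff below).
    ```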
    
    ### Why are the changes needed?
    To allow users to restore the old behavior of parsing timestamps/dates with `SimpleDateFormat` patterns. The main reason for restoring it is that `DateTimeFormatter`'s patterns are not fully compatible with `SimpleDateFormat` patterns; see https://issues.apache.org/jira/browse/SPARK-30668
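    
    To make the incompatibility concrete, here is a standalone sketch (not part of the patch) comparing the two parsers on the pattern used by the new test below:
    
    ```scala
    import java.time.format.{DateTimeFormatter, DateTimeParseException}
    import java.util.Locale
    
    import org.apache.commons.lang3.time.FastDateFormat
    
    val pattern = "yyyy-MM-dd'T'HH:mm:ss.SSSz"
    val input = "2020-01-27T20:06:11.847-0800"
    
    // Legacy path: FastDateFormat follows SimpleDateFormat semantics, where the
    // general time-zone letter 'z' also accepts RFC 822 offsets like "-0800".
    val legacy = FastDateFormat.getInstance(pattern, Locale.US)
    println(legacy.parse(input).getTime) // epoch milliseconds
    
    // New path: java.time's 'z' expects a zone name such as "PST", so the same
    // input fails to parse; Spark surfaces such parse failures as null results.
    val modern = DateTimeFormatter.ofPattern(pattern, Locale.US)
    try modern.parse(input)
    catch { case e: DateTimeParseException => println(s"fails: ${e.getMessage}") }
    ```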
    
    ### Does this PR introduce any user-facing change?
    Yes
    
    ### How was this patch tested?
    - Added a new test to `DateFunctionsSuite`.
    - Restored additional test cases in `JsonInferSchemaSuite`.
    
    Closes #27441 from MaxGekk/support-simpledateformat.
    
    Authored-by: Maxim Gekk <ma...@gmail.com>
    Signed-off-by: Wenchen Fan <we...@databricks.com>
    (cherry picked from commit 459e757ed40fd1cdd37911d3f57b48b54ca2fff7)
    Signed-off-by: Wenchen Fan <we...@databricks.com>
---
 docs/sql-migration-guide.md                        |  4 +-
 .../spark/sql/catalyst/util/DateFormatter.scala    | 35 ++++++++--
 .../sql/catalyst/util/TimestampFormatter.scala     | 38 +++++++++--
 .../org/apache/spark/sql/internal/SQLConf.scala    | 10 +++
 .../sql/catalyst/json/JsonInferSchemaSuite.scala   | 79 +++++++++++++---------
 .../org/apache/spark/sql/DateFunctionsSuite.scala  | 14 ++++
 6 files changed, 136 insertions(+), 44 deletions(-)

diff --git a/docs/sql-migration-guide.md b/docs/sql-migration-guide.md
index 0c47370..5a5e802 100644
--- a/docs/sql-migration-guide.md
+++ b/docs/sql-migration-guide.md
@@ -67,9 +67,7 @@ license: |
 
   - Since Spark 3.0, Proleptic Gregorian calendar is used in parsing, formatting, and converting dates and timestamps as well as in extracting sub-components like years, days and etc. Spark 3.0 uses Java 8 API classes from the java.time packages that based on ISO chronology (https://docs.oracle.com/javase/8/docs/api/java/time/chrono/IsoChronology.html). In Spark version 2.4 and earlier, those operations are performed by using the hybrid calendar (Julian + Gregorian, see https://docs.orac [...]
 
-    - CSV/JSON datasources use java.time API for parsing and generating CSV/JSON content. In Spark version 2.4 and earlier, java.text.SimpleDateFormat is used for the same purpose with fallbacks to the parsing mechanisms of Spark 2.0 and 1.x. For example, `2018-12-08 10:39:21.123` with the pattern `yyyy-MM-dd'T'HH:mm:ss.SSS` cannot be parsed since Spark 3.0 because the timestamp does not match to the pattern but it can be parsed by earlier Spark versions due to a fallback to `Timestamp.v [...]
-
-    - The `unix_timestamp`, `date_format`, `to_unix_timestamp`, `from_unixtime`, `to_date`, `to_timestamp` functions. New implementation supports pattern formats as described here https://docs.oracle.com/javase/8/docs/api/java/time/format/DateTimeFormatter.html and performs strict checking of its input. For example, the `2015-07-22 10:00:00` timestamp cannot be parse if pattern is `yyyy-MM-dd` because the parser does not consume whole input. Another example is the `31/01/2015 00:00` inpu [...]
+    - Parsing/formatting of timestamp/date strings. This affects CSV/JSON datasources and the `unix_timestamp`, `date_format`, `to_unix_timestamp`, `from_unixtime`, `to_date`, `to_timestamp` functions when user-specified patterns are used for parsing and formatting. Since Spark 3.0, the conversions are based on `java.time.format.DateTimeFormatter`; see https://docs.oracle.com/javase/8/docs/api/java/time/format/DateTimeFormatter.html. The new implementation performs strict checking o [...]
 
     - The `weekofyear`, `weekday`, `dayofweek`, `date_trunc`, `from_utc_timestamp`, `to_utc_timestamp`, and `unix_timestamp` functions use java.time API for calculation week number of year, day number of week as well for conversion from/to TimestampType values in UTC time zone.
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateFormatter.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateFormatter.scala
index 7f982b0..28189b6 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateFormatter.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateFormatter.scala
@@ -20,7 +20,10 @@ package org.apache.spark.sql.catalyst.util
 import java.time.{LocalDate, ZoneId}
 import java.util.Locale
 
-import DateTimeUtils.{convertSpecialDate, localDateToDays}
+import org.apache.commons.lang3.time.FastDateFormat
+
+import org.apache.spark.sql.catalyst.util.DateTimeUtils.{convertSpecialDate, localDateToDays}
+import org.apache.spark.sql.internal.SQLConf
 
 sealed trait DateFormatter extends Serializable {
   def parse(s: String): Int // returns days since epoch
@@ -48,17 +51,41 @@ class Iso8601DateFormatter(
   }
 }
 
+class LegacyDateFormatter(pattern: String, locale: Locale) extends DateFormatter {
+  @transient
+  private lazy val format = FastDateFormat.getInstance(pattern, locale)
+
+  override def parse(s: String): Int = {
+    val milliseconds = format.parse(s).getTime
+    DateTimeUtils.millisToDays(milliseconds)
+  }
+
+  override def format(days: Int): String = {
+    val date = DateTimeUtils.toJavaDate(days)
+    format.format(date)
+  }
+}
+
 object DateFormatter {
-  val defaultPattern: String = "uuuu-MM-dd"
   val defaultLocale: Locale = Locale.US
 
   def apply(format: String, zoneId: ZoneId, locale: Locale): DateFormatter = {
-    new Iso8601DateFormatter(format, zoneId, locale)
+    if (SQLConf.get.legacyTimeParserEnabled) {
+      new LegacyDateFormatter(format, locale)
+    } else {
+      new Iso8601DateFormatter(format, zoneId, locale)
+    }
   }
 
   def apply(format: String, zoneId: ZoneId): DateFormatter = {
     apply(format, zoneId, defaultLocale)
   }
 
-  def apply(zoneId: ZoneId): DateFormatter = apply(defaultPattern, zoneId)
+  def apply(zoneId: ZoneId): DateFormatter = {
+    if (SQLConf.get.legacyTimeParserEnabled) {
+      new LegacyDateFormatter("yyyy-MM-dd", defaultLocale)
+    } else {
+      new Iso8601DateFormatter("uuuu-MM-dd", zoneId, defaultLocale)
+    }
+  }
 }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/TimestampFormatter.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/TimestampFormatter.scala
index 5be4807..fe1a4fe 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/TimestampFormatter.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/TimestampFormatter.scala
@@ -22,10 +22,14 @@ import java.time._
 import java.time.format.DateTimeParseException
 import java.time.temporal.ChronoField.MICRO_OF_SECOND
 import java.time.temporal.TemporalQueries
-import java.util.Locale
+import java.util.{Locale, TimeZone}
 import java.util.concurrent.TimeUnit.SECONDS
 
-import DateTimeUtils.convertSpecialTimestamp
+import org.apache.commons.lang3.time.FastDateFormat
+
+import org.apache.spark.sql.catalyst.util.DateTimeConstants.MICROS_PER_MILLIS
+import org.apache.spark.sql.catalyst.util.DateTimeUtils.convertSpecialTimestamp
+import org.apache.spark.sql.internal.SQLConf
 
 sealed trait TimestampFormatter extends Serializable {
   /**
@@ -86,12 +90,32 @@ class FractionTimestampFormatter(zoneId: ZoneId)
   override protected lazy val formatter = DateTimeFormatterHelper.fractionFormatter
 }
 
+class LegacyTimestampFormatter(
+    pattern: String,
+    zoneId: ZoneId,
+    locale: Locale) extends TimestampFormatter {
+
+  @transient private lazy val format =
+    FastDateFormat.getInstance(pattern, TimeZone.getTimeZone(zoneId), locale)
+
+  protected def toMillis(s: String): Long = format.parse(s).getTime
+
+  override def parse(s: String): Long = toMillis(s) * MICROS_PER_MILLIS
+
+  override def format(us: Long): String = {
+    format.format(DateTimeUtils.toJavaTimestamp(us))
+  }
+}
+
 object TimestampFormatter {
-  val defaultPattern: String = "uuuu-MM-dd HH:mm:ss"
   val defaultLocale: Locale = Locale.US
 
   def apply(format: String, zoneId: ZoneId, locale: Locale): TimestampFormatter = {
-    new Iso8601TimestampFormatter(format, zoneId, locale)
+    if (SQLConf.get.legacyTimeParserEnabled) {
+      new LegacyTimestampFormatter(format, zoneId, locale)
+    } else {
+      new Iso8601TimestampFormatter(format, zoneId, locale)
+    }
   }
 
   def apply(format: String, zoneId: ZoneId): TimestampFormatter = {
@@ -99,7 +123,11 @@ object TimestampFormatter {
   }
 
   def apply(zoneId: ZoneId): TimestampFormatter = {
-    apply(defaultPattern, zoneId, defaultLocale)
+    if (SQLConf.get.legacyTimeParserEnabled) {
+      new LegacyTimestampFormatter("yyyy-MM-dd HH:mm:ss", zoneId, defaultLocale)
+    } else {
+      new Iso8601TimestampFormatter("uuuu-MM-dd HH:mm:ss", zoneId, defaultLocale)
+    }
   }
 
   def getFractionFormatter(zoneId: ZoneId): TimestampFormatter = {
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
index 5ce5692..acc0922 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
@@ -2159,6 +2159,14 @@ object SQLConf {
       .checkValue(_ > 0, "The value of spark.sql.addPartitionInBatch.size must be positive")
       .createWithDefault(100)
 
+  val LEGACY_TIME_PARSER_ENABLED = buildConf("spark.sql.legacy.timeParser.enabled")
+    .internal()
+    .doc("When set to true, java.text.SimpleDateFormat is used for formatting and parsing " +
+      "dates/timestamps in a locale-sensitive manner. When set to false, classes from " +
+      "java.time.* packages are used for the same purpose.")
+    .booleanConf
+    .createWithDefault(false)
+
   /**
    * Holds information about keys that have been deprecated.
    *
@@ -2447,6 +2455,8 @@ class SQLConf extends Serializable with Logging {
   def legacyMsSqlServerNumericMappingEnabled: Boolean =
     getConf(LEGACY_MSSQLSERVER_NUMERIC_MAPPING_ENABLED)
 
+  def legacyTimeParserEnabled: Boolean = getConf(SQLConf.LEGACY_TIME_PARSER_ENABLED)
+
   /**
    * Returns the [[Resolver]] for the current configuration, which can be used to determine if two
    * identifiers are equal.
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/json/JsonInferSchemaSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/json/JsonInferSchemaSuite.scala
index a48e618..c2e03bd 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/json/JsonInferSchemaSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/json/JsonInferSchemaSuite.scala
@@ -17,10 +17,9 @@
 
 package org.apache.spark.sql.catalyst.json
 
-import com.fasterxml.jackson.core.JsonFactory
-
 import org.apache.spark.SparkFunSuite
 import org.apache.spark.sql.catalyst.plans.SQLHelper
+import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.types._
 
 class JsonInferSchemaSuite extends SparkFunSuite with SQLHelper {
@@ -41,45 +40,61 @@ class JsonInferSchemaSuite extends SparkFunSuite with SQLHelper {
   }
 
   test("inferring timestamp type") {
-    checkTimestampType("yyyy", """{"a": "2018"}""")
-    checkTimestampType("yyyy=MM", """{"a": "2018=12"}""")
-    checkTimestampType("yyyy MM dd", """{"a": "2018 12 02"}""")
-    checkTimestampType(
-      "yyyy-MM-dd'T'HH:mm:ss.SSS",
-      """{"a": "2018-12-02T21:04:00.123"}""")
-    checkTimestampType(
-      "yyyy-MM-dd'T'HH:mm:ss.SSSSSSXXX",
-      """{"a": "2018-12-02T21:04:00.123567+01:00"}""")
+    Seq(true, false).foreach { legacyParser =>
+      withSQLConf(SQLConf.LEGACY_TIME_PARSER_ENABLED.key -> legacyParser.toString) {
+        checkTimestampType("yyyy", """{"a": "2018"}""")
+        checkTimestampType("yyyy=MM", """{"a": "2018=12"}""")
+        checkTimestampType("yyyy MM dd", """{"a": "2018 12 02"}""")
+        checkTimestampType(
+          "yyyy-MM-dd'T'HH:mm:ss.SSS",
+          """{"a": "2018-12-02T21:04:00.123"}""")
+        checkTimestampType(
+          "yyyy-MM-dd'T'HH:mm:ss.SSSSSSXXX",
+          """{"a": "2018-12-02T21:04:00.123567+01:00"}""")
+      }
+    }
   }
 
   test("prefer decimals over timestamps") {
-    checkType(
-      options = Map(
-        "prefersDecimal" -> "true",
-        "timestampFormat" -> "yyyyMMdd.HHmmssSSS"
-      ),
-      json = """{"a": "20181202.210400123"}""",
-      dt = DecimalType(17, 9)
-    )
+    Seq(true, false).foreach { legacyParser =>
+      withSQLConf(SQLConf.LEGACY_TIME_PARSER_ENABLED.key -> legacyParser.toString) {
+        checkType(
+          options = Map(
+            "prefersDecimal" -> "true",
+            "timestampFormat" -> "yyyyMMdd.HHmmssSSS"
+          ),
+          json = """{"a": "20181202.210400123"}""",
+          dt = DecimalType(17, 9)
+        )
+      }
+    }
   }
 
   test("skip decimal type inferring") {
-    checkType(
-      options = Map(
-        "prefersDecimal" -> "false",
-        "timestampFormat" -> "yyyyMMdd.HHmmssSSS"
-      ),
-      json = """{"a": "20181202.210400123"}""",
-      dt = TimestampType
-    )
+    Seq(true, false).foreach { legacyParser =>
+      withSQLConf(SQLConf.LEGACY_TIME_PARSER_ENABLED.key -> legacyParser.toString) {
+        checkType(
+          options = Map(
+            "prefersDecimal" -> "false",
+            "timestampFormat" -> "yyyyMMdd.HHmmssSSS"
+          ),
+          json = """{"a": "20181202.210400123"}""",
+          dt = TimestampType
+        )
+      }
+    }
   }
 
   test("fallback to string type") {
-    checkType(
-      options = Map("timestampFormat" -> "yyyy,MM,dd.HHmmssSSS"),
-      json = """{"a": "20181202.210400123"}""",
-      dt = StringType
-    )
+    Seq(true, false).foreach { legacyParser =>
+      withSQLConf(SQLConf.LEGACY_TIME_PARSER_ENABLED.key -> legacyParser.toString) {
+        checkType(
+          options = Map("timestampFormat" -> "yyyy,MM,dd.HHmmssSSS"),
+          json = """{"a": "20181202.210400123"}""",
+          dt = StringType
+        )
+      }
+    }
   }
 
   test("disable timestamp inferring") {
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala
index d7d8c2c..3b3d3cc 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DateFunctionsSuite.scala
@@ -789,4 +789,18 @@ class DateFunctionsSuite extends QueryTest with SharedSparkSession {
         Row(Timestamp.valueOf("2015-07-24 07:00:00")),
         Row(Timestamp.valueOf("2015-07-24 22:00:00"))))
   }
+
+  test("SPARK-30668: use legacy timestamp parser in to_timestamp") {
+    def checkTimeZoneParsing(expected: Any): Unit = {
+      val df = Seq("2020-01-27T20:06:11.847-0800").toDF("ts")
+      checkAnswer(df.select(to_timestamp(col("ts"), "yyyy-MM-dd'T'HH:mm:ss.SSSz")),
+        Row(expected))
+    }
+    withSQLConf(SQLConf.LEGACY_TIME_PARSER_ENABLED.key -> "true") {
+      checkTimeZoneParsing(Timestamp.valueOf("2020-01-27 20:06:11.847"))
+    }
+    withSQLConf(SQLConf.LEGACY_TIME_PARSER_ENABLED.key -> "false") {
+      checkTimeZoneParsing(null)
+    }
+  }
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org