You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by we...@apache.org on 2020/03/05 12:50:41 UTC

[spark] branch master updated: [SPARK-31005][SQL] Support time zone ids in casting strings to timestamps

This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new 1fd9a91  [SPARK-31005][SQL] Support time zone ids in casting strings to timestamps
1fd9a91 is described below

commit 1fd9a91c662a368e348ef96604a79929f814041c
Author: Maxim Gekk <ma...@gmail.com>
AuthorDate: Thu Mar 5 20:49:43 2020 +0800

    [SPARK-31005][SQL] Support time zone ids in casting strings to timestamps
    
    ### What changes were proposed in this pull request?
    In the PR, I propose to change `DateTimeUtils.stringToTimestamp` to support any valid time zone id at the end of input string. After the changes, the function accepts zone ids in the formats:
    - no zone id. In that case, the function uses the local session time zone from the SQL config `spark.sql.session.timeZone`
    - -[h]h:[m]m
    - +[h]h:[m]m
    - Z
    - Short zone id, see https://docs.oracle.com/javase/8/docs/api/java/time/ZoneId.html#SHORT_IDS
    - Zone ID starts with 'UTC+', 'UTC-', 'GMT+', 'GMT-', 'UT+' or 'UT-'. The ID is split in two, with a two or three letter prefix and a suffix starting with the sign. The suffix must be in the formats:
      - +|-h[h]
      - +|-hh[:]mm
      - +|-hh:mm:ss
      - +|-hhmmss
    - Region-based zone IDs in the form `{area}/{city}`, such as `Europe/Paris` or `America/New_York`. The default set of region ids is supplied by the IANA Time Zone Database (TZDB).
    
    ### Why are the changes needed?
    - To use `stringToTimestamp` as a substitution of removed `stringToTime`, see https://github.com/apache/spark/pull/27710#discussion_r385020173
    - Improve UX of Spark SQL by allowing flexible formats of zone ids. Currently, Spark accepts only `Z` and zone offsets that can be inconvenient when a time zone offset is shifted due to daylight saving rules. For instance:
    ```sql
    spark-sql> select cast('2015-03-18T12:03:17.123456 Europe/Moscow' as timestamp);
    NULL
    ```
    
    ### Does this PR introduce any user-facing change?
    Yes. After the changes, casting strings to timestamps allows time zone id at the end of the strings:
    ```sql
    spark-sql> select cast('2015-03-18T12:03:17.123456 Europe/Moscow' as timestamp);
    2015-03-18 12:03:17.123456
    ```
    
    ### How was this patch tested?
    - Added new test cases to the `string to timestamp` test in `DateTimeUtilsSuite`.
    - Run `CastSuite` and `AnsiCastSuite`.
    
    Closes #27753 from MaxGekk/stringToTimestamp-uni-zoneId.
    
    Authored-by: Maxim Gekk <ma...@gmail.com>
    Signed-off-by: Wenchen Fan <we...@databricks.com>
---
 .../spark/sql/catalyst/util/DateTimeUtils.scala    | 57 +++++++++++-----------
 .../sql/catalyst/util/DateTimeUtilsSuite.scala     | 23 +++++++--
 2 files changed, 48 insertions(+), 32 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
index 6b252ec..3a038a4 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
@@ -185,28 +185,28 @@ object DateTimeUtils {
    * `yyyy-[m]m`
    * `yyyy-[m]m-[d]d`
    * `yyyy-[m]m-[d]d `
-   * `yyyy-[m]m-[d]d [h]h:[m]m:[s]s.[ms][ms][ms][us][us][us]`
-   * `yyyy-[m]m-[d]d [h]h:[m]m:[s]s.[ms][ms][ms][us][us][us]Z`
-   * `yyyy-[m]m-[d]d [h]h:[m]m:[s]s.[ms][ms][ms][us][us][us]-[h]h:[m]m`
-   * `yyyy-[m]m-[d]d [h]h:[m]m:[s]s.[ms][ms][ms][us][us][us]+[h]h:[m]m`
-   * `yyyy-[m]m-[d]dT[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us]`
-   * `yyyy-[m]m-[d]dT[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us]Z`
-   * `yyyy-[m]m-[d]dT[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us]-[h]h:[m]m`
-   * `yyyy-[m]m-[d]dT[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us]+[h]h:[m]m`
-   * `[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us]`
-   * `[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us]Z`
-   * `[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us]-[h]h:[m]m`
-   * `[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us]+[h]h:[m]m`
-   * `T[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us]`
-   * `T[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us]Z`
-   * `T[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us]-[h]h:[m]m`
-   * `T[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us]+[h]h:[m]m`
+   * `yyyy-[m]m-[d]d [h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]`
+   * `yyyy-[m]m-[d]dT[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]`
+   * `[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]`
+   * `T[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]`
+   *
+   * where `zone_id` should have one of the forms:
+   *   - Z - Zulu time zone UTC+0
+   *   - +|-[h]h:[m]m
+   *   - A short id, see https://docs.oracle.com/javase/8/docs/api/java/time/ZoneId.html#SHORT_IDS
+   *   - An id with one of the prefixes UTC+, UTC-, GMT+, GMT-, UT+ or UT-,
+   *     and a suffix in the formats:
+   *     - +|-h[h]
+   *     - +|-hh[:]mm
+   *     - +|-hh:mm:ss
+   *     - +|-hhmmss
+   *  - Region-based zone IDs in the form `area/city`, such as `Europe/Paris`
    */
   def stringToTimestamp(s: UTF8String, timeZoneId: ZoneId): Option[SQLTimestamp] = {
     if (s == null) {
       return None
     }
-    var tz: Option[Byte] = None
+    var tz: Option[String] = None
     val segments: Array[Int] = Array[Int](1, 1, 1, 0, 0, 0, 0, 0, 0)
     var i = 0
     var currentSegmentValue = 0
@@ -257,22 +257,21 @@ object DateTimeUtils {
             return None
           }
         } else if (i == 5 || i == 6) {
-          if (b == 'Z') {
+          if (b == '-' || b == '+') {
             segments(i) = currentSegmentValue
             currentSegmentValue = 0
             i += 1
-            tz = Some(43)
-          } else if (b == '-' || b == '+') {
+            tz = Some(new String(bytes, j, 1))
+          } else if (b == '.' && i == 5) {
             segments(i) = currentSegmentValue
             currentSegmentValue = 0
             i += 1
-            tz = Some(b)
-          } else if (b == '.' && i == 5) {
+          } else {
             segments(i) = currentSegmentValue
             currentSegmentValue = 0
             i += 1
-          } else {
-            return None
+            tz = Some(new String(bytes, j, bytes.length - j))
+            j = bytes.length - 1
           }
           if (i == 6  && b != '.') {
             i += 1
@@ -312,11 +311,11 @@ object DateTimeUtils {
       digitsMilli -= 1
     }
     try {
-      val zoneId = if (tz.isEmpty) {
-        timeZoneId
-      } else {
-        val sign = if (tz.get.toChar == '-') -1 else 1
-        ZoneOffset.ofHoursMinutes(sign * segments(7), sign * segments(8))
+      val zoneId = tz match {
+        case None => timeZoneId
+        case Some("+") => ZoneOffset.ofHoursMinutes(segments(7), segments(8))
+        case Some("-") => ZoneOffset.ofHoursMinutes(-segments(7), -segments(8))
+        case Some(zoneName: String) => getZoneId(zoneName.trim)
       }
       val nanoseconds = MICROSECONDS.toNanos(segments(6))
       val localTime = LocalTime.of(segments(3), segments(4), segments(5), nanoseconds.toInt)
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala
index 1465b06..6dde3d2 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala
@@ -183,24 +183,29 @@ class DateTimeUtilsSuite extends SparkFunSuite with Matchers with SQLHelper {
       var zoneId = getZoneId("GMT-13:53")
       expected = Option(date(2015, 3, 18, 12, 3, 17, zid = zoneId))
       checkStringToTimestamp("2015-03-18T12:03:17-13:53", expected)
+      checkStringToTimestamp("2015-03-18T12:03:17GMT-13:53", expected)
 
       zoneId = getZoneId("UTC")
       expected = Option(date(2015, 3, 18, 12, 3, 17, zid = zoneId))
       checkStringToTimestamp("2015-03-18T12:03:17Z", expected)
       checkStringToTimestamp("2015-03-18 12:03:17Z", expected)
+      checkStringToTimestamp("2015-03-18 12:03:17UTC", expected)
 
       zoneId = getZoneId("GMT-01:00")
       expected = Option(date(2015, 3, 18, 12, 3, 17, zid = zoneId))
       checkStringToTimestamp("2015-03-18T12:03:17-1:0", expected)
       checkStringToTimestamp("2015-03-18T12:03:17-01:00", expected)
+      checkStringToTimestamp("2015-03-18T12:03:17GMT-01:00", expected)
 
       zoneId = getZoneId("GMT+07:30")
       expected = Option(date(2015, 3, 18, 12, 3, 17, zid = zoneId))
       checkStringToTimestamp("2015-03-18T12:03:17+07:30", expected)
+      checkStringToTimestamp("2015-03-18T12:03:17 GMT+07:30", expected)
 
       zoneId = getZoneId("GMT+07:03")
       expected = Option(date(2015, 3, 18, 12, 3, 17, zid = zoneId))
       checkStringToTimestamp("2015-03-18T12:03:17+07:03", expected)
+      checkStringToTimestamp("2015-03-18T12:03:17GMT+07:03", expected)
 
       // tests for the string including milliseconds.
       expected = Option(date(2015, 3, 18, 12, 3, 17, 123000, zid = zid))
@@ -213,27 +218,32 @@ class DateTimeUtilsSuite extends SparkFunSuite with Matchers with SQLHelper {
       expected = Option(date(2015, 3, 18, 12, 3, 17, 456000, zid = zoneId))
       checkStringToTimestamp("2015-03-18T12:03:17.456Z", expected)
       checkStringToTimestamp("2015-03-18 12:03:17.456Z", expected)
+      checkStringToTimestamp("2015-03-18 12:03:17.456 UTC", expected)
 
       zoneId = getZoneId("GMT-01:00")
       expected = Option(date(2015, 3, 18, 12, 3, 17, 123000, zid = zoneId))
       checkStringToTimestamp("2015-03-18T12:03:17.123-1:0", expected)
       checkStringToTimestamp("2015-03-18T12:03:17.123-01:00", expected)
+      checkStringToTimestamp("2015-03-18T12:03:17.123 GMT-01:00", expected)
 
       zoneId = getZoneId("GMT+07:30")
       expected = Option(date(2015, 3, 18, 12, 3, 17, 123000, zid = zoneId))
       checkStringToTimestamp("2015-03-18T12:03:17.123+07:30", expected)
+      checkStringToTimestamp("2015-03-18T12:03:17.123 GMT+07:30", expected)
 
       zoneId = getZoneId("GMT+07:30")
       expected = Option(date(2015, 3, 18, 12, 3, 17, 123000, zid = zoneId))
       checkStringToTimestamp("2015-03-18T12:03:17.123+07:30", expected)
+      checkStringToTimestamp("2015-03-18T12:03:17.123GMT+07:30", expected)
 
-      zoneId = getZoneId("GMT+07:30")
       expected = Option(date(2015, 3, 18, 12, 3, 17, 123121, zid = zoneId))
       checkStringToTimestamp("2015-03-18T12:03:17.123121+7:30", expected)
+      checkStringToTimestamp("2015-03-18T12:03:17.123121 GMT+0730", expected)
 
       zoneId = getZoneId("GMT+07:30")
       expected = Option(date(2015, 3, 18, 12, 3, 17, 123120, zid = zoneId))
       checkStringToTimestamp("2015-03-18T12:03:17.12312+7:30", expected)
+      checkStringToTimestamp("2015-03-18T12:03:17.12312 UT+07:30", expected)
 
       expected = Option(time(18, 12, 15, zid = zid))
       checkStringToTimestamp("18:12:15", expected)
@@ -241,10 +251,12 @@ class DateTimeUtilsSuite extends SparkFunSuite with Matchers with SQLHelper {
       zoneId = getZoneId("GMT+07:30")
       expected = Option(time(18, 12, 15, 123120, zid = zoneId))
       checkStringToTimestamp("T18:12:15.12312+7:30", expected)
+      checkStringToTimestamp("T18:12:15.12312 UTC+07:30", expected)
 
       zoneId = getZoneId("GMT+07:30")
       expected = Option(time(18, 12, 15, 123120, zid = zoneId))
       checkStringToTimestamp("18:12:15.12312+7:30", expected)
+      checkStringToTimestamp("18:12:15.12312 GMT+07:30", expected)
 
       expected = Option(date(2011, 5, 6, 7, 8, 9, 100000, zid = zid))
       checkStringToTimestamp("2011-05-06 07:08:09.1000", expected)
@@ -270,8 +282,13 @@ class DateTimeUtilsSuite extends SparkFunSuite with Matchers with SQLHelper {
       // Truncating the fractional seconds
       zoneId = getZoneId("GMT+00:00")
       expected = Option(date(2015, 3, 18, 12, 3, 17, 123456, zid = zoneId))
-      checkStringToTimestamp(
-        "2015-03-18T12:03:17.123456789+0:00", expected)
+      checkStringToTimestamp("2015-03-18T12:03:17.123456789+0:00", expected)
+      checkStringToTimestamp("2015-03-18T12:03:17.123456789 UTC+0", expected)
+      checkStringToTimestamp("2015-03-18T12:03:17.123456789GMT+00:00", expected)
+
+      zoneId = getZoneId("Europe/Moscow")
+      expected = Option(date(2015, 3, 18, 12, 3, 17, 123456, zid = zoneId))
+      checkStringToTimestamp("2015-03-18T12:03:17.123456 Europe/Moscow", expected)
     }
   }
 


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org