You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by we...@apache.org on 2020/06/16 06:02:53 UTC

[spark] branch branch-3.0 updated: [SPARK-31986][SQL] Fix Julian-Gregorian micros rebasing of overlapping local timestamps

This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.0 by this push:
     new 6367d12  [SPARK-31986][SQL] Fix Julian-Gregorian micros rebasing of overlapping local timestamps
6367d12 is described below

commit 6367d129e7e17d4baf7240fdc2b3f52f3c097b8f
Author: Max Gekk <ma...@gmail.com>
AuthorDate: Tue Jun 16 06:00:05 2020 +0000

    [SPARK-31986][SQL] Fix Julian-Gregorian micros rebasing of overlapping local timestamps
    
    ### What changes were proposed in this pull request?
    It fixes microseconds rebasing from the hybrid calendar (Julian + Gregorian) to Proleptic Gregorian calendar in the function `RebaseDateTime`.`rebaseJulianToGregorianMicros(zoneId: ZoneId, micros: Long): Long` in the case of local timestamp overlapping.
    
    In the case of overlapping, we look ahead of 1 day to determinate which instant we should take - earlier or later zoned timestamp. If our current standard zone and DST offsets are equal to zone offset of the next date, we choose the later timestamp otherwise the earlier one. For example, the local timestamp **1945-11-18 01:30:00.0** can be mapped to two instants (microseconds since 1970-01-01 00:00:00Z): -761211000000000 or -761207400000000. If the first one is passed to `rebaseJulian [...]
    
    Note: The fix assumes that only one transition of standard or DST offsets can occur during a day.
    
    ### Why are the changes needed?
    Current implementation of `rebaseJulianToGregorianMicros()` handles timestamps overlapping only during daylight saving time but overlapping can happen also during transition from one standard time zone to another one. For example in the case of `Asia/Hong_Kong`, the time zone switched from `Japan Standard Time` (UTC+9) to `Hong Kong Time` (UTC+8) on _Sunday, 18 November, 1945 01:59:59 AM_. The changes allow to handle the special case as well.
    
    ### Does this PR introduce _any_ user-facing change?
    There is no behaviour change for timestamps of CE after 0001-01-01. The PR might affects timestamps of BCE for which the modified `rebaseJulianToGregorianMicros()` is called directly.
    
    ### How was this patch tested?
    
    1. By existing tests in `DateTimeUtilsSuite`, `RebaseDateTimeSuite`, `DateFunctionsSuite`, `DateExpressionsSuite` and `TimestampFormatterSuite`.
    
    2. Added new checks to `RebaseDateTimeSuite`.`SPARK-31959: JST -> HKT at Asia/Hong_Kong in 1945`:
    ```scala
          assert(rebaseJulianToGregorianMicros(hkZid, rebasedEarlierMicros) === earlierMicros)
          assert(rebaseJulianToGregorianMicros(hkZid, rebasedLaterMicros) === laterMicros)
    ```
    
    3. Regenerated `julian-gregorian-rebase-micros.json` with the step of 30 minutes, and got the same JSON file. The JSON file isn't affected because previously it was generated with the step of 1 week. And the spike in diffs/switch points during 1 hour of timestamp overlapping wasn't detected.
    
    Closes #28816 from MaxGekk/fix-overlap-julian-2-grep.
    
    Authored-by: Max Gekk <ma...@gmail.com>
    Signed-off-by: Wenchen Fan <we...@databricks.com>
    (cherry picked from commit e9145d41f3eae53dcee7d298ee1ae9d065645019)
    Signed-off-by: Wenchen Fan <we...@databricks.com>
---
 .../spark/sql/catalyst/util/RebaseDateTime.scala   | 57 ++++++++++++----------
 .../sql/catalyst/util/RebaseDateTimeSuite.scala    |  3 ++
 2 files changed, 34 insertions(+), 26 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/RebaseDateTime.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/RebaseDateTime.scala
index 92d76c8..f93878d 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/RebaseDateTime.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/RebaseDateTime.scala
@@ -20,6 +20,7 @@ package org.apache.spark.sql.catalyst.util
 import java.time.{LocalDate, LocalDateTime, LocalTime, ZoneId}
 import java.time.temporal.ChronoField
 import java.util.{Calendar, TimeZone}
+import java.util.Calendar.{DAY_OF_MONTH, DST_OFFSET, ERA, HOUR_OF_DAY, MINUTE, MONTH, SECOND, YEAR, ZONE_OFFSET}
 
 import scala.collection.mutable.AnyRefMap
 
@@ -102,15 +103,15 @@ object RebaseDateTime {
       .setInstant(Math.multiplyExact(days, MILLIS_PER_DAY))
       .build()
     val localDate = LocalDate.of(
-      utcCal.get(Calendar.YEAR),
-      utcCal.get(Calendar.MONTH) + 1,
+      utcCal.get(YEAR),
+      utcCal.get(MONTH) + 1,
       // The number of days will be added later to handle non-existing
       // Julian dates in Proleptic Gregorian calendar.
       // For example, 1000-02-29 exists in Julian calendar because 1000
       // is a leap year but it is not a leap year in Gregorian calendar.
       1)
-      .`with`(ChronoField.ERA, utcCal.get(Calendar.ERA))
-      .plusDays(utcCal.get(Calendar.DAY_OF_MONTH) - 1)
+      .`with`(ChronoField.ERA, utcCal.get(ERA))
+      .plusDays(utcCal.get(DAY_OF_MONTH) - 1)
     Math.toIntExact(localDate.toEpochDay)
   }
 
@@ -350,9 +351,9 @@ object RebaseDateTime {
       // If so, we will take zone offsets from the previous day otherwise from the next day.
       // This assumes that transitions cannot happen often than once per 2 days.
       val shift = if (trans.getOffsetBefore == zonedDateTime.getOffset) -1 else 1
-      cloned.add(Calendar.DAY_OF_MONTH, shift)
-      cal.set(Calendar.ZONE_OFFSET, cloned.get(Calendar.ZONE_OFFSET))
-      cal.set(Calendar.DST_OFFSET, cloned.get(Calendar.DST_OFFSET))
+      cloned.add(DAY_OF_MONTH, shift)
+      cal.set(ZONE_OFFSET, cloned.get(ZONE_OFFSET))
+      cal.set(DST_OFFSET, cloned.get(DST_OFFSET))
     }
     fromMillis(cal.getTimeInMillis) + ldt.get(ChronoField.MICRO_OF_SECOND)
   }
@@ -413,32 +414,36 @@ object RebaseDateTime {
       .setInstant(toMillis(micros))
       .build()
     val localDateTime = LocalDateTime.of(
-      cal.get(Calendar.YEAR),
-      cal.get(Calendar.MONTH) + 1,
+      cal.get(YEAR),
+      cal.get(MONTH) + 1,
       // The number of days will be added later to handle non-existing
       // Julian dates in Proleptic Gregorian calendar.
       // For example, 1000-02-29 exists in Julian calendar because 1000
       // is a leap year but it is not a leap year in Gregorian calendar.
       1,
-      cal.get(Calendar.HOUR_OF_DAY),
-      cal.get(Calendar.MINUTE),
-      cal.get(Calendar.SECOND),
+      cal.get(HOUR_OF_DAY),
+      cal.get(MINUTE),
+      cal.get(SECOND),
       (Math.floorMod(micros, MICROS_PER_SECOND) * NANOS_PER_MICROS).toInt)
-      .`with`(ChronoField.ERA, cal.get(Calendar.ERA))
-      .plusDays(cal.get(Calendar.DAY_OF_MONTH) - 1)
+      .`with`(ChronoField.ERA, cal.get(ERA))
+      .plusDays(cal.get(DAY_OF_MONTH) - 1)
     val zonedDateTime = localDateTime.atZone(zoneId)
-    // Assuming the daylight saving switchover time is 2:00, the local clock will go back to
-    // 2:00 after hitting 2:59. This means the local time between [2:00, 3:00) appears twice, and
-    // can map to two different physical times (seconds from the UTC epoch).
-    // Java 8 time API resolves the ambiguity by picking the earlier physical time. This is the same
-    // as Java 7 time API, except for 2:00 where Java 7 picks the later physical time.
-    // Here we detect the "2:00" case and pick the latter physical time, to be compatible with the
-    // Java 7 date-time.
-    val adjustedZdt = if (cal.get(Calendar.DST_OFFSET) == 0) {
-      zonedDateTime.withLaterOffsetAtOverlap()
-    } else {
-      zonedDateTime
-    }
+    // In the case of local timestamp overlapping, we need to choose the correct time instant
+    // which is related to the original local timestamp. We look ahead of 1 day, and if the next
+    // date has the same standard zone and DST offsets, the current local timestamp belongs to
+    // the period after the transition. In that case, we take the later zoned date time otherwise
+    // earlier one. Here, we assume that transitions happen not often than once per day.
+    val trans = zoneId.getRules.getTransition(localDateTime)
+    val adjustedZdt = if (trans != null && trans.isOverlap) {
+      val dstOffset = cal.get(DST_OFFSET)
+      val zoneOffset = cal.get(ZONE_OFFSET)
+      cal.add(DAY_OF_MONTH, 1)
+      if (zoneOffset == cal.get(ZONE_OFFSET) && dstOffset == cal.get(DST_OFFSET)) {
+        zonedDateTime.withLaterOffsetAtOverlap()
+      } else {
+        zonedDateTime.withEarlierOffsetAtOverlap()
+      }
+    } else zonedDateTime
     instantToMicros(adjustedZdt.toInstant)
   }
 
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/RebaseDateTimeSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/RebaseDateTimeSuite.scala
index 6ecdb05..971bad1 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/RebaseDateTimeSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/RebaseDateTimeSuite.scala
@@ -445,6 +445,9 @@ class RebaseDateTimeSuite extends SparkFunSuite with Matchers with SQLHelper {
       // Check reverse rebasing
       assert(rebaseJulianToGregorianMicros(rebasedEarlierMicros) === earlierMicros)
       assert(rebaseJulianToGregorianMicros(rebasedLaterMicros) === laterMicros)
+      // Check reverse not-optimized rebasing
+      assert(rebaseJulianToGregorianMicros(hkZid, rebasedEarlierMicros) === earlierMicros)
+      assert(rebaseJulianToGregorianMicros(hkZid, rebasedLaterMicros) === laterMicros)
     }
   }
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org