Posted to commits@spark.apache.org by ma...@apache.org on 2023/03/03 06:39:03 UTC

[spark] branch master updated: [SPARK-42635][SQL] Fix the TimestampAdd expression

This is an automated email from the ASF dual-hosted git repository.

maxgekk pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new 392bdc15958 [SPARK-42635][SQL] Fix the TimestampAdd expression
392bdc15958 is described below

commit 392bdc1595879ea03c90e3f4b550aa5ce7b32bdf
Author: Chenhao Li <ch...@databricks.com>
AuthorDate: Fri Mar 3 09:38:44 2023 +0300

    [SPARK-42635][SQL] Fix the TimestampAdd expression
    
    ### What changes were proposed in this pull request?
    This PR fixes the counter-intuitive behaviors of the `TimestampAdd` expression described in https://issues.apache.org/jira/browse/SPARK-42635. See the *user-facing* changes below for details.
    
    ### Does this PR introduce _any_ user-facing change?
    
    Yes. This PR fixes the three problems mentioned in SPARK-42635:
    
    1. When the input timestamp is close to a daylight saving time transition, the result may be discontinuous and non-monotonic.
    2. Adding months, quarters, or years silently ignores `Int` overflow during unit conversion.
    3. Adding sub-month units (week, day, hour, minute, second, millisecond, microsecond) silently ignores `Long` overflow during unit conversion.
    
    Some examples of the result changes:
    
    Old results:
    
    ```
    // In America/Los_Angeles timezone:
    timestampadd(DAY, 1, 2011-03-12 03:00:00) = 2011-03-13 03:00:00 (this is correct; included here for comparison)
    timestampadd(HOUR, 23, 2011-03-12 03:00:00) = 2011-03-13 03:00:00
    timestampadd(HOUR, 24, 2011-03-12 03:00:00) = 2011-03-13 03:00:00
    timestampadd(SECOND, 86400 - 1, 2011-03-12 03:00:00) = 2011-03-13 03:59:59
    timestampadd(SECOND, 86400, 2011-03-12 03:00:00) = 2011-03-13 03:00:00
    // In UTC timezone:
    timestampadd(quarter, 1431655764, 1970-01-01 00:00:00) = 1969-09-01 00:00:00
    timestampadd(day, 106751992, 1970-01-01 00:00:00) = -290308-12-22 15:58:10.448384
    ```
    
    New results:
    
    ```
    // In America/Los_Angeles timezone:
    timestampadd(DAY, 1, 2011-03-12 03:00:00) = 2011-03-13 03:00:00
    timestampadd(HOUR, 23, 2011-03-12 03:00:00) = 2011-03-13 03:00:00
    timestampadd(HOUR, 24, 2011-03-12 03:00:00) = 2011-03-13 04:00:00
    timestampadd(SECOND, 86400 - 1, 2011-03-12 03:00:00) = 2011-03-13 03:59:59
    timestampadd(SECOND, 86400, 2011-03-12 03:00:00) = 2011-03-13 04:00:00
    // In UTC timezone:
    timestampadd(quarter, 1431655764, 1970-01-01 00:00:00) = throw overflow exception
    timestampadd(day, 106751992, 1970-01-01 00:00:00) = throw overflow exception
    ```
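
    Because a calendar day is not always 24 physical hours (2011-03-13 has only
    23 hours in America/Los_Angeles), the DAY and HOUR results above diverge by
    design. A minimal standalone sketch of that distinction with plain
    `java.time` rather than the Spark helpers (the object and value names here
    are illustrative only):
    
    ```
    import java.time.{Instant, ZoneId}
    
    object DayVsHoursSketch {
      // 2011-03-12 03:00:00 America/Los_Angeles == 2011-03-12 11:00:00 UTC (PST, UTC-8).
      private val LA = ZoneId.of("America/Los_Angeles")
      private val start = Instant.parse("2011-03-12T11:00:00Z")
    
      def main(args: Array[String]): Unit = {
        // Calendar-day addition keeps the wall-clock time: 2011-03-13 03:00 PDT.
        println(start.atZone(LA).plusDays(1))
        // Fixed-duration addition moves 24 physical hours: 2011-03-13 04:00 PDT.
        println(start.atZone(LA).plusHours(24))
      }
    }
    ```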
    
    ### How was this patch tested?
    
    Passes existing tests and new tests added in this PR.
    
    Closes #40237 from chenhao-db/SPARK-42635.
    
    Authored-by: Chenhao Li <ch...@databricks.com>
    Signed-off-by: Max Gekk <ma...@gmail.com>
---
 .../spark/sql/catalyst/util/DateTimeUtils.scala    | 22 +++--
 .../expressions/DateExpressionsSuite.scala         | 98 +++++++++++++++++++++-
 2 files changed, 110 insertions(+), 10 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
index af0666a98fa..2ae11f6568a 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
@@ -1227,25 +1227,29 @@ object DateTimeUtils {
     try {
       unit.toUpperCase(Locale.ROOT) match {
         case "MICROSECOND" =>
-          timestampAddDayTime(micros, quantity, zoneId)
+          timestampAddInterval(micros, 0, 0, quantity, zoneId)
         case "MILLISECOND" =>
-          timestampAddDayTime(micros, quantity * MICROS_PER_MILLIS, zoneId)
+          timestampAddInterval(micros, 0, 0,
+            Math.multiplyExact(quantity.toLong, MICROS_PER_MILLIS), zoneId)
         case "SECOND" =>
-          timestampAddDayTime(micros, quantity * MICROS_PER_SECOND, zoneId)
+          timestampAddInterval(micros, 0, 0,
+            Math.multiplyExact(quantity.toLong, MICROS_PER_SECOND), zoneId)
         case "MINUTE" =>
-          timestampAddDayTime(micros, quantity * MICROS_PER_MINUTE, zoneId)
+          timestampAddInterval(micros, 0, 0,
+            Math.multiplyExact(quantity.toLong, MICROS_PER_MINUTE), zoneId)
         case "HOUR" =>
-          timestampAddDayTime(micros, quantity * MICROS_PER_HOUR, zoneId)
+          timestampAddInterval(micros, 0, 0,
+            Math.multiplyExact(quantity.toLong, MICROS_PER_HOUR), zoneId)
         case "DAY" | "DAYOFYEAR" =>
-          timestampAddDayTime(micros, quantity * MICROS_PER_DAY, zoneId)
+          timestampAddInterval(micros, 0, quantity, 0, zoneId)
         case "WEEK" =>
-          timestampAddDayTime(micros, quantity * MICROS_PER_DAY * DAYS_PER_WEEK, zoneId)
+          timestampAddInterval(micros, 0, Math.multiplyExact(quantity, DAYS_PER_WEEK), 0, zoneId)
         case "MONTH" =>
           timestampAddMonths(micros, quantity, zoneId)
         case "QUARTER" =>
-          timestampAddMonths(micros, quantity * 3, zoneId)
+          timestampAddMonths(micros, Math.multiplyExact(quantity, 3), zoneId)
         case "YEAR" =>
-          timestampAddMonths(micros, quantity * MONTHS_PER_YEAR, zoneId)
+          timestampAddMonths(micros, Math.multiplyExact(quantity, MONTHS_PER_YEAR), zoneId)
       }
     } catch {
       case _: scala.MatchError =>
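
The pattern in the hunk above is uniform: sub-day units convert the quantity with `Math.multiplyExact`, which throws `ArithmeticException` on `Long` overflow instead of wrapping, while DAY and WEEK now go through the days component of `timestampAddInterval` so calendar-day semantics apply. A rough standalone sketch of the overflow half (the constant is inlined here only to keep the snippet self-contained):

```
object OverflowSketch {
  // Microseconds per day: 24 * 60 * 60 * 1000 * 1000.
  private val MICROS_PER_DAY: Long = 86400L * 1000 * 1000

  def main(args: Array[String]): Unit = {
    val quantity = 106751992 // the overflowing DAY example from the description
    // Old behavior: plain multiplication silently wraps to a negative Long.
    println(quantity * MICROS_PER_DAY)
    // New behavior: exact multiplication surfaces the overflow, which Spark
    // reports as a DATETIME_OVERFLOW error (see the test below).
    try Math.multiplyExact(quantity.toLong, MICROS_PER_DAY)
    catch { case e: ArithmeticException => println(s"overflow: ${e.getMessage}") }
  }
}
```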
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala
index e39d1518990..d2010102690 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/DateExpressionsSuite.scala
@@ -28,7 +28,7 @@ import scala.language.postfixOps
 import scala.reflect.ClassTag
 import scala.util.Random
 
-import org.apache.spark.{SparkDateTimeException, SparkFunSuite, SparkUpgradeException}
+import org.apache.spark.{SparkArithmeticException, SparkDateTimeException, SparkFunSuite, SparkUpgradeException}
 import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow}
 import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeProjection
 import org.apache.spark.sql.catalyst.util.{DateTimeUtils, IntervalUtils, TimestampFormatter}
@@ -1961,6 +1961,102 @@ class DateExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
     }
   }
 
+  test("SPARK-42635: timestampadd near daylight saving transition") {
+    // In America/Los_Angeles timezone, timestamp value `skippedTime` is 2011-03-13 03:00:00.
+    // The next second of 2011-03-13 01:59:59 jumps to 2011-03-13 03:00:00.
+    val skippedTime = 1300010400000000L
+    // In America/Los_Angeles timezone, both timestamp range `[repeatedTime - MICROS_PER_HOUR,
+    // repeatedTime)` and `[repeatedTime, repeatedTime + MICROS_PER_HOUR)` map to
+    // [2011-11-06 01:00:00, 2011-11-06 02:00:00).
+    // The next second of 2011-11-06 01:59:59 (pre-transition) jumps back to 2011-11-06 01:00:00.
+    val repeatedTime = 1320570000000000L
+    withSQLConf(SQLConf.SESSION_LOCAL_TIMEZONE.key -> LA.getId) {
+      // Adding one day is **not** equivalent to adding <unit>_PER_DAY time units, because not every
+      // day has 24 hours: 2011-03-13 has 23 hours, 2011-11-06 has 25 hours.
+
+      // timestampadd(DAY, 1, 2011-03-12 03:00:00) = 2011-03-13 03:00:00
+      checkEvaluation(
+        TimestampAdd("DAY", Literal(1), Literal(skippedTime - 23 * MICROS_PER_HOUR, TimestampType)),
+        skippedTime)
+      // timestampadd(HOUR, 24, 2011-03-12 03:00:00) = 2011-03-13 04:00:00
+      checkEvaluation(
+        TimestampAdd("HOUR", Literal(24),
+          Literal(skippedTime - 23 * MICROS_PER_HOUR, TimestampType)),
+        skippedTime + MICROS_PER_HOUR)
+      // timestampadd(HOUR, 23, 2011-03-12 03:00:00) = 2011-03-13 03:00:00
+      checkEvaluation(
+        TimestampAdd("HOUR", Literal(23),
+          Literal(skippedTime - 23 * MICROS_PER_HOUR, TimestampType)),
+        skippedTime)
+      // timestampadd(SECOND, SECONDS_PER_DAY, 2011-03-12 03:00:00) = 2011-03-13 04:00:00
+      checkEvaluation(
+        TimestampAdd(
+          "SECOND", Literal(SECONDS_PER_DAY.toInt),
+          Literal(skippedTime - 23 * MICROS_PER_HOUR, TimestampType)),
+        skippedTime + MICROS_PER_HOUR)
+      // timestampadd(SECOND, SECONDS_PER_DAY - 1, 2011-03-12 03:00:00) = 2011-03-13 03:59:59
+      checkEvaluation(
+        TimestampAdd(
+          "SECOND", Literal(SECONDS_PER_DAY.toInt - 1),
+          Literal(skippedTime - 23 * MICROS_PER_HOUR, TimestampType)),
+        skippedTime + MICROS_PER_HOUR - MICROS_PER_SECOND)
+
+      // timestampadd(DAY, 1, 2011-11-05 02:00:00) = 2011-11-06 02:00:00
+      checkEvaluation(
+        TimestampAdd("DAY", Literal(1),
+          Literal(repeatedTime - 24 * MICROS_PER_HOUR, TimestampType)),
+        repeatedTime + MICROS_PER_HOUR)
+      // timestampadd(DAY, 1, 2011-11-05 01:00:00) = 2011-11-06 01:00:00 (pre-transition)
+      checkEvaluation(
+        TimestampAdd("DAY", Literal(1),
+          Literal(repeatedTime - 25 * MICROS_PER_HOUR, TimestampType)),
+        repeatedTime - MICROS_PER_HOUR)
+      // timestampadd(DAY, -1, 2011-11-07 01:00:00) = 2011-11-06 01:00:00 (post-transition)
+      checkEvaluation(
+        TimestampAdd("DAY", Literal(-1),
+          Literal(repeatedTime + 24 * MICROS_PER_HOUR, TimestampType)),
+        repeatedTime)
+      // timestampadd(MONTH, 1, 2011-10-06 01:00:00) = 2011-11-06 01:00:00 (pre-transition)
+      checkEvaluation(
+        TimestampAdd(
+          "MONTH", Literal(1),
+          Literal(repeatedTime - MICROS_PER_HOUR - 31 * MICROS_PER_DAY, TimestampType)),
+        repeatedTime - MICROS_PER_HOUR)
+      // timestampadd(MONTH, -1, 2011-12-06 01:00:00) = 2011-11-06 01:00:00 (post-transition)
+      checkEvaluation(
+        TimestampAdd(
+          "MONTH", Literal(-1),
+          Literal(repeatedTime + 30 * MICROS_PER_DAY, TimestampType)),
+        repeatedTime)
+      // timestampadd(HOUR, 23, 2011-11-05 02:00:00) = 2011-11-06 01:00:00 (pre-transition)
+      checkEvaluation(
+        TimestampAdd("HOUR", Literal(23),
+          Literal(repeatedTime - 24 * MICROS_PER_HOUR, TimestampType)),
+        repeatedTime - MICROS_PER_HOUR)
+      // timestampadd(HOUR, 24, 2011-11-05 02:00:00) = 2011-11-06 01:00:00 (post-transition)
+      checkEvaluation(
+        TimestampAdd("HOUR", Literal(24),
+          Literal(repeatedTime - 24 * MICROS_PER_HOUR, TimestampType)),
+        repeatedTime)
+    }
+  }
+
+  test("SPARK-42635: timestampadd unit conversion overflow") {
+    withSQLConf(SQLConf.SESSION_LOCAL_TIMEZONE.key -> "UTC") {
+      checkErrorInExpression[SparkArithmeticException](TimestampAdd("DAY",
+        Literal(106751992),
+        Literal(0L, TimestampType)),
+        errorClass = "DATETIME_OVERFLOW",
+        parameters = Map("operation" -> "add 106751992 DAY to TIMESTAMP '1970-01-01 00:00:00'"))
+      checkErrorInExpression[SparkArithmeticException](TimestampAdd("QUARTER",
+        Literal(1431655764),
+        Literal(0L, TimestampType)),
+        errorClass = "DATETIME_OVERFLOW",
+        parameters = Map("operation" ->
+          "add 1431655764 QUARTER to TIMESTAMP '1970-01-01 00:00:00'"))
+    }
+  }
+
   test("SPARK-38284: difference between two timestamps in units") {
     // Check case-insensitivity
     checkEvaluation(

