Posted to commits@spark.apache.org by ma...@apache.org on 2022/08/16 08:54:30 UTC

[spark] branch branch-3.3 updated: [SPARK-39184][SQL] Handle undersized result array in date and timestamp sequences

This is an automated email from the ASF dual-hosted git repository.

maxgekk pushed a commit to branch branch-3.3
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.3 by this push:
     new 21acaaea662 [SPARK-39184][SQL] Handle undersized result array in date and timestamp sequences
21acaaea662 is described below

commit 21acaaea662d003bc49861eee27d6d663618fb19
Author: Bruce Robbins <be...@gmail.com>
AuthorDate: Tue Aug 16 11:53:39 2022 +0300

    [SPARK-39184][SQL] Handle undersized result array in date and timestamp sequences
    
    ### What changes were proposed in this pull request?
    
    Add code to defensively check if the pre-allocated result array is big enough to handle the next element in a date or timestamp sequence.
    
    ### Why are the changes needed?
    
    `InternalSequenceBase.getSequenceLength` is a fast method for estimating the size of the result array. It uses an estimated step size in micros, which is not always accurate for every date/time/time-zone combination. As a result, `getSequenceLength` occasionally overestimates, and occasionally underestimates, the size of the result array.
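    
    For reference, the estimate is essentially integer division over the estimated step in micros (a sketch, not the actual Catalyst code; the real method also validates the boundaries and guards against overflowing `Int`):
    ```
    // Sketch of the length estimate; names mirror the real method
    // but the body is simplified for illustration.
    def estimatedLength(startMicros: Long, stopMicros: Long, stepMicros: Long): Long =
      if (startMicros == stopMicros) 1L
      else 1L + (stopMicros - startMicros) / stepMicros
    ```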
    
    `getSequenceLength` sometimes overestimates the size of the result array when the step size is in months (because `InternalSequenceBase` assumes 28 days per month). This case is handled: `InternalSequenceBase` will slice the array, if needed.
    
    `getSequenceLength` sometimes underestimates the size of the result array when the sequence crosses a DST "spring forward" without a corresponding "fall backward". This case is not handled (thus, this PR).
    
    For example:
    ```
    select sequence(
      timestamp'2022-03-13 00:00:00',
      timestamp'2022-03-14 00:00:00',
      interval 1 day) as x;
    ```
    In the America/Los_Angeles time zone, this results in the following error:
    ```
    java.lang.ArrayIndexOutOfBoundsException: 1
            at scala.runtime.ScalaRunTime$.array_update(ScalaRunTime.scala:77)
    ```
    This happens because `InternalSequenceBase` calculates an estimated step size of 24 hours. If you add 24 hours to 2022-03-13 00:00:00 in the America/Los_Angeles time zone, you get 2022-03-14 01:00:00 (because 2022-03-13 has only 23 hours due to "spring forward"). Since 2022-03-14 01:00:00 is later than the specified stop value, `getSequenceLength` assumes the stop value is not included in the result. Therefore, `getSequenceLength` estimates an array size of 1.
    
    However, when actually building the sequence, `InternalSequenceBase` uses a step of 1 day, not 24 hours. Adding 1 day to 2022-03-13 00:00:00 yields 2022-03-14 00:00:00. Now the stop value *is* included, and we overrun the end of the result array.
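    
    The discrepancy is easy to reproduce with plain java.time (a standalone illustration, not Spark code): adding an exact duration of 24 hours crosses the DST gap, while adding a calendar period of 1 day preserves the wall-clock time.
    ```
    import java.time.{Duration, Period, ZoneId, ZonedDateTime}
    
    val la = ZoneId.of("America/Los_Angeles")
    val start = ZonedDateTime.of(2022, 3, 13, 0, 0, 0, 0, la)
    
    // Exact arithmetic: the estimated step of 24 hours lands past midnight,
    // because 2022-03-13 has only 23 wall-clock hours in this zone.
    println(start.plus(Duration.ofHours(24)))  // 2022-03-14T01:00-07:00[America/Los_Angeles]
    
    // Calendar arithmetic: the actual step of 1 day keeps the wall-clock time.
    println(start.plus(Period.ofDays(1)))      // 2022-03-14T00:00-07:00[America/Los_Angeles]
    ```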
    
    The new unit test includes examples of problematic date sequences.
    
    This PR adds code to handle the underestimation case: it checks whether we're about to overrun the array, and if so, allocates a new array that's larger by 1.
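    
    In spirit, the interpreted path now follows this pattern (a simplified, self-contained sketch with illustrative names, not the actual Catalyst code):
    ```
    import java.util.Arrays
    
    // Fill an array with multiples of `step` up to and including `stop`,
    // starting from a size estimate that may be one element too small.
    def sequenceUpTo(stop: Long, step: Long, estimate: Int): Array[Long] = {
      var arr = new Array[Long](estimate)
      var t = 0L
      var i = 0
      while (t <= stop) {
        if (i == arr.length) {
          // About to overrun: grow the array by one. The DST
          // underestimation is off by at most one element.
          arr = Arrays.copyOf(arr, i + 1)
        }
        arr(i) = t
        i += 1
        t = step * i
      }
      // The overestimation case: slice off any unused tail.
      if (arr.length > i) Arrays.copyOf(arr, i) else arr
    }
    ```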
    
    ### Does this PR introduce _any_ user-facing change?
    
    No.
    
    ### How was this patch tested?
    
    New unit test.
    
    Closes #37513 from bersprockets/date_sequence_array_size_issue.
    
    Authored-by: Bruce Robbins <be...@gmail.com>
    Signed-off-by: Max Gekk <ma...@gmail.com>
    (cherry picked from commit 3a1136aa05dd5e16de81c7ec804416b3498ca967)
    Signed-off-by: Max Gekk <ma...@gmail.com>
---
 .../expressions/collectionOperations.scala         | 13 +++++--
 .../expressions/CollectionExpressionsSuite.scala   | 40 ++++++++++++++++++++++
 2 files changed, 51 insertions(+), 2 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala
index 650cfc7bca8..4a5ae5d2e02 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala
@@ -3055,17 +3055,23 @@ object Sequence {
         val startMicros: Long = toMicros(num.toLong(start), scale)
         val stopMicros: Long = toMicros(num.toLong(stop), scale)
 
-        val maxEstimatedArrayLength =
+        val estimatedArrayLength =
           getSequenceLength(startMicros, stopMicros, input3, intervalStepInMicros)
 
         val stepSign = if (intervalStepInMicros > 0) +1 else -1
         val exclusiveItem = stopMicros + stepSign
-        val arr = new Array[T](maxEstimatedArrayLength)
+        var arr = new Array[T](estimatedArrayLength)
         var t = startMicros
         var i = 0
 
         while (t < exclusiveItem ^ stepSign < 0) {
           val result = fromMicros(t, scale)
+          // if we've underestimated the size of the array, due to crossing a DST
+          // "spring forward" without a corresponding "fall back", make a copy
+          // that's larger by 1
+          if (i == arr.length) {
+            arr = arr.padTo(estimatedArrayLength + 1, fromLong(0L))
+          }
           arr(i) = fromLong(result)
           i += 1
           t = addInterval(startMicros, i * stepMonths, i * stepDays, i * stepMicros, zoneId)
@@ -3173,6 +3179,9 @@ object Sequence {
          |  int $i = 0;
          |
          |  while ($t < $exclusiveItem ^ $stepSign < 0) {
+         |    if ($i == $arr.length) {
+         |      $arr = java.util.Arrays.copyOf($arr, $i + 1);
+         |    }
          |    $arr[$i] = $fromMicrosCode;
          |    $i += 1;
          |    $t = $addIntervalCode(
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollectionExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollectionExpressionsSuite.scala
index 0ac1adccb8b..802988038a6 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollectionExpressionsSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollectionExpressionsSuite.scala
@@ -2492,4 +2492,44 @@ class CollectionExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper
       Literal.create(Seq(Double.NaN, 1d, 2d, null), ArrayType(DoubleType))),
       Seq(null, 1d, 2d, Double.NaN))
   }
+
+  test("SPARK-39184: Avoid ArrayIndexOutOfBoundsException when crossing DST boundary") {
+    DateTimeTestUtils.withDefaultTimeZone(LA) {
+      checkEvaluation(new Sequence(
+        Literal(Timestamp.valueOf("2016-03-13 00:00:00")),
+        Literal(Timestamp.valueOf("2016-03-14 00:00:00")),
+        Literal(stringToInterval("interval 1 day"))),
+        Seq(
+          Timestamp.valueOf("2016-03-13 00:00:00"),
+          Timestamp.valueOf("2016-03-14 00:00:00")))
+
+      checkEvaluation(new Sequence(
+        Literal(Timestamp.valueOf("2016-03-14 00:00:00")),
+        Literal(Timestamp.valueOf("2016-03-13 00:00:00")),
+        Literal(stringToInterval("interval -1 days"))),
+        Seq(
+          Timestamp.valueOf("2016-03-14 00:00:00"),
+          Timestamp.valueOf("2016-03-13 00:00:00")))
+
+      checkEvaluation(new Sequence(
+        Literal(Date.valueOf("2016-03-13")),
+        Literal(Date.valueOf("2016-03-16")),
+        Literal(stringToInterval("interval 1 day 12 hour"))),
+        Seq(
+          Date.valueOf("2016-03-13"),
+          Date.valueOf("2016-03-14"),
+          Date.valueOf("2016-03-16")))
+
+      checkEvaluation(new Sequence(
+        Literal(Date.valueOf("2017-04-06")),
+        Literal(Date.valueOf("2017-02-12")),
+        Literal(stringToInterval("interval -13 days -6 hours"))),
+        Seq(
+          Date.valueOf("2017-04-06"),
+          Date.valueOf("2017-03-23"),
+          Date.valueOf("2017-03-10"),
+          Date.valueOf("2017-02-25"),
+          Date.valueOf("2017-02-12")))
+    }
+  }
 }

