Posted to commits@spark.apache.org by we...@apache.org on 2020/04/29 06:58:01 UTC

[spark] branch branch-3.0 updated: [SPARK-31597][SQL] extracting day from intervals should be interval.days + days in interval.microsecond

This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.0 by this push:
     new d771d79  [SPARK-31597][SQL] extracting day from intervals should be interval.days + days in interval.microsecond
d771d79 is described below

commit d771d79d0374a167630ebc3985035cfb39e3c990
Author: Kent Yao <ya...@hotmail.com>
AuthorDate: Wed Apr 29 06:56:33 2020 +0000

    [SPARK-31597][SQL] extracting day from intervals should be interval.days + days in interval.microsecond
    
    ### What changes were proposed in this pull request?
    
    Following a suggestion from cloud-fan: https://github.com/apache/spark/pull/28222#issuecomment-620586933
    
    I checked with both Presto and PostgreSQL: one implements intervals in the ANSI style (year-month/day-time), and the other uses a mixed, non-ANSI representation. Both add the whole days carried in the interval's time part to the total days when extracting the day field from interval values.
    
    ```sql
    
    presto> SELECT EXTRACT(DAY FROM (cast('2020-01-15 00:00:00' as timestamp) - cast('2020-01-01 00:00:00' as timestamp)));
    _col0
    -------
    14
    (1 row)
    
    Query 20200428_135239_00000_ahn7x, FINISHED, 1 node
    Splits: 17 total, 17 done (100.00%)
    0:01 [0 rows, 0B] [0 rows/s, 0B/s]
    
    presto> SELECT EXTRACT(DAY FROM (cast('2020-01-15 00:00:00' as timestamp) - cast('2020-01-01 00:00:01' as timestamp)));
    _col0
    -------
    13
    (1 row)
    
    Query 20200428_135246_00001_ahn7x, FINISHED, 1 node
    Splits: 17 total, 17 done (100.00%)
    0:00 [0 rows, 0B] [0 rows/s, 0B/s]
    
    presto>
    
    ```
    
    ```sql
    
    postgres=# SELECT EXTRACT(DAY FROM (cast('2020-01-15 00:00:00' as timestamp) - cast('2020-01-01 00:00:00' as timestamp)));
    date_part
    -----------
    14
    (1 row)
    
    postgres=# SELECT EXTRACT(DAY FROM (cast('2020-01-15 00:00:00' as timestamp) - cast('2020-01-01 00:00:01' as timestamp)));
    date_part
    -----------
    13
    
    ```
    
    ```
    spark-sql> SELECT EXTRACT(DAY FROM (cast('2020-01-15 00:00:00' as timestamp) - cast('2020-01-01 00:00:01' as timestamp)));
    0
    spark-sql> SELECT EXTRACT(DAY FROM (cast('2020-01-15 00:00:00' as timestamp) - cast('2020-01-01 00:00:00' as timestamp)));
    0
    ```
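    
    With this fix, the same queries should line up with Presto and PostgreSQL: the timestamp difference is carried entirely in the interval's microsecond part, and the whole days in it now count toward `day`. The expected output (not captured from a live session) is:
    
    ```
    spark-sql> SELECT EXTRACT(DAY FROM (cast('2020-01-15 00:00:00' as timestamp) - cast('2020-01-01 00:00:01' as timestamp)));
    13
    spark-sql> SELECT EXTRACT(DAY FROM (cast('2020-01-15 00:00:00' as timestamp) - cast('2020-01-01 00:00:00' as timestamp)));
    14
    ```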
    
    In the ANSI standard, a day is exactly 24 hours, so we don't need to worry about the conceptual (calendar) day for interval extraction. The conceptual day only takes effect when we add an interval to a zoned timestamp value.
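    
    For example, per the new unit tests (`ExtractIntervalDays("25 hours")` and `ExtractIntervalHours("25 hours")`), an interval of 25 hours carries one whole 24-hour day in its microsecond part, so extracting the day should give 1 and extracting the hour should give only the 1-hour remainder (expected output):
    
    ```
    spark-sql> SELECT EXTRACT(DAY FROM interval 25 hours), EXTRACT(HOUR FROM interval 25 hours);
    1	1
    ```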
    
    ### Why are the changes needed?
    
    The new behavior satisfies both the ANSI standard and common use cases in modern SQL platforms.
    
    ### Does this PR introduce any user-facing change?
    
    No, it's new in 3.0.

    ### How was this patch tested?
    
    Added more unit tests.
    
    Closes #28396 from yaooqinn/SPARK-31597.
    
    Authored-by: Kent Yao <ya...@hotmail.com>
    Signed-off-by: Wenchen Fan <we...@databricks.com>
    (cherry picked from commit ea525fe8c0cc6336a7ba8d98bada3198795f8aed)
    Signed-off-by: Wenchen Fan <we...@databricks.com>
---
 .../spark/sql/catalyst/util/IntervalUtils.scala    |  6 ++--
 .../expressions/IntervalExpressionsSuite.scala     |  3 ++
 .../resources/sql-tests/results/extract.sql.out    | 32 +++++++++++-----------
 3 files changed, 22 insertions(+), 19 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/IntervalUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/IntervalUtils.scala
index 3afa0c7..833942d 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/IntervalUtils.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/IntervalUtils.scala
@@ -17,7 +17,6 @@
 
 package org.apache.spark.sql.catalyst.util
 
-import java.math.BigDecimal
 import java.util.concurrent.TimeUnit
 
 import scala.util.control.NonFatal
@@ -55,11 +54,12 @@ object IntervalUtils {
   }
 
   def getDays(interval: CalendarInterval): Int = {
-    interval.days
+    val daysInMicroseconds = (interval.microseconds / MICROS_PER_DAY).toInt
+    Math.addExact(interval.days, daysInMicroseconds)
   }
 
   def getHours(interval: CalendarInterval): Long = {
-    interval.microseconds / MICROS_PER_HOUR
+    (interval.microseconds % MICROS_PER_DAY) / MICROS_PER_HOUR
   }
 
   def getMinutes(interval: CalendarInterval): Byte = {
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/IntervalExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/IntervalExpressionsSuite.scala
index a2ee1c6..8c972a9 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/IntervalExpressionsSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/IntervalExpressionsSuite.scala
@@ -68,6 +68,7 @@ class IntervalExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
     // Years and months must not be taken into account
     checkEvaluation(ExtractIntervalDays("100 year 10 months 5 days"), 5)
     checkEvaluation(ExtractIntervalDays(largeInterval), 31)
+    checkEvaluation(ExtractIntervalDays("25 hours"), 1)
   }
 
   test("hours") {
@@ -81,6 +82,8 @@ class IntervalExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
     // Minutes should be taken into account
     checkEvaluation(ExtractIntervalHours("10 hours 100 minutes"), 11L)
     checkEvaluation(ExtractIntervalHours(largeInterval), 11L)
+    checkEvaluation(ExtractIntervalHours("25 hours"), 1L)
+
   }
 
   test("minutes") {
diff --git a/sql/core/src/test/resources/sql-tests/results/extract.sql.out b/sql/core/src/test/resources/sql-tests/results/extract.sql.out
index aea09e4..29cbefd 100644
--- a/sql/core/src/test/resources/sql-tests/results/extract.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/extract.sql.out
@@ -135,7 +135,7 @@ select extract(day from c), extract(day from i) from t
 -- !query schema
 struct<extract('day' FROM t.`c`):int,extract('day' FROM t.`i`):int>
 -- !query output
-6	30
+6	31
 
 
 -- !query
@@ -143,7 +143,7 @@ select extract(d from c), extract(d from i) from t
 -- !query schema
 struct<extract('d' FROM t.`c`):int,extract('d' FROM t.`i`):int>
 -- !query output
-6	30
+6	31
 
 
 -- !query
@@ -151,7 +151,7 @@ select extract(days from c), extract(days from i) from t
 -- !query schema
 struct<extract('days' FROM t.`c`):int,extract('days' FROM t.`i`):int>
 -- !query output
-6	30
+6	31
 
 
 -- !query
@@ -199,7 +199,7 @@ select extract(hour from c), extract(hour from i) from t
 -- !query schema
 struct<extract('hour' FROM t.`c`):int,extract('hour' FROM t.`i`):bigint>
 -- !query output
-7	40
+7	16
 
 
 -- !query
@@ -207,7 +207,7 @@ select extract(h from c), extract(h from i) from t
 -- !query schema
 struct<extract('h' FROM t.`c`):int,extract('h' FROM t.`i`):bigint>
 -- !query output
-7	40
+7	16
 
 
 -- !query
@@ -215,7 +215,7 @@ select extract(hours from c), extract(hours from i) from t
 -- !query schema
 struct<extract('hours' FROM t.`c`):int,extract('hours' FROM t.`i`):bigint>
 -- !query output
-7	40
+7	16
 
 
 -- !query
@@ -223,7 +223,7 @@ select extract(hr from c), extract(hr from i) from t
 -- !query schema
 struct<extract('hr' FROM t.`c`):int,extract('hr' FROM t.`i`):bigint>
 -- !query output
-7	40
+7	16
 
 
 -- !query
@@ -231,7 +231,7 @@ select extract(hrs from c), extract(hrs from i) from t
 -- !query schema
 struct<extract('hrs' FROM t.`c`):int,extract('hrs' FROM t.`i`):bigint>
 -- !query output
-7	40
+7	16
 
 
 -- !query
@@ -457,7 +457,7 @@ select date_part('day', c), date_part('day', i) from t
 -- !query schema
 struct<date_part('day', t.`c`):int,date_part('day', t.`i`):int>
 -- !query output
-6	30
+6	31
 
 
 -- !query
@@ -465,7 +465,7 @@ select date_part('d', c), date_part('d', i) from t
 -- !query schema
 struct<date_part('d', t.`c`):int,date_part('d', t.`i`):int>
 -- !query output
-6	30
+6	31
 
 
 -- !query
@@ -473,7 +473,7 @@ select date_part('days', c), date_part('days', i) from t
 -- !query schema
 struct<date_part('days', t.`c`):int,date_part('days', t.`i`):int>
 -- !query output
-6	30
+6	31
 
 
 -- !query
@@ -521,7 +521,7 @@ select date_part('hour', c), date_part('hour', i) from t
 -- !query schema
 struct<date_part('hour', t.`c`):int,date_part('hour', t.`i`):bigint>
 -- !query output
-7	40
+7	16
 
 
 -- !query
@@ -529,7 +529,7 @@ select date_part('h', c), date_part('h', i) from t
 -- !query schema
 struct<date_part('h', t.`c`):int,date_part('h', t.`i`):bigint>
 -- !query output
-7	40
+7	16
 
 
 -- !query
@@ -537,7 +537,7 @@ select date_part('hours', c), date_part('hours', i) from t
 -- !query schema
 struct<date_part('hours', t.`c`):int,date_part('hours', t.`i`):bigint>
 -- !query output
-7	40
+7	16
 
 
 -- !query
@@ -545,7 +545,7 @@ select date_part('hr', c), date_part('hr', i) from t
 -- !query schema
 struct<date_part('hr', t.`c`):int,date_part('hr', t.`i`):bigint>
 -- !query output
-7	40
+7	16
 
 
 -- !query
@@ -553,7 +553,7 @@ select date_part('hrs', c), date_part('hrs', i) from t
 -- !query schema
 struct<date_part('hrs', t.`c`):int,date_part('hrs', t.`i`):bigint>
 -- !query output
-7	40
+7	16
 
 
 -- !query


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org