You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by we...@apache.org on 2020/04/29 06:57:16 UTC

[spark] branch master updated: [SPARK-31597][SQL] extracting day from intervals should be interval.days + days in interval.microsecond

This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new ea525fe  [SPARK-31597][SQL] extracting day from intervals should be interval.days + days in interval.microsecond
ea525fe is described below

commit ea525fe8c0cc6336a7ba8d98bada3198795f8aed
Author: Kent Yao <ya...@hotmail.com>
AuthorDate: Wed Apr 29 06:56:33 2020 +0000

    [SPARK-31597][SQL] extracting day from intervals should be interval.days + days in interval.microsecond
    
    ### What changes were proposed in this pull request?
    
    With suggestion from cloud-fan https://github.com/apache/spark/pull/28222#issuecomment-620586933
    
    I checked with both Presto and PostgreSQL: one implements intervals with ANSI-style year-month/day-time, and the other is mixed and non-ANSI. They both add the exceeded days in the interval time part to the total days of the operation which extracts day from interval values.
    
    ```sql
    
    presto> SELECT EXTRACT(DAY FROM (cast('2020-01-15 00:00:00' as timestamp) - cast('2020-01-01 00:00:00' as timestamp)));
    _col0
    -------
    14
    (1 row)
    
    Query 20200428_135239_00000_ahn7x, FINISHED, 1 node
    Splits: 17 total, 17 done (100.00%)
    0:01 [0 rows, 0B] [0 rows/s, 0B/s]
    
    presto> SELECT EXTRACT(DAY FROM (cast('2020-01-15 00:00:00' as timestamp) - cast('2020-01-01 00:00:01' as timestamp)));
    _col0
    -------
    13
    (1 row)
    
    Query 20200428_135246_00001_ahn7x, FINISHED, 1 node
    Splits: 17 total, 17 done (100.00%)
    0:00 [0 rows, 0B] [0 rows/s, 0B/s]
    
    presto>
    
    ```
    
    ```sql
    
    postgres=# SELECT EXTRACT(DAY FROM (cast('2020-01-15 00:00:00' as timestamp) - cast('2020-01-01 00:00:00' as timestamp)));
    date_part
    -----------
    14
    (1 row)
    
    postgres=# SELECT EXTRACT(DAY FROM (cast('2020-01-15 00:00:00' as timestamp) - cast('2020-01-01 00:00:01' as timestamp)));
    date_part
    -----------
    13
    
    ```
    
    ```
    spark-sql> SELECT EXTRACT(DAY FROM (cast('2020-01-15 00:00:00' as timestamp) - cast('2020-01-01 00:00:01' as timestamp)));
    0
    spark-sql> SELECT EXTRACT(DAY FROM (cast('2020-01-15 00:00:00' as timestamp) - cast('2020-01-01 00:00:00' as timestamp)));
    0
    ```
    
    In the ANSI standard, a day is exactly 24 hours, so we don't need to worry about the conceptual day for interval extraction. The meaning of the conceptual day only takes effect when we add it to a zoned timestamp value.
    
    ### Why are the changes needed?
    
    Both satisfy the ANSI standard and common use cases in modern SQL platforms
    
    ### Does this PR introduce any user-facing change?
    
    No, it is new in 3.0
    ### How was this patch tested?
    
    add more uts
    
    Closes #28396 from yaooqinn/SPARK-31597.
    
    Authored-by: Kent Yao <ya...@hotmail.com>
    Signed-off-by: Wenchen Fan <we...@databricks.com>
---
 .../spark/sql/catalyst/util/IntervalUtils.scala    |  6 ++--
 .../expressions/IntervalExpressionsSuite.scala     |  3 ++
 .../resources/sql-tests/results/extract.sql.out    | 32 +++++++++++-----------
 3 files changed, 22 insertions(+), 19 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/IntervalUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/IntervalUtils.scala
index c3f0e44..7e5a71e 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/IntervalUtils.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/IntervalUtils.scala
@@ -17,7 +17,6 @@
 
 package org.apache.spark.sql.catalyst.util
 
-import java.math.BigDecimal
 import java.util.concurrent.TimeUnit
 
 import scala.util.control.NonFatal
@@ -55,11 +54,12 @@ object IntervalUtils {
   }
 
   def getDays(interval: CalendarInterval): Int = {
-    interval.days
+    val daysInMicroseconds = (interval.microseconds / MICROS_PER_DAY).toInt
+    Math.addExact(interval.days, daysInMicroseconds)
   }
 
   def getHours(interval: CalendarInterval): Long = {
-    interval.microseconds / MICROS_PER_HOUR
+    (interval.microseconds % MICROS_PER_DAY) / MICROS_PER_HOUR
   }
 
   def getMinutes(interval: CalendarInterval): Byte = {
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/IntervalExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/IntervalExpressionsSuite.scala
index a2ee1c6..8c972a9 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/IntervalExpressionsSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/IntervalExpressionsSuite.scala
@@ -68,6 +68,7 @@ class IntervalExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
     // Years and months must not be taken into account
     checkEvaluation(ExtractIntervalDays("100 year 10 months 5 days"), 5)
     checkEvaluation(ExtractIntervalDays(largeInterval), 31)
+    checkEvaluation(ExtractIntervalDays("25 hours"), 1)
   }
 
   test("hours") {
@@ -81,6 +82,8 @@ class IntervalExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
     // Minutes should be taken into account
     checkEvaluation(ExtractIntervalHours("10 hours 100 minutes"), 11L)
     checkEvaluation(ExtractIntervalHours(largeInterval), 11L)
+    checkEvaluation(ExtractIntervalHours("25 hours"), 1L)
+
   }
 
   test("minutes") {
diff --git a/sql/core/src/test/resources/sql-tests/results/extract.sql.out b/sql/core/src/test/resources/sql-tests/results/extract.sql.out
index aea09e4..29cbefd 100644
--- a/sql/core/src/test/resources/sql-tests/results/extract.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/extract.sql.out
@@ -135,7 +135,7 @@ select extract(day from c), extract(day from i) from t
 -- !query schema
 struct<extract('day' FROM t.`c`):int,extract('day' FROM t.`i`):int>
 -- !query output
-6	30
+6	31
 
 
 -- !query
@@ -143,7 +143,7 @@ select extract(d from c), extract(d from i) from t
 -- !query schema
 struct<extract('d' FROM t.`c`):int,extract('d' FROM t.`i`):int>
 -- !query output
-6	30
+6	31
 
 
 -- !query
@@ -151,7 +151,7 @@ select extract(days from c), extract(days from i) from t
 -- !query schema
 struct<extract('days' FROM t.`c`):int,extract('days' FROM t.`i`):int>
 -- !query output
-6	30
+6	31
 
 
 -- !query
@@ -199,7 +199,7 @@ select extract(hour from c), extract(hour from i) from t
 -- !query schema
 struct<extract('hour' FROM t.`c`):int,extract('hour' FROM t.`i`):bigint>
 -- !query output
-7	40
+7	16
 
 
 -- !query
@@ -207,7 +207,7 @@ select extract(h from c), extract(h from i) from t
 -- !query schema
 struct<extract('h' FROM t.`c`):int,extract('h' FROM t.`i`):bigint>
 -- !query output
-7	40
+7	16
 
 
 -- !query
@@ -215,7 +215,7 @@ select extract(hours from c), extract(hours from i) from t
 -- !query schema
 struct<extract('hours' FROM t.`c`):int,extract('hours' FROM t.`i`):bigint>
 -- !query output
-7	40
+7	16
 
 
 -- !query
@@ -223,7 +223,7 @@ select extract(hr from c), extract(hr from i) from t
 -- !query schema
 struct<extract('hr' FROM t.`c`):int,extract('hr' FROM t.`i`):bigint>
 -- !query output
-7	40
+7	16
 
 
 -- !query
@@ -231,7 +231,7 @@ select extract(hrs from c), extract(hrs from i) from t
 -- !query schema
 struct<extract('hrs' FROM t.`c`):int,extract('hrs' FROM t.`i`):bigint>
 -- !query output
-7	40
+7	16
 
 
 -- !query
@@ -457,7 +457,7 @@ select date_part('day', c), date_part('day', i) from t
 -- !query schema
 struct<date_part('day', t.`c`):int,date_part('day', t.`i`):int>
 -- !query output
-6	30
+6	31
 
 
 -- !query
@@ -465,7 +465,7 @@ select date_part('d', c), date_part('d', i) from t
 -- !query schema
 struct<date_part('d', t.`c`):int,date_part('d', t.`i`):int>
 -- !query output
-6	30
+6	31
 
 
 -- !query
@@ -473,7 +473,7 @@ select date_part('days', c), date_part('days', i) from t
 -- !query schema
 struct<date_part('days', t.`c`):int,date_part('days', t.`i`):int>
 -- !query output
-6	30
+6	31
 
 
 -- !query
@@ -521,7 +521,7 @@ select date_part('hour', c), date_part('hour', i) from t
 -- !query schema
 struct<date_part('hour', t.`c`):int,date_part('hour', t.`i`):bigint>
 -- !query output
-7	40
+7	16
 
 
 -- !query
@@ -529,7 +529,7 @@ select date_part('h', c), date_part('h', i) from t
 -- !query schema
 struct<date_part('h', t.`c`):int,date_part('h', t.`i`):bigint>
 -- !query output
-7	40
+7	16
 
 
 -- !query
@@ -537,7 +537,7 @@ select date_part('hours', c), date_part('hours', i) from t
 -- !query schema
 struct<date_part('hours', t.`c`):int,date_part('hours', t.`i`):bigint>
 -- !query output
-7	40
+7	16
 
 
 -- !query
@@ -545,7 +545,7 @@ select date_part('hr', c), date_part('hr', i) from t
 -- !query schema
 struct<date_part('hr', t.`c`):int,date_part('hr', t.`i`):bigint>
 -- !query output
-7	40
+7	16
 
 
 -- !query
@@ -553,7 +553,7 @@ select date_part('hrs', c), date_part('hrs', i) from t
 -- !query schema
 struct<date_part('hrs', t.`c`):int,date_part('hrs', t.`i`):bigint>
 -- !query output
-7	40
+7	16
 
 
 -- !query


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org