You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by we...@apache.org on 2020/04/21 11:56:58 UTC

[spark] branch branch-3.0 updated: [SPARK-31474][SQL] Consistency between dayofweek/dow in extract exprsession and dayofweek function

This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.0 by this push:
     new 2c4765c  [SPARK-31474][SQL] Consistency between dayofweek/dow in extract exprsession and dayofweek function
2c4765c is described below

commit 2c4765c3477a7e3d880b0ffbe83a9bb976b4b244
Author: Kent Yao <ya...@hotmail.com>
AuthorDate: Tue Apr 21 11:55:33 2020 +0000

    [SPARK-31474][SQL] Consistency between dayofweek/dow in extract exprsession and dayofweek function
    
    ### What changes were proposed in this pull request?
    ```sql
    spark-sql> SELECT extract(dayofweek from '2009-07-26');
    1
    spark-sql> SELECT extract(dow from '2009-07-26');
    0
    spark-sql> SELECT extract(isodow from '2009-07-26');
    7
    spark-sql> SELECT dayofweek('2009-07-26');
    1
    spark-sql> SELECT weekday('2009-07-26');
    6
    ```
    Currently, there are 4 types of day-of-week range:
    1. the function `dayofweek`(2.3.0) and extracting `dayofweek`(2.4.0) result as of Sunday(1) to Saturday(7)
    2. extracting `dow`(3.0.0) results as of Sunday(0) to Saturday(6)
    3. extracting` isodow` (3.0.0) results as of Monday(1) to Sunday(7)
    4. the function `weekday`(2.4.0) results as of Monday(0) to Sunday(6)
    
    Actually, extracting `dayofweek` and `dow` are both derived from PostgreSQL but have different meanings.
    https://issues.apache.org/jira/browse/SPARK-23903
    https://issues.apache.org/jira/browse/SPARK-28623
    
    In this PR, we make extracting `dow` as same as extracting `dayofweek` and the `dayofweek` function for historical reason and not breaking anything.
    
    Also, add more documentation to the extracting function to make extract field more clear to understand.
    
    ### Why are the changes needed?
    
    Consistency insurance
    
    ### Does this PR introduce any user-facing change?
    
    yes, doc updated and extract `dow` is as same as `dayofweek`
    
    ### How was this patch tested?
    
    1. modified ut
    2. local SQL doc verification
    #### before
    ![image](https://user-images.githubusercontent.com/8326978/79601949-3535b100-811c-11ea-957b-a33d68641181.png)
    
    #### after
    ![image](https://user-images.githubusercontent.com/8326978/79601847-12a39800-811c-11ea-8ff6-aa329255d099.png)
    
    Closes #28248 from yaooqinn/SPARK-31474.
    
    Authored-by: Kent Yao <ya...@hotmail.com>
    Signed-off-by: Wenchen Fan <we...@databricks.com>
    (cherry picked from commit 19854371104451b5c4cb266aaee6b33a5049b145)
    Signed-off-by: Wenchen Fan <we...@databricks.com>
---
 .../catalyst/expressions/datetimeExpressions.scala | 74 ++++++++++++----------
 .../resources/sql-tests/results/date_part.sql.out  |  2 +-
 .../resources/sql-tests/results/extract.sql.out    |  2 +-
 .../sql-tests/results/postgreSQL/timestamp.sql.out | 10 +--
 4 files changed, 47 insertions(+), 41 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala
index 0160827..a36c1f2 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala
@@ -17,10 +17,9 @@
 
 package org.apache.spark.sql.catalyst.expressions
 
-import java.sql.Timestamp
 import java.time.{DateTimeException, LocalDate, LocalDateTime, ZoneId}
 import java.time.temporal.IsoFields
-import java.util.{Locale, TimeZone}
+import java.util.Locale
 
 import scala.util.control.NonFatal
 
@@ -2118,8 +2117,7 @@ object DatePart {
     case "MONTH" | "MON" | "MONS" | "MONTHS" => Month(source)
     case "WEEK" | "W" | "WEEKS" => WeekOfYear(source)
     case "DAY" | "D" | "DAYS" => DayOfMonth(source)
-    case "DAYOFWEEK" => DayOfWeek(source)
-    case "DOW" => Subtract(DayOfWeek(source), Literal(1))
+    case "DAYOFWEEK" | "DOW" => DayOfWeek(source)
     case "ISODOW" => Add(WeekDay(source), Literal(1))
     case "DOY" => DayOfYear(source)
     case "HOUR" | "H" | "HOURS" | "HR" | "HRS" => Hour(source)
@@ -2159,38 +2157,12 @@ object DatePartLike {
   }
 }
 
+// scalastyle:off line.size.limit
 @ExpressionDescription(
   usage = "_FUNC_(field, source) - Extracts a part of the date/timestamp or interval source.",
   arguments = """
     Arguments:
-      * field - selects which part of the source should be extracted.
-               Supported string values of `field` for dates and timestamps are:
-                ["MILLENNIUM", ("MILLENNIA", "MIL", "MILS"),
-                 "CENTURY", ("CENTURIES", "C", "CENT"),
-                 "DECADE", ("DECADES", "DEC", "DECS"),
-                 "YEAR", ("Y", "YEARS", "YR", "YRS"),
-                 "ISOYEAR",
-                 "QUARTER", ("QTR"),
-                 "MONTH", ("MON", "MONS", "MONTHS"),
-                 "WEEK", ("W", "WEEKS"),
-                 "DAY", ("D", "DAYS"),
-                 "DAYOFWEEK",
-                 "DOW",
-                 "ISODOW",
-                 "DOY",
-                 "HOUR", ("H", "HOURS", "HR", "HRS"),
-                 "MINUTE", ("M", "MIN", "MINS", "MINUTES"),
-                 "SECOND", ("S", "SEC", "SECONDS", "SECS"),
-                 "MILLISECONDS", ("MSEC", "MSECS", "MILLISECON", "MSECONDS", "MS"),
-                 "MICROSECONDS", ("USEC", "USECS", "USECONDS", "MICROSECON", "US"),
-                 "EPOCH"]
-                Supported string values of `field` for intervals are:
-                 ["YEAR", ("Y", "YEARS", "YR", "YRS"),
-                  "MONTH", ("MON", "MONS", "MONTHS"),
-                  "DAY", ("D", "DAYS"),
-                  "HOUR", ("H", "HOURS", "HR", "HRS"),
-                  "MINUTE", ("M", "MIN", "MINS", "MINUTES"),
-                  "SECOND", ("S", "SEC", "SECONDS", "SECS")]
+      * field - selects which part of the source should be extracted, and supported string values are as same as the fields of the equivalent function `EXTRACT`.
       * source - a date/timestamp or interval column from where `field` should be extracted
   """,
   examples = """
@@ -2208,8 +2180,12 @@ object DatePartLike {
       > SELECT _FUNC_('seconds', interval 5 hours 30 seconds 1 milliseconds 1 microseconds);
        30.001001
   """,
+  note = """
+    The _FUNC_ function is equivalent to the SQL-standard function `EXTRACT(field FROM source)`
+  """,
   group = "datetime_funcs",
   since = "3.0.0")
+// scalastyle:on line.size.limit
 case class DatePart(field: Expression, source: Expression, child: Expression)
   extends RuntimeReplaceable {
 
@@ -2222,12 +2198,38 @@ case class DatePart(field: Expression, source: Expression, child: Expression)
   override def prettyName: String = "date_part"
 }
 
+// scalastyle:off line.size.limit
 @ExpressionDescription(
   usage = "_FUNC_(field FROM source) - Extracts a part of the date/timestamp or interval source.",
   arguments = """
     Arguments:
-      * field - selects which part of the source should be extracted and supported string values
-                are the same with the `date_part` fields.
+      * field - selects which part of the source should be extracted
+          - Supported string values of `field` for dates and timestamps are:
+              - "MILLENNIUM", ("MILLENNIA", "MIL", "MILS") - the conventional numbering of millennia
+              - "CENTURY", ("CENTURIES", "C", "CENT") - the conventional numbering of centuries
+              - "DECADE", ("DECADES", "DEC", "DECS") - the year field divided by 10
+              - "YEAR", ("Y", "YEARS", "YR", "YRS") - the year field
+              - "ISOYEAR" - the ISO 8601 week-numbering year that the datetime falls in
+              - "QUARTER", ("QTR") - the quarter (1 - 4) of the year that the datetime falls in
+              - "MONTH", ("MON", "MONS", "MONTHS") - the month field (1 - 12)
+              - "WEEK", ("W", "WEEKS") - the number of the ISO 8601 week-of-week-based-year. A week is considered to start on a Monday and week 1 is the first week with >3 days. In the ISO week-numbering system, it is possible for early-January dates to be part of the 52nd or 53rd week of the previous year, and for late-December dates to be part of the first week of the next year. For example, 2005-01-02 is part of the 53rd week of year 2004, while 2012-12-31 is part of the first week of 2013
+              - "DAY", ("D", "DAYS") - the day of the month field (1 - 31)
+              - "DAYOFWEEK",("DOW") - the day of the week for datetime as Sunday(1) to Saturday(7)
+              - "ISODOW" - ISO 8601 based day of the week for datetime as Monday(1) to Sunday(7)
+              - "DOY" - the day of the year (1 - 365/366)
+              - "HOUR", ("H", "HOURS", "HR", "HRS") - The hour field (0 - 23)
+              - "MINUTE", ("M", "MIN", "MINS", "MINUTES") - the minutes field (0 - 59)
+              - "SECOND", ("S", "SEC", "SECONDS", "SECS") - the seconds field, including fractional parts
+              - "MILLISECONDS", ("MSEC", "MSECS", "MILLISECON", "MSECONDS", "MS") - the seconds field, including fractional parts, multiplied by 1000. Note that this includes full seconds
+              - "MICROSECONDS", ("USEC", "USECS", "USECONDS", "MICROSECON", "US") - The seconds field, including fractional parts, multiplied by 1000000. Note that this includes full seconds
+              - "EPOCH" - the number of seconds with fractional part in microsecond precision since 1970-01-01 00:00:00 local time (can be negative)
+          - Supported string values of `field` for interval(which consists of `months`, `days`, `microseconds`) are:
+              - "YEAR", ("Y", "YEARS", "YR", "YRS") - the total `months` / 12
+              - "MONTH", ("MON", "MONS", "MONTHS") - the total `months` % 12
+              - "DAY", ("D", "DAYS") - the `days` part of interval
+              - "HOUR", ("H", "HOURS", "HR", "HRS") - how many hours the `microseconds` contains
+              - "MINUTE", ("M", "MIN", "MINS", "MINUTES") - how many minutes left after taking hours from `microseconds`
+              - "SECOND", ("S", "SEC", "SECONDS", "SECS") - how many second with fractions left after taking hours and minutes from `microseconds`
       * source - a date/timestamp or interval column from where `field` should be extracted
   """,
   examples = """
@@ -2245,7 +2247,11 @@ case class DatePart(field: Expression, source: Expression, child: Expression)
       > SELECT _FUNC_(seconds FROM interval 5 hours 30 seconds 1 milliseconds 1 microseconds);
        30.001001
   """,
+  note = """
+    The _FUNC_ function is equivalent to `date_part(field, source)`.
+  """,
   since = "3.0.0")
+// scalastyle:on line.size.limit
 case class Extract(field: Expression, source: Expression, child: Expression)
   extends RuntimeReplaceable {
 
diff --git a/sql/core/src/test/resources/sql-tests/results/date_part.sql.out b/sql/core/src/test/resources/sql-tests/results/date_part.sql.out
index 702ac17..028448b 100644
--- a/sql/core/src/test/resources/sql-tests/results/date_part.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/date_part.sql.out
@@ -255,7 +255,7 @@ select date_part('dow', c) from t
 -- !query schema
 struct<date_part('dow', t.`c`):int>
 -- !query output
-5
+6
 
 
 -- !query
diff --git a/sql/core/src/test/resources/sql-tests/results/extract.sql.out b/sql/core/src/test/resources/sql-tests/results/extract.sql.out
index 47cd518..e6635e608 100644
--- a/sql/core/src/test/resources/sql-tests/results/extract.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/extract.sql.out
@@ -263,7 +263,7 @@ select extract(dow from c) from t
 -- !query schema
 struct<extract('dow' FROM t.`c`):int>
 -- !query output
-5
+6
 
 
 -- !query
diff --git a/sql/core/src/test/resources/sql-tests/results/postgreSQL/timestamp.sql.out b/sql/core/src/test/resources/sql-tests/results/postgreSQL/timestamp.sql.out
index 75ea3f3..abfce91 100644
--- a/sql/core/src/test/resources/sql-tests/results/postgreSQL/timestamp.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/postgreSQL/timestamp.sql.out
@@ -288,11 +288,11 @@ SELECT '' AS `54`, d1 as `timestamp`,
 -- !query schema
 struct<54:string,timestamp:timestamp,isoyear:int,week:int,dow:int>
 -- !query output
-	1969-12-31 16:00:00	1970	1	3
-	1997-01-02 00:00:00	1997	1	4
-	1997-01-02 03:04:05	1997	1	4
-	1997-02-10 17:32:01	1997	7	1
-	2001-09-22 18:19:20	2001	38	6
+	1969-12-31 16:00:00	1970	1	4
+	1997-01-02 00:00:00	1997	1	5
+	1997-01-02 03:04:05	1997	1	5
+	1997-02-10 17:32:01	1997	7	2
+	2001-09-22 18:19:20	2001	38	7
 
 
 -- !query


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org