Posted to commits@spark.apache.org by we...@apache.org on 2020/06/03 06:24:33 UTC

[spark] branch branch-3.0 updated: [SPARK-31879][SQL] Using GB as default Locale for datetime formatters

This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.0 by this push:
     new 1b7ae62  [SPARK-31879][SQL] Using GB as default Locale for datetime formatters
1b7ae62 is described below

commit 1b7ae62bf443f20c94daa6ea5cabbd18f96b7919
Author: Kent Yao <ya...@hotmail.com>
AuthorDate: Wed Jun 3 06:07:53 2020 +0000

    [SPARK-31879][SQL] Using GB as default Locale for datetime formatters
    
    This PR switches the default Locale from `US` to `GB`, so that the first day of the week is Monday-based rather than Sunday-based, matching the behavior of v2.4.
    
    ```sql
    spark-sql> select to_timestamp('2020-1-1', 'YYYY-w-u');
    2019-12-29 00:00:00
    spark-sql> set spark.sql.legacy.timeParserPolicy=legacy;
    spark.sql.legacy.timeParserPolicy	legacy
    spark-sql> select to_timestamp('2020-1-1', 'YYYY-w-u');
    2019-12-30 00:00:00
    ```
    
    These week-based fields need a Locale to express their semantics, because the first day of the week varies from country to country.
    
    From the Javadoc of `WeekFields`:
    ```java
        /**
         * Gets the first day-of-week.
         * <p>
         * The first day-of-week varies by culture.
         * For example, the US uses Sunday, while France and the ISO-8601 standard use Monday.
         * This method returns the first day using the standard {@code DayOfWeek} enum.
         *
         * @return the first day-of-week, not null
         */
        public DayOfWeek getFirstDayOfWeek() {
            return firstDayOfWeek;
        }
    ```
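    
    For illustration (plain `java.time`, not Spark's internal formatter builders), the locale dependence is visible directly on `WeekFields`:
    
    ```scala
    import java.time.temporal.WeekFields
    import java.util.Locale
    
    // US weeks start on Sunday; en-GB weeks start on Monday.
    WeekFields.of(Locale.US).getFirstDayOfWeek()               // DayOfWeek.SUNDAY
    WeekFields.of(new Locale("en", "GB")).getFirstDayOfWeek()  // DayOfWeek.MONDAY
    ```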
    
    But for SimpleDateFormat, the day-of-week is not localized (see the sketch after the table below):
    
    ```
    u	Day number of week (1 = Monday, ..., 7 = Sunday)	Number	1
    ```
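    
    A quick sketch of that legacy behavior (plain `java.text.SimpleDateFormat`, outside Spark's code path): the `u` letter yields the same number under both locales:
    
    ```scala
    import java.text.SimpleDateFormat
    import java.util.Locale
    
    // 2020-01-01 is a Wednesday; SimpleDateFormat's 'u' is fixed to
    // 1 = Monday ... 7 = Sunday, so both locales print "3".
    val wednesday = java.sql.Date.valueOf("2020-01-01")
    new SimpleDateFormat("u", Locale.US).format(wednesday)              // "3"
    new SimpleDateFormat("u", new Locale("en", "GB")).format(wednesday) // "3"
    ```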
    
    Currently, the default locale we use is US, so the result is moved one day backward.
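    
    For example, with plain `java.time` (a sketch, not Spark's internal path), formatting the localized day-of-week number of 2020-01-01 (a Wednesday) gives different results under the two locales:
    
    ```scala
    import java.time.LocalDate
    import java.time.format.DateTimeFormatter
    import java.util.Locale
    
    val wednesday = LocalDate.of(2020, 1, 1)
    // 'e' is the localized day-of-week number, driven by the locale's WeekFields.
    DateTimeFormatter.ofPattern("e", Locale.US).format(wednesday)              // "4" (Sunday-started week)
    DateTimeFormatter.ofPattern("e", new Locale("en", "GB")).format(wednesday) // "3" (Monday-started week)
    ```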
    
    For other countries, please refer to [First Day of the Week in Different Countries](http://chartsbin.com/view/41671)
    
    With this change, the first-day-of-week calculation of these functions is restored to the v2.4 behavior when using the default locale.
    
    Yes, but the behavior change restores the old behavior of v2.4.
    
    Added unit tests.
    
    Closes #28692 from yaooqinn/SPARK-31879.
    
    Authored-by: Kent Yao <ya...@hotmail.com>
    Signed-off-by: Wenchen Fan <we...@databricks.com>
    (cherry picked from commit c59f51bcc207725b8cbc4201df9367f874f5915c)
    Signed-off-by: Wenchen Fan <we...@databricks.com>
---
 .../apache/spark/sql/catalyst/util/DateFormatter.scala |  8 +++++++-
 .../spark/sql/catalyst/util/TimestampFormatter.scala   |  8 +++++++-
 .../src/test/resources/sql-tests/inputs/datetime.sql   |  4 ++++
 .../resources/sql-tests/results/ansi/datetime.sql.out  | 18 +++++++++++++++++-
 .../sql-tests/results/datetime-legacy.sql.out          | 18 +++++++++++++++++-
 .../test/resources/sql-tests/results/datetime.sql.out  | 18 +++++++++++++++++-
 6 files changed, 69 insertions(+), 5 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateFormatter.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateFormatter.scala
index c4cedf2..5711ea0 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateFormatter.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateFormatter.scala
@@ -117,7 +117,13 @@ class LegacySimpleDateFormatter(pattern: String, locale: Locale) extends LegacyD
 object DateFormatter {
   import LegacyDateFormats._
 
-  val defaultLocale: Locale = Locale.US
+  /**
+   * Before Spark 3.0, the first day-of-week is always Monday. Since Spark 3.0, it depends on the
+   * locale.
+   * We pick GB as the default locale instead of US, to be compatible with Spark 2.x, as US locale
+   * uses Sunday as the first day-of-week. See SPARK-31879.
+   */
+  val defaultLocale: Locale = new Locale("en", "GB")
 
   val defaultPattern: String = "yyyy-MM-dd"
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/TimestampFormatter.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/TimestampFormatter.scala
index 6d1c535..252e703 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/TimestampFormatter.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/TimestampFormatter.scala
@@ -257,7 +257,13 @@ object LegacyDateFormats extends Enumeration {
 object TimestampFormatter {
   import LegacyDateFormats._
 
-  val defaultLocale: Locale = Locale.US
+  /**
+   * Before Spark 3.0, the first day-of-week is always Monday. Since Spark 3.0, it depends on the
+   * locale.
+   * We pick GB as the default locale instead of US, to be compatible with Spark 2.x, as US locale
+   * uses Sunday as the first day-of-week. See SPARK-31879.
+   */
+  val defaultLocale: Locale = new Locale("en", "GB")
 
   def defaultPattern(): String = s"${DateFormatter.defaultPattern} HH:mm:ss"
 
diff --git a/sql/core/src/test/resources/sql-tests/inputs/datetime.sql b/sql/core/src/test/resources/sql-tests/inputs/datetime.sql
index 4eefa0f..e955c78 100644
--- a/sql/core/src/test/resources/sql-tests/inputs/datetime.sql
+++ b/sql/core/src/test/resources/sql-tests/inputs/datetime.sql
@@ -154,3 +154,7 @@ select from_csv('26/October/2015', 'date Date', map('dateFormat', 'dd/MMMMM/yyyy
 select from_unixtime(1, 'yyyyyyyyyyy-MM-dd');
 select date_format(timestamp '2018-11-17 13:33:33', 'yyyyyyyyyy-MM-dd HH:mm:ss');
 select date_format(date '2018-11-17', 'yyyyyyyyyyy-MM-dd');
+
+-- SPARK-31879: the first day of week
+select date_format('2020-01-01', 'YYYY-MM-dd uu');
+select date_format('2020-01-01', 'YYYY-MM-dd uuuu');
diff --git a/sql/core/src/test/resources/sql-tests/results/ansi/datetime.sql.out b/sql/core/src/test/resources/sql-tests/results/ansi/datetime.sql.out
index 43fe0a6..c3d10b0 100644
--- a/sql/core/src/test/resources/sql-tests/results/ansi/datetime.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/ansi/datetime.sql.out
@@ -1,5 +1,5 @@
 -- Automatically generated by SQLQueryTestSuite
--- Number of queries: 112
+-- Number of queries: 114
 
 
 -- !query
@@ -965,3 +965,19 @@ struct<>
 -- !query output
 org.apache.spark.SparkUpgradeException
 You may get a different result due to the upgrading of Spark 3.0: Fail to recognize 'yyyyyyyyyyy-MM-dd' pattern in the DateTimeFormatter. 1) You can set spark.sql.legacy.timeParserPolicy to LEGACY to restore the behavior before Spark 3.0. 2) You can form a valid datetime pattern with the guide from https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html
+
+
+-- !query
+select date_format('2020-01-01', 'YYYY-MM-dd uu')
+-- !query schema
+struct<date_format(CAST(2020-01-01 AS TIMESTAMP), YYYY-MM-dd uu):string>
+-- !query output
+2020-01-01 03
+
+
+-- !query
+select date_format('2020-01-01', 'YYYY-MM-dd uuuu')
+-- !query schema
+struct<date_format(CAST(2020-01-01 AS TIMESTAMP), YYYY-MM-dd uuuu):string>
+-- !query output
+2020-01-01 Wednesday
diff --git a/sql/core/src/test/resources/sql-tests/results/datetime-legacy.sql.out b/sql/core/src/test/resources/sql-tests/results/datetime-legacy.sql.out
index 71b1064..ac50c5b 100644
--- a/sql/core/src/test/resources/sql-tests/results/datetime-legacy.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/datetime-legacy.sql.out
@@ -1,5 +1,5 @@
 -- Automatically generated by SQLQueryTestSuite
--- Number of queries: 112
+-- Number of queries: 114
 
 
 -- !query
@@ -920,3 +920,19 @@ select date_format(date '2018-11-17', 'yyyyyyyyyyy-MM-dd')
 struct<date_format(CAST(DATE '2018-11-17' AS TIMESTAMP), yyyyyyyyyyy-MM-dd):string>
 -- !query output
 00000002018-11-17
+
+
+-- !query
+select date_format('2020-01-01', 'YYYY-MM-dd uu')
+-- !query schema
+struct<date_format(CAST(2020-01-01 AS TIMESTAMP), YYYY-MM-dd uu):string>
+-- !query output
+2020-01-01 03
+
+
+-- !query
+select date_format('2020-01-01', 'YYYY-MM-dd uuuu')
+-- !query schema
+struct<date_format(CAST(2020-01-01 AS TIMESTAMP), YYYY-MM-dd uuuu):string>
+-- !query output
+2020-01-01 0003
diff --git a/sql/core/src/test/resources/sql-tests/results/datetime.sql.out b/sql/core/src/test/resources/sql-tests/results/datetime.sql.out
index 9b1c847..fd037db 100755
--- a/sql/core/src/test/resources/sql-tests/results/datetime.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/datetime.sql.out
@@ -1,5 +1,5 @@
 -- Automatically generated by SQLQueryTestSuite
--- Number of queries: 112
+-- Number of queries: 114
 
 
 -- !query
@@ -937,3 +937,19 @@ struct<>
 -- !query output
 org.apache.spark.SparkUpgradeException
 You may get a different result due to the upgrading of Spark 3.0: Fail to recognize 'yyyyyyyyyyy-MM-dd' pattern in the DateTimeFormatter. 1) You can set spark.sql.legacy.timeParserPolicy to LEGACY to restore the behavior before Spark 3.0. 2) You can form a valid datetime pattern with the guide from https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html
+
+
+-- !query
+select date_format('2020-01-01', 'YYYY-MM-dd uu')
+-- !query schema
+struct<date_format(CAST(2020-01-01 AS TIMESTAMP), YYYY-MM-dd uu):string>
+-- !query output
+2020-01-01 03
+
+
+-- !query
+select date_format('2020-01-01', 'YYYY-MM-dd uuuu')
+-- !query schema
+struct<date_format(CAST(2020-01-01 AS TIMESTAMP), YYYY-MM-dd uuuu):string>
+-- !query output
+2020-01-01 Wednesday

