You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by we...@apache.org on 2020/05/05 14:17:00 UTC

[spark] branch branch-3.0 updated: [SPARK-31641][SQL] Fix days conversions by JSON legacy parser

This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.0 by this push:
     new 080c51e  [SPARK-31641][SQL] Fix days conversions by JSON legacy parser
080c51e is described below

commit 080c51e6b6268002948dd14171233bd35d954529
Author: Max Gekk <ma...@gmail.com>
AuthorDate: Tue May 5 14:15:31 2020 +0000

    [SPARK-31641][SQL] Fix days conversions by JSON legacy parser
    
    ### What changes were proposed in this pull request?
    Perform days rebasing while converting days from JSON string field. In Spark 2.4 and earlier versions, the days are interpreted as days since the epoch in the hybrid calendar (Julian + Gregorian since 1582-10-15). Since Spark 3.0, the base calendar was switched to Proleptic Gregorian calendar, so, the days should be rebased to represent the same local date.
    
    ### Why are the changes needed?
    The changes fix a bug and restore compatibility with Spark 2.4, in which the same query produces:
    ```scala
    scala> spark.read.schema("d date").json(Seq("{'d': '-141704'}").toDS).show
    +----------+
    |         d|
    +----------+
    |1582-01-01|
    +----------+
    ```
    
    ### Does this PR introduce _any_ user-facing change?
    Yes.
    
    Before:
    ```scala
    scala> spark.read.schema("d date").json(Seq("{'d': '-141704'}").toDS).show
    +----------+
    |         d|
    +----------+
    |1582-01-11|
    +----------+
    ```
    
    After:
    ```scala
    scala> spark.read.schema("d date").json(Seq("{'d': '-141704'}").toDS).show
    +----------+
    |         d|
    +----------+
    |1582-01-01|
    +----------+
    ```
    
    ### How was this patch tested?
    Add a test to `JsonSuite`.
    
    Closes #28453 from MaxGekk/json-rebase-legacy-days.
    
    Authored-by: Max Gekk <ma...@gmail.com>
    Signed-off-by: Wenchen Fan <we...@databricks.com>
    (cherry picked from commit bd264299317bba91f2dc1dc27fd51e6bc0609d66)
    Signed-off-by: Wenchen Fan <we...@databricks.com>
---
 .../org/apache/spark/sql/catalyst/json/JacksonParser.scala   |  2 +-
 .../spark/sql/execution/datasources/json/JsonSuite.scala     | 12 ++++++++----
 2 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala
index 8965a81..a52c345 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala
@@ -259,7 +259,7 @@ class JacksonParser(
                 // In Spark 1.5.0, we store the data as number of days since epoch in string.
                 // So, we just convert it to Int.
                 try {
-                  parser.getText.toInt
+                  RebaseDateTime.rebaseJulianToGregorianDays(parser.getText.toInt)
                 } catch {
                   case _: NumberFormatException => throw e
                 }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala
index 999eadb..4982991 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala
@@ -2653,13 +2653,17 @@ abstract class JsonSuite extends QueryTest with SharedSparkSession with TestJson
     }
   }
 
-  test("SPARK-30960: parse date/timestamp string with legacy format") {
-    val ds = Seq("{'t': '2020-1-12 3:23:34.12', 'd': '2020-1-12 T', 'd2': '12345'}").toDS()
-    val json = spark.read.schema("t timestamp, d date, d2 date").json(ds)
+  test("SPARK-30960, SPARK-31641: parse date/timestamp string with legacy format") {
+    val julianDay = -141704 // 1582-01-01 in Julian calendar
+    val ds = Seq(
+      s"{'t': '2020-1-12 3:23:34.12', 'd': '2020-1-12 T', 'd2': '12345', 'd3': '$julianDay'}"
+    ).toDS()
+    val json = spark.read.schema("t timestamp, d date, d2 date, d3 date").json(ds)
     checkAnswer(json, Row(
       Timestamp.valueOf("2020-1-12 3:23:34.12"),
       Date.valueOf("2020-1-12"),
-      Date.valueOf(LocalDate.ofEpochDay(12345))))
+      Date.valueOf(LocalDate.ofEpochDay(12345)),
+      Date.valueOf("1582-01-01")))
   }
 
   test("exception mode for parsing date/timestamp string") {


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org