You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by we...@apache.org on 2022/09/21 05:17:49 UTC

[spark] branch master updated: [SPARK-40496][SQL] Fix configs to control "enableDateTimeParsingFallback"

This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new a031aaa487e [SPARK-40496][SQL] Fix configs to control "enableDateTimeParsingFallback"
a031aaa487e is described below

commit a031aaa487e8dc928ff3431ce2f3312236531bd4
Author: Ivan Sadikov <iv...@databricks.com>
AuthorDate: Wed Sep 21 13:17:21 2022 +0800

    [SPARK-40496][SQL] Fix configs to control "enableDateTimeParsingFallback"
    
    ### What changes were proposed in this pull request?
    
    The commit https://github.com/apache/spark/commit/a93044550259fa0ee8897d0576f6eeac8ec73c27 introduced `enableDateTimeParsingFallback` config but the usage was incorrect in CSV/UnivocityParser - DateType and TimestampType configs were swapped.
    
    This PR fixes the issue so `enableParsingFallbackForDateType` controls DateType fallback and `enableParsingFallbackForTimestampType` controls TimestampType.
    
    JSON does not have this problem - the property is correctly used for the corresponding data types in https://github.com/apache/spark/blob/master/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala#L305.
    
    Setting parsing fallback via data source option or SQL config will work correctly; the issue would only happen when using `dateFormat` or `timestampFormat` without setting the flag explicitly.
    
    ### Why are the changes needed?
    
    Correctness fix. Without the change, the fallback would not be enabled when using dateFormat and timestampFormat. All other means to set parsing fallback (data source option and SQLConf) will still work correctly.
    
    ### Does this PR introduce _any_ user-facing change?
    
    No.
    
    ### How was this patch tested?
    
    I added a unit test for CSV and JSON to reproduce the issue and verify it was fixed.
    
    Closes #37942 from sadikovi/SPARK-40496.
    
    Authored-by: Ivan Sadikov <iv...@databricks.com>
    Signed-off-by: Wenchen Fan <we...@databricks.com>
---
 .../spark/sql/catalyst/csv/UnivocityParser.scala   |  4 ++--
 .../sql/execution/datasources/csv/CSVSuite.scala   | 25 ++++++++++++++++++++++
 .../sql/execution/datasources/json/JsonSuite.scala | 25 ++++++++++++++++++++++
 3 files changed, 52 insertions(+), 2 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/UnivocityParser.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/UnivocityParser.scala
index 9d855d1a93d..160f6beb09b 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/UnivocityParser.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/UnivocityParser.scala
@@ -224,7 +224,7 @@ class UnivocityParser(
           case NonFatal(e) =>
             // If fails to parse, then tries the way used in 2.0 and 1.x for backwards
             // compatibility if enabled.
-            if (!enableParsingFallbackForTimestampType) {
+            if (!enableParsingFallbackForDateType) {
               throw e
             }
             val str = DateTimeUtils.cleanLegacyTimestampStr(UTF8String.fromString(datum))
@@ -244,7 +244,7 @@ class UnivocityParser(
             } else {
               // If fails to parse, then tries the way used in 2.0 and 1.x for backwards
               // compatibility if enabled.
-              if (!enableParsingFallbackForDateType) {
+              if (!enableParsingFallbackForTimestampType) {
                 throw e
               }
               val str = DateTimeUtils.cleanLegacyTimestampStr(UTF8String.fromString(datum))
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
index f74f7a00c13..a82b33fb0ee 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
@@ -3008,6 +3008,31 @@ abstract class CSVSuite
       }
     }
   }
+
+  test("SPARK-40496: disable parsing fallback when the date/timestamp format is provided") {
+    // The test verifies that the fallback can be disabled by providing dateFormat or
+    // timestampFormat without any additional configuration.
+    //
+    // We also need to disable "legacy" parsing mode that implicitly enables parsing fallback.
+    for (policy <- Seq("exception", "corrected")) {
+      withSQLConf(SQLConf.LEGACY_TIME_PARSER_POLICY.key -> policy) {
+        withTempPath { path =>
+          Seq("2020-01-01").toDF()
+            .repartition(1)
+            .write.text(path.getAbsolutePath)
+
+          var df = spark.read.schema("col date").option("dateFormat", "yyyy/MM/dd")
+            .csv(path.getAbsolutePath)
+          checkAnswer(df, Seq(Row(null)))
+
+          df = spark.read.schema("col timestamp").option("timestampFormat", "yyyy/MM/dd HH:mm:ss")
+            .csv(path.getAbsolutePath)
+
+          checkAnswer(df, Seq(Row(null)))
+        }
+      }
+    }
+  }
 }
 
 class CSVv1Suite extends CSVSuite {
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala
index a9e1d3a751e..bc123a4eedb 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala
@@ -3356,6 +3356,31 @@ abstract class JsonSuite
       }
     }
   }
+
+  test("SPARK-40496: disable parsing fallback when the date/timestamp format is provided") {
+    // The test verifies that the fallback can be disabled by providing dateFormat or
+    // timestampFormat without any additional configuration.
+    //
+    // We also need to disable "legacy" parsing mode that implicitly enables parsing fallback.
+    for (policy <- Seq("exception", "corrected")) {
+      withSQLConf(SQLConf.LEGACY_TIME_PARSER_POLICY.key -> policy) {
+        withTempPath { path =>
+          Seq("""{"col": "2020-01-01"}""").toDF()
+            .repartition(1)
+            .write.text(path.getAbsolutePath)
+
+          var df = spark.read.schema("col date").option("dateFormat", "yyyy/MM/dd")
+            .json(path.getAbsolutePath)
+          checkAnswer(df, Seq(Row(null)))
+
+          df = spark.read.schema("col timestamp").option("timestampFormat", "yyyy/MM/dd HH:mm:ss")
+            .json(path.getAbsolutePath)
+
+          checkAnswer(df, Seq(Row(null)))
+        }
+      }
+    }
+  }
 }
 
 class JsonV1Suite extends JsonSuite {


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org