You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by gu...@apache.org on 2019/01/05 13:50:42 UTC

[spark] branch master updated: [SPARK-26246][SQL][FOLLOWUP] Inferring TimestampType from JSON

This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new 980e6bc  [SPARK-26246][SQL][FOLLOWUP] Inferring TimestampType from JSON
980e6bc is described below

commit 980e6bcd1c016139c6918d788fb4806a60740fcf
Author: Maxim Gekk <ma...@databricks.com>
AuthorDate: Sat Jan 5 21:50:27 2019 +0800

    [SPARK-26246][SQL][FOLLOWUP] Inferring TimestampType from JSON
    
    ## What changes were proposed in this pull request?
    
    Added a new JSON option `inferTimestamp` (`true` by default) to control whether `TimestampType` is inferred from string values.
    
    ## How was this patch tested?
    
    Added a new unit test to `JsonInferSchemaSuite`.
    
    Closes #23455 from MaxGekk/json-infer-time-followup.
    
    Authored-by: Maxim Gekk <ma...@databricks.com>
    Signed-off-by: Hyukjin Kwon <gu...@apache.org>
---
 docs/sql-migration-guide-upgrade.md                                 | 3 +++
 .../main/scala/org/apache/spark/sql/catalyst/json/JSONOptions.scala | 6 ++++++
 .../scala/org/apache/spark/sql/catalyst/json/JsonInferSchema.scala  | 3 ++-
 .../org/apache/spark/sql/catalyst/json/JsonInferSchemaSuite.scala   | 6 ++++++
 4 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/docs/sql-migration-guide-upgrade.md b/docs/sql-migration-guide-upgrade.md
index c4d2157..7e6a0c0 100644
--- a/docs/sql-migration-guide-upgrade.md
+++ b/docs/sql-migration-guide-upgrade.md
@@ -40,6 +40,9 @@ displayTitle: Spark SQL Upgrading Guide
   - In Spark version 2.4 and earlier, JSON datasource and JSON functions like `from_json` convert a bad JSON record to a row with all `null`s in the PERMISSIVE mode when specified schema is `StructType`. Since Spark 3.0, the returned row can contain non-`null` fields if some of JSON column values were parsed and converted to desired types successfully.
 
   - Since Spark 3.0, the `unix_timestamp`, `date_format`, `to_unix_timestamp`, `from_unixtime`, `to_date`, `to_timestamp` functions use java.time API for parsing and formatting dates/timestamps from/to strings by using ISO chronology (https://docs.oracle.com/javase/8/docs/api/java/time/chrono/IsoChronology.html) based on Proleptic Gregorian calendar. In Spark version 2.4 and earlier, java.text.SimpleDateFormat and java.util.GregorianCalendar (hybrid calendar that supports both the Julian [...]
+
+  - Since Spark 3.0, the JSON datasource and the JSON function `schema_of_json` infer TimestampType from string values if they match the pattern defined by the JSON option `timestampFormat`. Set the JSON option `inferTimestamp` to `false` to disable this type inference.
+
 ## Upgrading From Spark SQL 2.3 to 2.4
 
   - In Spark version 2.3 and earlier, the second parameter to array_contains function is implicitly promoted to the element type of first array type parameter. This type promotion can be lossy and may cause `array_contains` function to return wrong result. This problem has been addressed in 2.4 by employing a safer type promotion mechanism. This can cause some change in behavior and are illustrated in the table below.
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JSONOptions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JSONOptions.scala
index eaff3fa..1ec9d50 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JSONOptions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JSONOptions.scala
@@ -117,6 +117,12 @@ private[sql] class JSONOptions(
    */
   val pretty: Boolean = parameters.get("pretty").map(_.toBoolean).getOrElse(false)
 
+  /**
+   * Enables inferring TimestampType from strings that match the timestamp pattern
+   * defined by the timestampFormat option.
+   */
+  val inferTimestamp: Boolean = parameters.get("inferTimestamp").map(_.toBoolean).getOrElse(true)
+
   /** Sets config options on a Jackson [[JsonFactory]]. */
   def setJacksonOptions(factory: JsonFactory): Unit = {
     factory.configure(JsonParser.Feature.ALLOW_COMMENTS, allowComments)
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JsonInferSchema.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JsonInferSchema.scala
index 3203e62..0bf3f03c 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JsonInferSchema.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JsonInferSchema.scala
@@ -128,7 +128,8 @@ private[sql] class JsonInferSchema(options: JSONOptions) extends Serializable {
         }
         if (options.prefersDecimal && decimalTry.isDefined) {
           decimalTry.get
-        } else if ((allCatch opt timestampFormatter.parse(field)).isDefined) {
+        } else if (options.inferTimestamp &&
+            (allCatch opt timestampFormatter.parse(field)).isDefined) {
           TimestampType
         } else {
           StringType
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/json/JsonInferSchemaSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/json/JsonInferSchemaSuite.scala
index 9307f9b..9a6f4f5 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/json/JsonInferSchemaSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/json/JsonInferSchemaSuite.scala
@@ -99,4 +99,10 @@ class JsonInferSchemaSuite extends SparkFunSuite with SQLHelper {
       }
     }
   }
+
+  test("disable timestamp inferring") {
+    val json = """{"a": "2019-01-04T21:11:10.123Z"}"""
+    checkType(Map("inferTimestamp" -> "true"), json, TimestampType)
+    checkType(Map("inferTimestamp" -> "false"), json, StringType)
+  }
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org