You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by do...@apache.org on 2020/07/01 22:49:08 UTC

[spark] branch branch-3.0 updated: [SPARK-32130][SQL] Disable the JSON option `inferTimestamp` by default

This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.0 by this push:
     new bdbfe6e  [SPARK-32130][SQL] Disable the JSON option `inferTimestamp` by default
bdbfe6e is described below

commit bdbfe6ec15667835000d7e95b0c437b6ab4b251e
Author: Max Gekk <ma...@gmail.com>
AuthorDate: Wed Jul 1 15:45:39 2020 -0700

    [SPARK-32130][SQL] Disable the JSON option `inferTimestamp` by default
    
    Set the JSON option `inferTimestamp` to `false` if a user doesn't pass it as a datasource option.
    
    To prevent a performance regression while inferring schemas from JSON with potential timestamp fields.
    
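    Yes, this is a user-facing change: timestamp type inference from JSON strings is now disabled by default.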
    
    - Modified existing tests in `JsonSuite` and `JsonInferSchemaSuite`.
    - Regenerated results of `JsonBenchmark` in the environment:
    
    | Item | Description |
    | ---- | ----|
    | Region | us-west-2 (Oregon) |
    | Instance | r3.xlarge |
    | AMI | ubuntu/images/hvm-ssd/ubuntu-bionic-18.04-amd64-server-20190722.1 (ami-06f2f779464715dc5) |
    | Java | OpenJDK 64-Bit Server VM 1.8.0_252 and OpenJDK 64-Bit Server VM 11.0.7+10 |
    
    Closes #28966 from MaxGekk/json-inferTimestamps-disable-by-default.
    
    Authored-by: Max Gekk <ma...@gmail.com>
    Signed-off-by: Dongjoon Hyun <do...@apache.org>
    (cherry picked from commit bcf23307f4fd70590ea10e5e9edb6e9de1f76125)
    Signed-off-by: Dongjoon Hyun <do...@apache.org>
---
 docs/sql-migration-guide.md                        |  4 +
 .../spark/sql/catalyst/json/JSONOptions.scala      |  2 +-
 .../sql/catalyst/json/JsonInferSchemaSuite.scala   | 56 ++++++++------
 .../benchmarks/JsonBenchmark-jdk11-results.txt     | 86 +++++++++++-----------
 sql/core/benchmarks/JsonBenchmark-results.txt      | 86 +++++++++++-----------
 .../sql/execution/datasources/json/JsonSuite.scala |  6 +-
 6 files changed, 129 insertions(+), 111 deletions(-)

diff --git a/docs/sql-migration-guide.md b/docs/sql-migration-guide.md
index b7b01d0..fb3fe09 100644
--- a/docs/sql-migration-guide.md
+++ b/docs/sql-migration-guide.md
@@ -22,6 +22,10 @@ license: |
 * Table of contents
 {:toc}
 
+## Upgrading from Spark SQL 3.0 to 3.0.1
+
+- In Spark 3.0, the JSON datasource and the JSON function `schema_of_json` infer TimestampType from string values if they match the pattern defined by the JSON option `timestampFormat`. Since version 3.0.1, the timestamp type inference is disabled by default. Set the JSON option `inferTimestamp` to `true` to enable such type inference.
+
 ## Upgrading from Spark SQL 2.4 to 3.0
 
 ### Dataset/DataFrame APIs
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JSONOptions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JSONOptions.scala
index f9222f5..70a673b 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JSONOptions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JSONOptions.scala
@@ -133,7 +133,7 @@ private[sql] class JSONOptions(
    * Enables inferring of TimestampType from strings matched to the timestamp pattern
    * defined by the timestampFormat option.
    */
-  val inferTimestamp: Boolean = parameters.get("inferTimestamp").map(_.toBoolean).getOrElse(true)
+  val inferTimestamp: Boolean = parameters.get("inferTimestamp").map(_.toBoolean).getOrElse(false)
 
   /** Build a Jackson [[JsonFactory]] using JSON options. */
   def buildJsonFactory(): JsonFactory = {
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/json/JsonInferSchemaSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/json/JsonInferSchemaSuite.scala
index bce917c..8290b38 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/json/JsonInferSchemaSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/json/JsonInferSchemaSuite.scala
@@ -35,22 +35,29 @@ class JsonInferSchemaSuite extends SparkFunSuite with SQLHelper {
     assert(inferSchema.inferField(parser) === expectedType)
   }
 
-  def checkTimestampType(pattern: String, json: String): Unit = {
-    checkType(Map("timestampFormat" -> pattern), json, TimestampType)
+  def checkTimestampType(pattern: String, json: String, inferTimestamp: Boolean): Unit = {
+    checkType(
+      Map("timestampFormat" -> pattern, "inferTimestamp" -> inferTimestamp.toString),
+      json,
+      if (inferTimestamp) TimestampType else StringType)
   }
 
   test("inferring timestamp type") {
-    Seq("legacy", "corrected").foreach { legacyParserPolicy =>
-      withSQLConf(SQLConf.LEGACY_TIME_PARSER_POLICY.key -> legacyParserPolicy) {
-        checkTimestampType("yyyy", """{"a": "2018"}""")
-        checkTimestampType("yyyy=MM", """{"a": "2018=12"}""")
-        checkTimestampType("yyyy MM dd", """{"a": "2018 12 02"}""")
-        checkTimestampType(
-          "yyyy-MM-dd'T'HH:mm:ss.SSS",
-          """{"a": "2018-12-02T21:04:00.123"}""")
-        checkTimestampType(
-          "yyyy-MM-dd'T'HH:mm:ss.SSSSSSXXX",
-          """{"a": "2018-12-02T21:04:00.123567+01:00"}""")
+    Seq(true, false).foreach { inferTimestamp =>
+      Seq("legacy", "corrected").foreach { legacyParserPolicy =>
+        withSQLConf(SQLConf.LEGACY_TIME_PARSER_POLICY.key -> legacyParserPolicy) {
+          checkTimestampType("yyyy", """{"a": "2018"}""", inferTimestamp)
+          checkTimestampType("yyyy=MM", """{"a": "2018=12"}""", inferTimestamp)
+          checkTimestampType("yyyy MM dd", """{"a": "2018 12 02"}""", inferTimestamp)
+          checkTimestampType(
+            "yyyy-MM-dd'T'HH:mm:ss.SSS",
+            """{"a": "2018-12-02T21:04:00.123"}""",
+            inferTimestamp)
+          checkTimestampType(
+            "yyyy-MM-dd'T'HH:mm:ss.SSSSSSXXX",
+            """{"a": "2018-12-02T21:04:00.123567+01:00"}""",
+            inferTimestamp)
+        }
       }
     }
   }
@@ -71,16 +78,19 @@ class JsonInferSchemaSuite extends SparkFunSuite with SQLHelper {
   }
 
   test("skip decimal type inferring") {
-    Seq("legacy", "corrected").foreach { legacyParserPolicy =>
-      withSQLConf(SQLConf.LEGACY_TIME_PARSER_POLICY.key -> legacyParserPolicy) {
-        checkType(
-          options = Map(
-            "prefersDecimal" -> "false",
-            "timestampFormat" -> "yyyyMMdd.HHmmssSSS"
-          ),
-          json = """{"a": "20181202.210400123"}""",
-          dt = TimestampType
-        )
+    Seq(true, false).foreach { inferTimestamp =>
+      Seq("legacy", "corrected").foreach { legacyParserPolicy =>
+        withSQLConf(SQLConf.LEGACY_TIME_PARSER_POLICY.key -> legacyParserPolicy) {
+          checkType(
+            options = Map(
+              "prefersDecimal" -> "false",
+              "timestampFormat" -> "yyyyMMdd.HHmmssSSS",
+              "inferTimestamp" -> inferTimestamp.toString
+            ),
+            json = """{"a": "20181202.210400123"}""",
+            dt = if (inferTimestamp) TimestampType else StringType
+          )
+        }
       }
     }
   }
diff --git a/sql/core/benchmarks/JsonBenchmark-jdk11-results.txt b/sql/core/benchmarks/JsonBenchmark-jdk11-results.txt
index d0cd591..ff37084 100644
--- a/sql/core/benchmarks/JsonBenchmark-jdk11-results.txt
+++ b/sql/core/benchmarks/JsonBenchmark-jdk11-results.txt
@@ -7,106 +7,106 @@ OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-106
 Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
 JSON schema inferring:                    Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 ------------------------------------------------------------------------------------------------------------------------
-No encoding                                       68879          68993         116          1.5         688.8       1.0X
-UTF-8 is set                                     115270         115602         455          0.9        1152.7       0.6X
+No encoding                                       69219          69342         116          1.4         692.2       1.0X
+UTF-8 is set                                     143950         143986          55          0.7        1439.5       0.5X
 
 Preparing data for benchmarking ...
 OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws
 Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
 count a short column:                     Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 ------------------------------------------------------------------------------------------------------------------------
-No encoding                                       47452          47538         113          2.1         474.5       1.0X
-UTF-8 is set                                      77330          77354          30          1.3         773.3       0.6X
+No encoding                                       57828          57913         136          1.7         578.3       1.0X
+UTF-8 is set                                      83649          83711          60          1.2         836.5       0.7X
 
 Preparing data for benchmarking ...
 OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws
 Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
 count a wide column:                      Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 ------------------------------------------------------------------------------------------------------------------------
-No encoding                                       60470          60900         534          0.2        6047.0       1.0X
-UTF-8 is set                                     104733         104931         189          0.1       10473.3       0.6X
+No encoding                                       64560          65193        1023          0.2        6456.0       1.0X
+UTF-8 is set                                     102925         103174         216          0.1       10292.5       0.6X
 
 Preparing data for benchmarking ...
 OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws
 Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
 select wide row:                          Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 ------------------------------------------------------------------------------------------------------------------------
-No encoding                                      130302         131072         976          0.0      260604.6       1.0X
-UTF-8 is set                                     150860         151284         377          0.0      301720.1       0.9X
+No encoding                                      131002         132316        1160          0.0      262003.1       1.0X
+UTF-8 is set                                     152128         152371         332          0.0      304256.5       0.9X
 
 Preparing data for benchmarking ...
 OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws
 Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
 Select a subset of 10 columns:            Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 ------------------------------------------------------------------------------------------------------------------------
-Select 10 columns                                 18619          18684          99          0.5        1861.9       1.0X
-Select 1 column                                   24227          24270          38          0.4        2422.7       0.8X
+Select 10 columns                                 19376          19514         160          0.5        1937.6       1.0X
+Select 1 column                                   24089          24156          58          0.4        2408.9       0.8X
 
 Preparing data for benchmarking ...
 OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws
 Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
 creation of JSON parser per line:         Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 ------------------------------------------------------------------------------------------------------------------------
-Short column without encoding                      7947           7971          21          1.3         794.7       1.0X
-Short column with UTF-8                           12700          12753          58          0.8        1270.0       0.6X
-Wide column without encoding                      92632          92955         463          0.1        9263.2       0.1X
-Wide column with UTF-8                           147013         147170         188          0.1       14701.3       0.1X
+Short column without encoding                      8131           8219         103          1.2         813.1       1.0X
+Short column with UTF-8                           13464          13508          44          0.7        1346.4       0.6X
+Wide column without encoding                     108012         108598         914          0.1       10801.2       0.1X
+Wide column with UTF-8                           150988         151369         412          0.1       15098.8       0.1X
 
 Preparing data for benchmarking ...
 OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws
 Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
 JSON functions:                           Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 ------------------------------------------------------------------------------------------------------------------------
-Text read                                           713            734          19         14.0          71.3       1.0X
-from_json                                         22019          22429         456          0.5        2201.9       0.0X
-json_tuple                                        27987          28047          74          0.4        2798.7       0.0X
-get_json_object                                   21468          21870         350          0.5        2146.8       0.0X
+Text read                                           753            765          18         13.3          75.3       1.0X
+from_json                                         23182          23446         230          0.4        2318.2       0.0X
+json_tuple                                        31129          31304         181          0.3        3112.9       0.0X
+get_json_object                                   22821          23073         225          0.4        2282.1       0.0X
 
 Preparing data for benchmarking ...
 OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws
 Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
 Dataset of json strings:                  Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 ------------------------------------------------------------------------------------------------------------------------
-Text read                                          2887           2910          24         17.3          57.7       1.0X
-schema inferring                                  31793          31843          43          1.6         635.9       0.1X
-parsing                                           36791          37104         294          1.4         735.8       0.1X
+Text read                                          3078           3101          26         16.2          61.6       1.0X
+schema inferring                                  30225          30434         333          1.7         604.5       0.1X
+parsing                                           32237          32308          63          1.6         644.7       0.1X
 
 Preparing data for benchmarking ...
 OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws
 Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
 Json files in the per-line mode:          Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 ------------------------------------------------------------------------------------------------------------------------
-Text read                                         10570          10611          45          4.7         211.4       1.0X
-Schema inferring                                  48729          48763          41          1.0         974.6       0.2X
-Parsing without charset                           35490          35648         141          1.4         709.8       0.3X
-Parsing with UTF-8                                63853          63994         163          0.8        1277.1       0.2X
+Text read                                         10835          10900          86          4.6         216.7       1.0X
+Schema inferring                                  37720          37805         110          1.3         754.4       0.3X
+Parsing without charset                           35464          35538         100          1.4         709.3       0.3X
+Parsing with UTF-8                                67311          67738         381          0.7        1346.2       0.2X
 
 OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws
 Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
 Write dates and timestamps:               Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 ------------------------------------------------------------------------------------------------------------------------
-Create a dataset of timestamps                     2187           2190           5          4.6         218.7       1.0X
-to_json(timestamp)                                16262          16503         323          0.6        1626.2       0.1X
-write timestamps to files                         11679          11692          12          0.9        1167.9       0.2X
-Create a dataset of dates                          2297           2310          12          4.4         229.7       1.0X
-to_json(date)                                     10904          10956          46          0.9        1090.4       0.2X
-write dates to files                               6610           6645          35          1.5         661.0       0.3X
+Create a dataset of timestamps                     2208           2222          14          4.5         220.8       1.0X
+to_json(timestamp)                                14299          14570         285          0.7        1429.9       0.2X
+write timestamps to files                         12955          12969          13          0.8        1295.5       0.2X
+Create a dataset of dates                          2297           2323          30          4.4         229.7       1.0X
+to_json(date)                                      8509           8561          74          1.2         850.9       0.3X
+write dates to files                               6786           6827          45          1.5         678.6       0.3X
 
 OpenJDK 64-Bit Server VM 11.0.7+10-post-Ubuntu-2ubuntu218.04 on Linux 4.15.0-1063-aws
 Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
 Read dates and timestamps:                Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 ------------------------------------------------------------------------------------------------------------------------
-read timestamp text from files                     2524           2530           9          4.0         252.4       1.0X
-read timestamps from files                        41002          41052          59          0.2        4100.2       0.1X
-infer timestamps from files                       84621          84939         526          0.1        8462.1       0.0X
-read date text from files                          2292           2302           9          4.4         229.2       1.1X
-read date from files                              16954          16976          21          0.6        1695.4       0.1X
-timestamp strings                                  3067           3077          13          3.3         306.7       0.8X
-parse timestamps from Dataset[String]             48690          48971         243          0.2        4869.0       0.1X
-infer timestamps from Dataset[String]             97463          97786         338          0.1        9746.3       0.0X
-date strings                                       3952           3956           3          2.5         395.2       0.6X
-parse dates from Dataset[String]                  24210          24241          30          0.4        2421.0       0.1X
-from_json(timestamp)                              71710          72242         629          0.1        7171.0       0.0X
-from_json(date)                                   42465          42481          13          0.2        4246.5       0.1X
+read timestamp text from files                     2598           2613          18          3.8         259.8       1.0X
+read timestamps from files                        42007          42028          19          0.2        4200.7       0.1X
+infer timestamps from files                       18102          18120          28          0.6        1810.2       0.1X
+read date text from files                          2355           2360           5          4.2         235.5       1.1X
+read date from files                              17420          17458          33          0.6        1742.0       0.1X
+timestamp strings                                  3099           3101           3          3.2         309.9       0.8X
+parse timestamps from Dataset[String]             48188          48215          25          0.2        4818.8       0.1X
+infer timestamps from Dataset[String]             22929          22988         102          0.4        2292.9       0.1X
+date strings                                       4090           4103          11          2.4         409.0       0.6X
+parse dates from Dataset[String]                  24952          25068         139          0.4        2495.2       0.1X
+from_json(timestamp)                              66038          66352         413          0.2        6603.8       0.0X
+from_json(date)                                   43755          43782          27          0.2        4375.5       0.1X
 
 
diff --git a/sql/core/benchmarks/JsonBenchmark-results.txt b/sql/core/benchmarks/JsonBenchmark-results.txt
index 46d2410..0e4ce90 100644
--- a/sql/core/benchmarks/JsonBenchmark-results.txt
+++ b/sql/core/benchmarks/JsonBenchmark-results.txt
@@ -7,106 +7,106 @@ OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aw
 Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
 JSON schema inferring:                    Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 ------------------------------------------------------------------------------------------------------------------------
-No encoding                                       63981          64044          56          1.6         639.8       1.0X
-UTF-8 is set                                     112672         113350         962          0.9        1126.7       0.6X
+No encoding                                       64950          65182         306          1.5         649.5       1.0X
+UTF-8 is set                                     129566         129796         229          0.8        1295.7       0.5X
 
 Preparing data for benchmarking ...
 OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws
 Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
 count a short column:                     Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 ------------------------------------------------------------------------------------------------------------------------
-No encoding                                       51256          51449         180          2.0         512.6       1.0X
-UTF-8 is set                                      83694          83859         148          1.2         836.9       0.6X
+No encoding                                       50896          51277         372          2.0         509.0       1.0X
+UTF-8 is set                                      89712          89763          49          1.1         897.1       0.6X
 
 Preparing data for benchmarking ...
 OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws
 Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
 count a wide column:                      Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 ------------------------------------------------------------------------------------------------------------------------
-No encoding                                       58440          59097         569          0.2        5844.0       1.0X
-UTF-8 is set                                     102746         102883         198          0.1       10274.6       0.6X
+No encoding                                       59415          59785         372          0.2        5941.5       1.0X
+UTF-8 is set                                     103059         103165         156          0.1       10305.9       0.6X
 
 Preparing data for benchmarking ...
 OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws
 Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
 select wide row:                          Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 ------------------------------------------------------------------------------------------------------------------------
-No encoding                                      128982         129304         356          0.0      257965.0       1.0X
-UTF-8 is set                                     147247         147415         231          0.0      294494.1       0.9X
+No encoding                                      132951         133122         288          0.0      265901.9       1.0X
+UTF-8 is set                                     149318         149441         107          0.0      298635.3       0.9X
 
 Preparing data for benchmarking ...
 OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws
 Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
 Select a subset of 10 columns:            Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 ------------------------------------------------------------------------------------------------------------------------
-Select 10 columns                                 18837          19048         331          0.5        1883.7       1.0X
-Select 1 column                                   24707          24723          14          0.4        2470.7       0.8X
+Select 10 columns                                 18491          18552          85          0.5        1849.1       1.0X
+Select 1 column                                   25908          25946          65          0.4        2590.8       0.7X
 
 Preparing data for benchmarking ...
 OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws
 Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
 creation of JSON parser per line:         Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 ------------------------------------------------------------------------------------------------------------------------
-Short column without encoding                      8218           8234          17          1.2         821.8       1.0X
-Short column with UTF-8                           12374          12438         107          0.8        1237.4       0.7X
-Wide column without encoding                     136918         137298         345          0.1       13691.8       0.1X
-Wide column with UTF-8                           176961         177142         257          0.1       17696.1       0.0X
+Short column without encoding                      9264           9307          49          1.1         926.4       1.0X
+Short column with UTF-8                           14707          14727          17          0.7        1470.7       0.6X
+Wide column without encoding                     141138         141347         276          0.1       14113.8       0.1X
+Wide column with UTF-8                           179601         180035         664          0.1       17960.1       0.1X
 
 Preparing data for benchmarking ...
 OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws
 Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
 JSON functions:                           Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 ------------------------------------------------------------------------------------------------------------------------
-Text read                                          1268           1278          12          7.9         126.8       1.0X
-from_json                                         23348          23479         176          0.4        2334.8       0.1X
-json_tuple                                        29606          30221        1024          0.3        2960.6       0.0X
-get_json_object                                   21898          22148         226          0.5        2189.8       0.1X
+Text read                                          1173           1184           9          8.5         117.3       1.0X
+from_json                                         23432          23738         338          0.4        2343.2       0.1X
+json_tuple                                        32573          32851         358          0.3        3257.3       0.0X
+get_json_object                                   22442          22489          47          0.4        2244.2       0.1X
 
 Preparing data for benchmarking ...
 OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws
 Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
 Dataset of json strings:                  Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 ------------------------------------------------------------------------------------------------------------------------
-Text read                                          5887           5944          49          8.5         117.7       1.0X
-schema inferring                                  46696          47054         312          1.1         933.9       0.1X
-parsing                                           32336          32450         129          1.5         646.7       0.2X
+Text read                                          5656           5680          31          8.8         113.1       1.0X
+schema inferring                                  33283          33337          64          1.5         665.7       0.2X
+parsing                                           41771          41929         178          1.2         835.4       0.1X
 
 Preparing data for benchmarking ...
 OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws
 Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
 Json files in the per-line mode:          Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 ------------------------------------------------------------------------------------------------------------------------
-Text read                                          9756           9769          11          5.1         195.1       1.0X
-Schema inferring                                  51318          51433         108          1.0        1026.4       0.2X
-Parsing without charset                           43609          43743         118          1.1         872.2       0.2X
-Parsing with UTF-8                                60775          60844         106          0.8        1215.5       0.2X
+Text read                                          9626           9668          39          5.2         192.5       1.0X
+Schema inferring                                  39489          39579          91          1.3         789.8       0.2X
+Parsing without charset                           38096          38232         125          1.3         761.9       0.3X
+Parsing with UTF-8                                64565          64725         165          0.8        1291.3       0.1X
 
 OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws
 Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
 Write dates and timestamps:               Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 ------------------------------------------------------------------------------------------------------------------------
-Create a dataset of timestamps                     1998           2015          17          5.0         199.8       1.0X
-to_json(timestamp)                                18156          18317         263          0.6        1815.6       0.1X
-write timestamps to files                         12912          12917           5          0.8        1291.2       0.2X
-Create a dataset of dates                          2209           2270          53          4.5         220.9       0.9X
-to_json(date)                                      9433           9489          90          1.1         943.3       0.2X
-write dates to files                               6915           6923           8          1.4         691.5       0.3X
+Create a dataset of timestamps                     1898           1912          13          5.3         189.8       1.0X
+to_json(timestamp)                                20011          20092         119          0.5        2001.1       0.1X
+write timestamps to files                         13388          13427          35          0.7        1338.8       0.1X
+Create a dataset of dates                          2351           2368          18          4.3         235.1       0.8X
+to_json(date)                                     11884          11913          40          0.8        1188.4       0.2X
+write dates to files                               7317           7326           9          1.4         731.7       0.3X
 
 OpenJDK 64-Bit Server VM 1.8.0_252-8u252-b09-1~18.04-b09 on Linux 4.15.0-1063-aws
 Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
 Read dates and timestamps:                Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 ------------------------------------------------------------------------------------------------------------------------
-read timestamp text from files                     2395           2412          17          4.2         239.5       1.0X
-read timestamps from files                        47269          47334          89          0.2        4726.9       0.1X
-infer timestamps from files                       91806          91851          67          0.1        9180.6       0.0X
-read date text from files                          2118           2133          13          4.7         211.8       1.1X
-read date from files                              17267          17340         115          0.6        1726.7       0.1X
-timestamp strings                                  3906           3935          26          2.6         390.6       0.6X
-parse timestamps from Dataset[String]             52244          52534         279          0.2        5224.4       0.0X
-infer timestamps from Dataset[String]            100488         100714         198          0.1       10048.8       0.0X
-date strings                                       4572           4584          12          2.2         457.2       0.5X
-parse dates from Dataset[String]                  26749          26768          17          0.4        2674.9       0.1X
-from_json(timestamp)                              71414          71867         556          0.1        7141.4       0.0X
-from_json(date)                                   45322          45549         250          0.2        4532.2       0.1X
+read timestamp text from files                     2316           2324          13          4.3         231.6       1.0X
+read timestamps from files                        43712          43900         165          0.2        4371.2       0.1X
+infer timestamps from files                       19302          19328          38          0.5        1930.2       0.1X
+read date text from files                          2090           2099          11          4.8         209.0       1.1X
+read date from files                              18914          18940          44          0.5        1891.4       0.1X
+timestamp strings                                  3785           3793          11          2.6         378.5       0.6X
+parse timestamps from Dataset[String]             51177          51353         160          0.2        5117.7       0.0X
+infer timestamps from Dataset[String]             27907          28119         186          0.4        2790.7       0.1X
+date strings                                       4446           4452           6          2.2         444.6       0.5X
+parse dates from Dataset[String]                  28124          28172          55          0.4        2812.4       0.1X
+from_json(timestamp)                              71432          71827         354          0.1        7143.2       0.0X
+from_json(date)                                   46497          46651         163          0.2        4649.7       0.0X
 
 
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala
index 19ec586..d95d4e0 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala
@@ -2610,7 +2610,9 @@ abstract class JsonSuite extends QueryTest with SharedSparkSession with TestJson
   }
 
   test("inferring timestamp type") {
-    def schemaOf(jsons: String*): StructType = spark.read.json(jsons.toDS).schema
+    def schemaOf(jsons: String*): StructType = {
+      spark.read.option("inferTimestamp", true).json(jsons.toDS).schema
+    }
 
     assert(schemaOf(
       """{"a":"2018-12-17T10:11:12.123-01:00"}""",
@@ -2633,6 +2635,7 @@ abstract class JsonSuite extends QueryTest with SharedSparkSession with TestJson
       val timestampsWithFormatPath = s"${dir.getCanonicalPath}/timestampsWithFormat.json"
       val timestampsWithFormat = spark.read
         .option("timestampFormat", "dd/MM/yyyy HH:mm")
+        .option("inferTimestamp", true)
         .json(datesRecords)
       assert(timestampsWithFormat.schema === customSchema)
 
@@ -2645,6 +2648,7 @@ abstract class JsonSuite extends QueryTest with SharedSparkSession with TestJson
       val readBack = spark.read
         .option("timestampFormat", "yyyy-MM-dd HH:mm:ss")
         .option(DateTimeUtils.TIMEZONE_OPTION, "UTC")
+        .option("inferTimestamp", true)
         .json(timestampsWithFormatPath)
 
       assert(readBack.schema === customSchema)


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org