You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by gu...@apache.org on 2019/04/23 02:09:41 UTC

[spark] branch master updated: [SPARK-27535][SQL][TEST] Date and timestamp JSON benchmarks

This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new 93a264d  [SPARK-27535][SQL][TEST] Date and timestamp JSON benchmarks
93a264d is described below

commit 93a264d05a55c2617d34e977dbaf182987187a27
Author: Maxim Gekk <ma...@gmail.com>
AuthorDate: Tue Apr 23 11:09:14 2019 +0900

    [SPARK-27535][SQL][TEST] Date and timestamp JSON benchmarks
    
    ## What changes were proposed in this pull request?
    
    Added new JSON benchmarks related to date and timestamp operations:
    - Write date/timestamp to JSON files
    - `to_json()` and `from_json()` for dates and timestamps
    - Read dates/timestamps from JSON files, and infer schemas
    - Parse and infer schemas from `Dataset[String]`
    
    Also, the existing JSON benchmarks are ported to the `NoOp` datasource.
    
    Closes #24430 from MaxGekk/json-datetime-benchmark.
    
    Authored-by: Maxim Gekk <ma...@gmail.com>
    Signed-off-by: HyukjinKwon <gu...@apache.org>
---
 sql/core/benchmarks/JSONBenchmark-results.txt      |  79 +++++++++----
 .../execution/datasources/json/JsonBenchmark.scala | 126 ++++++++++++++++++++-
 2 files changed, 179 insertions(+), 26 deletions(-)

diff --git a/sql/core/benchmarks/JSONBenchmark-results.txt b/sql/core/benchmarks/JSONBenchmark-results.txt
index 2b784c3..7846983 100644
--- a/sql/core/benchmarks/JSONBenchmark-results.txt
+++ b/sql/core/benchmarks/JSONBenchmark-results.txt
@@ -7,77 +7,106 @@ Java HotSpot(TM) 64-Bit Server VM 1.8.0_202-b08 on Mac OS X 10.14.4
 Intel(R) Core(TM) i7-4850HQ CPU @ 2.30GHz
 JSON schema inferring:                    Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 ------------------------------------------------------------------------------------------------------------------------
-No encoding                                       51280          51722         420          2.0         512.8       1.0X
-UTF-8 is set                                      75009          77276        1963          1.3         750.1       0.7X
+No encoding                                       50949          51086         150          2.0         509.5       1.0X
+UTF-8 is set                                      72012          72147         120          1.4         720.1       0.7X
 
 Preparing data for benchmarking ...
 Java HotSpot(TM) 64-Bit Server VM 1.8.0_202-b08 on Mac OS X 10.14.4
 Intel(R) Core(TM) i7-4850HQ CPU @ 2.30GHz
 count a short column:                     Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 ------------------------------------------------------------------------------------------------------------------------
-No encoding                                       39675          39738          83          2.5         396.7       1.0X
-UTF-8 is set                                      62755          64399        1436          1.6         627.5       0.6X
+No encoding                                       36799          36891          80          2.7         368.0       1.0X
+UTF-8 is set                                      59796          59880          74          1.7         598.0       0.6X
 
 Preparing data for benchmarking ...
 Java HotSpot(TM) 64-Bit Server VM 1.8.0_202-b08 on Mac OS X 10.14.4
 Intel(R) Core(TM) i7-4850HQ CPU @ 2.30GHz
 count a wide column:                      Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 ------------------------------------------------------------------------------------------------------------------------
-No encoding                                       56429          56468          65          0.2        5642.9       1.0X
-UTF-8 is set                                      81078          81454         374          0.1        8107.8       0.7X
+No encoding                                       55803          55967         152          0.2        5580.3       1.0X
+UTF-8 is set                                      80623          80825         178          0.1        8062.3       0.7X
 
 Preparing data for benchmarking ...
 Java HotSpot(TM) 64-Bit Server VM 1.8.0_202-b08 on Mac OS X 10.14.4
 Intel(R) Core(TM) i7-4850HQ CPU @ 2.30GHz
 select wide row:                          Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 ------------------------------------------------------------------------------------------------------------------------
-No encoding                                       95329          95557         265          0.0      190658.2       1.0X
-UTF-8 is set                                     102827         102967         166          0.0      205654.2       0.9X
+No encoding                                       84263          85750        1476          0.0      168526.2       1.0X
+UTF-8 is set                                      98848         100183        1592          0.0      197696.0       0.9X
 
 Preparing data for benchmarking ...
 Java HotSpot(TM) 64-Bit Server VM 1.8.0_202-b08 on Mac OS X 10.14.4
 Intel(R) Core(TM) i7-4850HQ CPU @ 2.30GHz
 Select a subset of 10 columns:            Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 ------------------------------------------------------------------------------------------------------------------------
-Select 10 columns                                 14102          14136          52          0.7        1410.2       1.0X
-Select 1 column                                   17487          17537          51          0.6        1748.7       0.8X
+Select 10 columns                                 13930          13996          60          0.7        1393.0       1.0X
+Select 1 column                                   17092          17394         360          0.6        1709.2       0.8X
 
 Preparing data for benchmarking ...
 Java HotSpot(TM) 64-Bit Server VM 1.8.0_202-b08 on Mac OS X 10.14.4
 Intel(R) Core(TM) i7-4850HQ CPU @ 2.30GHz
 creation of JSON parser per line:         Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 ------------------------------------------------------------------------------------------------------------------------
-Short column without encoding                      6013           6066          70          1.7         601.3       1.0X
-Short column with UTF-8                            8031           8079          45          1.2         803.1       0.7X
-Wide column without encoding                     107093         108539         NaN          0.1       10709.3       0.1X
-Wide column with UTF-8                           130983         132518        1346          0.1       13098.3       0.0X
+Short column without encoding                      5596           5711         101          1.8         559.6       1.0X
+Short column with UTF-8                            7983           8158         160          1.3         798.3       0.7X
+Wide column without encoding                     110189         118451         NaN          0.1       11018.9       0.1X
+Wide column with UTF-8                           137827         142813         NaN          0.1       13782.7       0.0X
 
 Preparing data for benchmarking ...
 Java HotSpot(TM) 64-Bit Server VM 1.8.0_202-b08 on Mac OS X 10.14.4
 Intel(R) Core(TM) i7-4850HQ CPU @ 2.30GHz
 JSON functions:                           Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 ------------------------------------------------------------------------------------------------------------------------
-Text read                                           939            950          11         10.6          93.9       1.0X
-from_json                                         12924          12944          26          0.8        1292.4       0.1X
-json_tuple                                        15312          15771         432          0.7        1531.2       0.1X
-get_json_object                                   13049          13475         714          0.8        1304.9       0.1X
+Text read                                           951            953           2         10.5          95.1       1.0X
+from_json                                         13015          13045          27          0.8        1301.5       0.1X
+json_tuple                                        16257          16306          43          0.6        1625.7       0.1X
+get_json_object                                   13195          13225          39          0.8        1319.5       0.1X
 
 Preparing data for benchmarking ...
 Java HotSpot(TM) 64-Bit Server VM 1.8.0_202-b08 on Mac OS X 10.14.4
 Intel(R) Core(TM) i7-4850HQ CPU @ 2.30GHz
 Dataset of json strings:                  Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 ------------------------------------------------------------------------------------------------------------------------
-Text read                                          4556           4630         108         11.0          91.1       1.0X
-schema inferring                                  23624          24338         626          2.1         472.5       0.2X
-parsing                                           22342          22420          81          2.2         446.8       0.2X
+Text read                                          4632           4687          49         10.8          92.6       1.0X
+schema inferring                                  29176          29297         146          1.7         583.5       0.2X
+parsing                                           24268          24457         175          2.1         485.4       0.2X
 
 Preparing data for benchmarking ...
 Java HotSpot(TM) 64-Bit Server VM 1.8.0_202-b08 on Mac OS X 10.14.4
 Intel(R) Core(TM) i7-4850HQ CPU @ 2.30GHz
 Json files in the per-line mode:          Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 ------------------------------------------------------------------------------------------------------------------------
-Text read                                          7537           7556          26          6.6         150.7       1.0X
-Schema inferring                                  27875          28306         499          1.8         557.5       0.3X
-Parsing without charset                           26030          26083          67          1.9         520.6       0.3X
-Parsing with UTF-8                                37115          37480         392          1.3         742.3       0.2X
+Text read                                          8264           8272           7          6.1         165.3       1.0X
+Schema inferring                                  31910          32375         543          1.6         638.2       0.3X
+Parsing without charset                           29290          29397         124          1.7         585.8       0.3X
+Parsing with UTF-8                                41301          41390          81          1.2         826.0       0.2X
+
+Java HotSpot(TM) 64-Bit Server VM 1.8.0_202-b08 on Mac OS X 10.14.4
+Intel(R) Core(TM) i7-4850HQ CPU @ 2.30GHz
+Write dates and timestamps:               Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
+------------------------------------------------------------------------------------------------------------------------
+Create a dataset of timestamps                     1149           1160          11          8.7         114.9       1.0X
+to_json(timestamp)                                11585          11688         140          0.9        1158.5       0.1X
+write timestamps to files                         10212          10260          49          1.0        1021.2       0.1X
+Create a dataset of dates                          1322           1328          10          7.6         132.2       0.9X
+to_json(date)                                      7226           7241          14          1.4         722.6       0.2X
+write dates to files                               5634           5648          20          1.8         563.4       0.2X
+
+Java HotSpot(TM) 64-Bit Server VM 1.8.0_202-b08 on Mac OS X 10.14.4
+Intel(R) Core(TM) i7-4850HQ CPU @ 2.30GHz
+Read dates and timestamps:                Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
+------------------------------------------------------------------------------------------------------------------------
+read timestamp text from files                     2097           2137          41          4.8         209.7       1.0X
+read timestamps from files                        20438          20451          11          0.5        2043.8       0.1X
+infer timestamps from files                       41694          41770          66          0.2        4169.4       0.1X
+read date text from files                          1832           1847          16          5.5         183.2       1.1X
+read date from files                              13796          13837          49          0.7        1379.6       0.2X
+timestamp strings                                  3213           3233          26          3.1         321.3       0.7X
+parse timestamps from Dataset[String]             22686          22743          53          0.4        2268.6       0.1X
+infer timestamps from Dataset[String]             45301          45368          58          0.2        4530.1       0.0X
+date strings                                       3431           3439           7          2.9         343.1       0.6X
+parse dates from Dataset[String]                  17688          17734          41          0.6        1768.8       0.1X
+from_json(timestamp)                              33439          33456          24          0.3        3343.9       0.1X
+from_json(date)                                   24055          24164         107          0.4        2405.5       0.1X
+
 
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonBenchmark.scala
index f9e867b..f486e60 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonBenchmark.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonBenchmark.scala
@@ -16,10 +16,13 @@
  */
 package org.apache.spark.sql.execution.datasources.json
 
+import java.io.File
+import java.time.{Instant, LocalDate}
+
 import org.apache.spark.benchmark.Benchmark
 import org.apache.spark.sql.{Dataset, Row}
 import org.apache.spark.sql.execution.benchmark.SqlBasedBenchmark
-import org.apache.spark.sql.functions.{from_json, get_json_object, json_tuple, lit}
+import org.apache.spark.sql.functions._
 import org.apache.spark.sql.types._
 
 /**
@@ -374,6 +377,126 @@ object JSONBenchmark extends SqlBasedBenchmark {
     }
   }
 
+  /**
+   * Benchmarks writing and reading of dates and timestamps in JSON: `to_json`/`from_json`,
+   * JSON files, and parsing/schema inference over `Dataset[String]`.
+   *
+   * @param rowsNum number of rows generated for every benchmark case
+   * @param numIters number of iterations passed to every benchmark case
+   */
+  private def datetimeBenchmark(rowsNum: Int, numIters: Int): Unit = {
+    // Datasets are `def`s, so every case rebuilds them; the generation cost is measured by
+    // the separate "Create a dataset of ..." and "... strings" cases.
+    def timestamps = spark.range(0, rowsNum, 1, 1).mapPartitions { iter =>
+      iter.map(Instant.ofEpochSecond(_))
+    }.select($"value".as("timestamp"))
+
+    def dates = spark.range(0, rowsNum, 1, 1).mapPartitions { iter =>
+      iter.map(d => LocalDate.ofEpochDay(d % (100 * 365)))
+    }.select($"value".as("date"))
+
+    withTempPath { path =>
+      val timestampDir = new File(path, "timestamp").getAbsolutePath
+      val dateDir = new File(path, "date").getAbsolutePath
+
+      val writeBench = new Benchmark("Write dates and timestamps", rowsNum, output = output)
+      writeBench.addCase("Create a dataset of timestamps", numIters) { _ =>
+        run(timestamps)
+      }
+
+      writeBench.addCase("to_json(timestamp)", numIters) { _ =>
+        run(timestamps.select(to_json(struct($"timestamp"))))
+      }
+
+      writeBench.addCase("write timestamps to files", numIters) { _ =>
+        timestamps.write.mode("overwrite").json(timestampDir)
+      }
+
+      writeBench.addCase("Create a dataset of dates", numIters) { _ =>
+        run(dates)
+      }
+
+      writeBench.addCase("to_json(date)", numIters) { _ =>
+        run(dates.select(to_json(struct($"date"))))
+      }
+
+      writeBench.addCase("write dates to files", numIters) { _ =>
+        dates.write.mode("overwrite").json(dateDir)
+      }
+
+      writeBench.run()
+
+      // The file-based read cases consume the directories written by the cases above.
+      val readBench = new Benchmark("Read dates and timestamps", rowsNum, output = output)
+      val tsSchema = new StructType().add("timestamp", TimestampType)
+
+      readBench.addCase("read timestamp text from files", numIters) { _ =>
+        run(spark.read.text(timestampDir))
+      }
+
+      readBench.addCase("read timestamps from files", numIters) { _ =>
+        run(spark.read.schema(tsSchema).json(timestampDir))
+      }
+
+      readBench.addCase("infer timestamps from files", numIters) { _ =>
+        run(spark.read.json(timestampDir))
+      }
+
+      val dateSchema = new StructType().add("date", DateType)
+
+      readBench.addCase("read date text from files", numIters) { _ =>
+        run(spark.read.text(dateDir))
+      }
+
+      readBench.addCase("read date from files", numIters) { _ =>
+        run(spark.read.schema(dateSchema).json(dateDir))
+      }
+
+      // The fraction ranges over 100..199, so it is always exactly three digits long.
+      def timestampStr: Dataset[String] = {
+        spark.range(0, rowsNum, 1, 1).mapPartitions { iter =>
+          iter.map(i => s"""{"timestamp":"1970-01-01T01:02:03.${100 + i % 100}Z"}""")
+        }.select($"value".as("timestamp")).as[String]
+      }
+
+      readBench.addCase("timestamp strings", numIters) { _ =>
+        run(timestampStr)
+      }
+
+      readBench.addCase("parse timestamps from Dataset[String]", numIters) { _ =>
+        run(spark.read.schema(tsSchema).json(timestampStr))
+      }
+
+      readBench.addCase("infer timestamps from Dataset[String]", numIters) { _ =>
+        run(spark.read.json(timestampStr))
+      }
+
+      def dateStr: Dataset[String] = {
+        spark.range(0, rowsNum, 1, 1).mapPartitions { iter =>
+          iter.map(i => s"""{"date":"${LocalDate.ofEpochDay(i % 1000 * 365).toString}"}""")
+        }.select($"value".as("date")).as[String]
+      }
+
+      readBench.addCase("date strings", numIters) { _ =>
+        run(dateStr)
+      }
+
+      readBench.addCase("parse dates from Dataset[String]", numIters) { _ =>
+        run(spark.read.schema(dateSchema).json(dateStr))
+      }
+
+      readBench.addCase("from_json(timestamp)", numIters) { _ =>
+        run(timestampStr.select(from_json($"timestamp", tsSchema, Map.empty[String, String])))
+      }
+
+      readBench.addCase("from_json(date)", numIters) { _ =>
+        run(dateStr.select(from_json($"date", dateSchema, Map.empty[String, String])))
+      }
+
+      readBench.run()
+    }
+  }
+
   override def runBenchmarkSuite(mainArgs: Array[String]): Unit = {
     val numIters = 3
     runBenchmark("Benchmark for performance of JSON parsing") {
@@ -386,6 +509,7 @@ object JSONBenchmark extends SqlBasedBenchmark {
       jsonFunctions(10 * 1000 * 1000, numIters)
       jsonInDS(50 * 1000 * 1000, numIters)
       jsonInFile(50 * 1000 * 1000, numIters)
+      datetimeBenchmark(rowsNum = 10 * 1000 * 1000, numIters)
     }
   }
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org