You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by we...@apache.org on 2020/06/09 05:04:00 UTC
[spark] branch branch-3.0 updated: [SPARK-31932][SQL][TESTS] Add
date/timestamp benchmarks for `HiveResult.hiveResultString()`
This is an automated email from the ASF dual-hosted git repository.
wenchen pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.0 by this push:
new 2a9280c [SPARK-31932][SQL][TESTS] Add date/timestamp benchmarks for `HiveResult.hiveResultString()`
2a9280c is described below
commit 2a9280ca4a6610bec0453ced7ed12174f8f43e5e
Author: Max Gekk <ma...@gmail.com>
AuthorDate: Tue Jun 9 04:59:41 2020 +0000
[SPARK-31932][SQL][TESTS] Add date/timestamp benchmarks for `HiveResult.hiveResultString()`
### What changes were proposed in this pull request?
Add benchmarks for `HiveResult.hiveResultString()/toHiveString()` to measure throughput of `toHiveString` for the date/timestamp types:
- java.sql.Date/Timestamp
- java.time.Instant
- java.time.LocalDate
Benchmark results were generated in the environment:
| Item | Description |
| ---- | ---- |
| Region | us-west-2 (Oregon) |
| Instance | r3.xlarge |
| AMI | ubuntu/images/hvm-ssd/ubuntu-bionic-18.04-amd64-server-20190722.1 (ami-06f2f779464715dc5) |
| Java | OpenJDK 64-Bit Server VM 1.8.0_242 and OpenJDK 64-Bit Server VM 11.0.6+10 |
### Why are the changes needed?
To detect perf regressions of `toHiveString` in the future.
### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?
By running `DateTimeBenchmark` and checking the dataset content.
Closes #28757 from MaxGekk/benchmark-toHiveString.
Authored-by: Max Gekk <ma...@gmail.com>
Signed-off-by: Wenchen Fan <we...@databricks.com>
(cherry picked from commit ddd8d5f5a0b6db17babc201ba4b73f7df91df1a3)
Signed-off-by: Wenchen Fan <we...@databricks.com>
---
.../benchmarks/DateTimeBenchmark-jdk11-results.txt | 4 ++
sql/core/benchmarks/DateTimeBenchmark-results.txt | 4 ++
.../execution/benchmark/DateTimeBenchmark.scala | 46 ++++++++++++++++++----
3 files changed, 46 insertions(+), 8 deletions(-)
diff --git a/sql/core/benchmarks/DateTimeBenchmark-jdk11-results.txt b/sql/core/benchmarks/DateTimeBenchmark-jdk11-results.txt
index f4ed8ce..70d8882 100644
--- a/sql/core/benchmarks/DateTimeBenchmark-jdk11-results.txt
+++ b/sql/core/benchmarks/DateTimeBenchmark-jdk11-results.txt
@@ -453,5 +453,9 @@ From java.time.Instant 325 328
Collect longs 1300 1321 25 3.8 260.0 0.3X
Collect java.sql.Timestamp 1450 1557 102 3.4 290.0 0.3X
Collect java.time.Instant 1499 1599 87 3.3 299.9 0.3X
+java.sql.Date to Hive string 17536 18367 1059 0.3 3507.2 0.0X
+java.time.LocalDate to Hive string 12089 12897 725 0.4 2417.8 0.0X
+java.sql.Timestamp to Hive string 48014 48625 752 0.1 9602.9 0.0X
+java.time.Instant to Hive string 37346 37445 93 0.1 7469.1 0.0X
diff --git a/sql/core/benchmarks/DateTimeBenchmark-results.txt b/sql/core/benchmarks/DateTimeBenchmark-results.txt
index 7a9aa4b..0795f11 100644
--- a/sql/core/benchmarks/DateTimeBenchmark-results.txt
+++ b/sql/core/benchmarks/DateTimeBenchmark-results.txt
@@ -453,5 +453,9 @@ From java.time.Instant 236 243
Collect longs 1280 1337 79 3.9 256.1 0.3X
Collect java.sql.Timestamp 1485 1501 15 3.4 297.0 0.3X
Collect java.time.Instant 1441 1465 37 3.5 288.1 0.3X
+java.sql.Date to Hive string 18745 20895 1364 0.3 3749.0 0.0X
+java.time.LocalDate to Hive string 15296 15450 143 0.3 3059.2 0.0X
+java.sql.Timestamp to Hive string 46421 47210 946 0.1 9284.2 0.0X
+java.time.Instant to Hive string 34747 35187 382 0.1 6949.4 0.0X
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DateTimeBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DateTimeBenchmark.scala
index f56efa3..c7b8737 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DateTimeBenchmark.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DateTimeBenchmark.scala
@@ -21,8 +21,10 @@ import java.sql.{Date, Timestamp}
import java.time.{Instant, LocalDate}
import org.apache.spark.benchmark.Benchmark
+import org.apache.spark.sql.Dataset
import org.apache.spark.sql.catalyst.util.DateTimeConstants.MILLIS_PER_DAY
import org.apache.spark.sql.catalyst.util.DateTimeTestUtils.{withDefaultTimeZone, LA}
+import org.apache.spark.sql.execution.HiveResult
import org.apache.spark.sql.internal.SQLConf
/**
@@ -182,14 +184,19 @@ object DateTimeBenchmark extends SqlBasedBenchmark {
benchmark.addCase("From java.time.LocalDate", numIters) { _ =>
spark.range(rowsNum).map(millis => LocalDate.ofEpochDay(millis / MILLIS_PER_DAY)).noop()
}
+ def dates = {
+ spark.range(0, rowsNum, 1, 1).map(millis => new Date(millis))
+ }
benchmark.addCase("Collect java.sql.Date", numIters) { _ =>
- spark.range(0, rowsNum, 1, 1).map(millis => new Date(millis)).collect()
+ dates.collect()
+ }
+ def localDates = {
+ spark.range(0, rowsNum, 1, 1)
+ .map(millis => LocalDate.ofEpochDay(millis / MILLIS_PER_DAY))
}
benchmark.addCase("Collect java.time.LocalDate", numIters) { _ =>
withSQLConf(SQLConf.DATETIME_JAVA8API_ENABLED.key -> "true") {
- spark.range(0, rowsNum, 1, 1)
- .map(millis => LocalDate.ofEpochDay(millis / MILLIS_PER_DAY))
- .collect()
+ localDates.collect()
}
}
benchmark.addCase("From java.sql.Timestamp", numIters) { _ =>
@@ -202,14 +209,37 @@ object DateTimeBenchmark extends SqlBasedBenchmark {
spark.range(0, rowsNum, 1, 1)
.collect()
}
+ def timestamps = {
+ spark.range(0, rowsNum, 1, 1).map(millis => new Timestamp(millis))
+ }
benchmark.addCase("Collect java.sql.Timestamp", numIters) { _ =>
- spark.range(0, rowsNum, 1, 1).map(millis => new Timestamp(millis)).collect()
+ timestamps.collect()
+ }
+ def instants = {
+ spark.range(0, rowsNum, 1, 1).map(millis => Instant.ofEpochMilli(millis))
}
benchmark.addCase("Collect java.time.Instant", numIters) { _ =>
withSQLConf(SQLConf.DATETIME_JAVA8API_ENABLED.key -> "true") {
- spark.range(0, rowsNum, 1, 1)
- .map(millis => Instant.ofEpochMilli(millis))
- .collect()
+ instants.collect()
+ }
+ }
+ def toHiveString(df: Dataset[_]): Unit = {
+ HiveResult.hiveResultString(df.queryExecution.executedPlan)
+ }
+ benchmark.addCase("java.sql.Date to Hive string", numIters) { _ =>
+ toHiveString(dates)
+ }
+ benchmark.addCase("java.time.LocalDate to Hive string", numIters) { _ =>
+ withSQLConf(SQLConf.DATETIME_JAVA8API_ENABLED.key -> "true") {
+ toHiveString(localDates)
+ }
+ }
+ benchmark.addCase("java.sql.Timestamp to Hive string", numIters) { _ =>
+ toHiveString(timestamps)
+ }
+ benchmark.addCase("java.time.Instant to Hive string", numIters) { _ =>
+ withSQLConf(SQLConf.DATETIME_JAVA8API_ENABLED.key -> "true") {
+ toHiveString(instants)
}
}
benchmark.run()
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org