You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by do...@apache.org on 2018/09/29 16:48:11 UTC
spark git commit: [SPARK-25508][SQL][TEST] Refactor OrcReadBenchmark to use main method
Repository: spark
Updated Branches:
refs/heads/master 623c2ec4e -> f246813af
[SPARK-25508][SQL][TEST] Refactor OrcReadBenchmark to use main method
## What changes were proposed in this pull request?
Refactor OrcReadBenchmark to use main method.
Generate benchmark result:
```
SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "hive/test:runMain org.apache.spark.sql.hive.orc.OrcReadBenchmark"
```
## How was this patch tested?
manual tests
Closes #22580 from yucai/SPARK-25508.
Lead-authored-by: yucai <yy...@ebay.com>
Co-authored-by: Yucai Yu <yu...@foxmail.com>
Co-authored-by: Dongjoon Hyun <do...@apache.org>
Signed-off-by: Dongjoon Hyun <do...@apache.org>
Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f246813a
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f246813a
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f246813a
Branch: refs/heads/master
Commit: f246813afba16fee4d703f09e6302011b11806f3
Parents: 623c2ec
Author: yucai <yy...@ebay.com>
Authored: Sat Sep 29 09:48:03 2018 -0700
Committer: Dongjoon Hyun <do...@apache.org>
Committed: Sat Sep 29 09:48:03 2018 -0700
----------------------------------------------------------------------
.../benchmarks/OrcReadBenchmark-results.txt | 173 ++++++++++++++++
.../spark/sql/hive/orc/OrcReadBenchmark.scala | 196 ++++---------------
2 files changed, 212 insertions(+), 157 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/spark/blob/f246813a/sql/hive/benchmarks/OrcReadBenchmark-results.txt
----------------------------------------------------------------------
diff --git a/sql/hive/benchmarks/OrcReadBenchmark-results.txt b/sql/hive/benchmarks/OrcReadBenchmark-results.txt
new file mode 100644
index 0000000..c77f966
--- /dev/null
+++ b/sql/hive/benchmarks/OrcReadBenchmark-results.txt
@@ -0,0 +1,173 @@
+================================================================================================
+SQL Single Numeric Column Scan
+================================================================================================
+
+OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64
+Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+SQL Single TINYINT Column Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative
+------------------------------------------------------------------------------------------------
+Native ORC MR 1630 / 1639 9.7 103.6 1.0X
+Native ORC Vectorized 253 / 288 62.2 16.1 6.4X
+Native ORC Vectorized with copy 227 / 244 69.2 14.5 7.2X
+Hive built-in ORC 1980 / 1991 7.9 125.9 0.8X
+
+OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64
+Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+SQL Single SMALLINT Column Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative
+------------------------------------------------------------------------------------------------
+Native ORC MR 1587 / 1589 9.9 100.9 1.0X
+Native ORC Vectorized 227 / 242 69.2 14.5 7.0X
+Native ORC Vectorized with copy 228 / 238 69.0 14.5 7.0X
+Hive built-in ORC 2323 / 2332 6.8 147.7 0.7X
+
+OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64
+Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+SQL Single INT Column Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative
+------------------------------------------------------------------------------------------------
+Native ORC MR 1726 / 1771 9.1 109.7 1.0X
+Native ORC Vectorized 309 / 333 50.9 19.7 5.6X
+Native ORC Vectorized with copy 313 / 321 50.2 19.9 5.5X
+Hive built-in ORC 2668 / 2672 5.9 169.6 0.6X
+
+OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64
+Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+SQL Single BIGINT Column Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative
+------------------------------------------------------------------------------------------------
+Native ORC MR 1722 / 1747 9.1 109.5 1.0X
+Native ORC Vectorized 395 / 403 39.8 25.1 4.4X
+Native ORC Vectorized with copy 399 / 405 39.4 25.4 4.3X
+Hive built-in ORC 2767 / 2777 5.7 175.9 0.6X
+
+OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64
+Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+SQL Single FLOAT Column Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative
+------------------------------------------------------------------------------------------------
+Native ORC MR 1797 / 1824 8.8 114.2 1.0X
+Native ORC Vectorized 434 / 441 36.2 27.6 4.1X
+Native ORC Vectorized with copy 437 / 447 36.0 27.8 4.1X
+Hive built-in ORC 2701 / 2710 5.8 171.7 0.7X
+
+OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64
+Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+SQL Single DOUBLE Column Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative
+------------------------------------------------------------------------------------------------
+Native ORC MR 1931 / 2028 8.1 122.8 1.0X
+Native ORC Vectorized 542 / 557 29.0 34.5 3.6X
+Native ORC Vectorized with copy 550 / 564 28.6 35.0 3.5X
+Hive built-in ORC 2816 / 3206 5.6 179.1 0.7X
+
+
+================================================================================================
+Int and String Scan
+================================================================================================
+
+OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64
+Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+Int and String Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative
+------------------------------------------------------------------------------------------------
+Native ORC MR 4012 / 4068 2.6 382.6 1.0X
+Native ORC Vectorized 2337 / 2339 4.5 222.9 1.7X
+Native ORC Vectorized with copy 2520 / 2540 4.2 240.3 1.6X
+Hive built-in ORC 5503 / 5575 1.9 524.8 0.7X
+
+
+================================================================================================
+Partitioned Table Scan
+================================================================================================
+
+OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64
+Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+Partitioned Table: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative
+------------------------------------------------------------------------------------------------
+Data column - Native ORC MR 2020 / 2025 7.8 128.4 1.0X
+Data column - Native ORC Vectorized 398 / 409 39.5 25.3 5.1X
+Data column - Native ORC Vectorized with copy 406 / 411 38.8 25.8 5.0X
+Data column - Hive built-in ORC 2967 / 2969 5.3 188.6 0.7X
+Partition column - Native ORC MR 1494 / 1505 10.5 95.0 1.4X
+Partition column - Native ORC Vectorized 73 / 82 216.3 4.6 27.8X
+Partition column - Native ORC Vectorized with copy 71 / 80 221.4 4.5 28.4X
+Partition column - Hive built-in ORC 1932 / 1937 8.1 122.8 1.0X
+Both columns - Native ORC MR 2057 / 2071 7.6 130.8 1.0X
+Both columns - Native ORC Vectorized 445 / 448 35.4 28.3 4.5X
+Both column - Native ORC Vectorized with copy 534 / 539 29.4 34.0 3.8X
+Both columns - Hive built-in ORC 2994 / 2994 5.3 190.3 0.7X
+
+
+================================================================================================
+Repeated String Scan
+================================================================================================
+
+OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64
+Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+Repeated String: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative
+------------------------------------------------------------------------------------------------
+Native ORC MR 1771 / 1785 5.9 168.9 1.0X
+Native ORC Vectorized 372 / 375 28.2 35.5 4.8X
+Native ORC Vectorized with copy 543 / 576 19.3 51.8 3.3X
+Hive built-in ORC 2671 / 2671 3.9 254.7 0.7X
+
+
+================================================================================================
+String with Nulls Scan
+================================================================================================
+
+OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64
+Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+String with Nulls Scan (0.0%): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative
+------------------------------------------------------------------------------------------------
+Native ORC MR 3276 / 3302 3.2 312.5 1.0X
+Native ORC Vectorized 1057 / 1080 9.9 100.8 3.1X
+Native ORC Vectorized with copy 1420 / 1431 7.4 135.4 2.3X
+Hive built-in ORC 5377 / 5407 2.0 512.8 0.6X
+
+OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64
+Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+String with Nulls Scan (0.5%): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative
+------------------------------------------------------------------------------------------------
+Native ORC MR 3147 / 3147 3.3 300.1 1.0X
+Native ORC Vectorized 1305 / 1319 8.0 124.4 2.4X
+Native ORC Vectorized with copy 1685 / 1686 6.2 160.7 1.9X
+Hive built-in ORC 4077 / 4085 2.6 388.8 0.8X
+
+OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64
+Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+String with Nulls Scan (0.95%): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative
+------------------------------------------------------------------------------------------------
+Native ORC MR 1739 / 1744 6.0 165.8 1.0X
+Native ORC Vectorized 500 / 501 21.0 47.7 3.5X
+Native ORC Vectorized with copy 618 / 631 17.0 58.9 2.8X
+Hive built-in ORC 2411 / 2427 4.3 229.9 0.7X
+
+
+================================================================================================
+Single Column Scan From Wide Columns
+================================================================================================
+
+OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64
+Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+Single Column Scan from 100 columns: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative
+------------------------------------------------------------------------------------------------
+Native ORC MR 1348 / 1366 0.8 1285.3 1.0X
+Native ORC Vectorized 119 / 134 8.8 113.5 11.3X
+Native ORC Vectorized with copy 119 / 148 8.8 113.9 11.3X
+Hive built-in ORC 487 / 507 2.2 464.8 2.8X
+
+OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64
+Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+Single Column Scan from 200 columns: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative
+------------------------------------------------------------------------------------------------
+Native ORC MR 2667 / 2837 0.4 2543.6 1.0X
+Native ORC Vectorized 203 / 222 5.2 193.4 13.2X
+Native ORC Vectorized with copy 217 / 255 4.8 207.0 12.3X
+Hive built-in ORC 737 / 741 1.4 702.4 3.6X
+
+OpenJDK 64-Bit Server VM 1.8.0_181-b13 on Linux 3.10.0-862.3.2.el7.x86_64
+Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+Single Column Scan from 300 columns: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative
+------------------------------------------------------------------------------------------------
+Native ORC MR 3954 / 3956 0.3 3770.4 1.0X
+Native ORC Vectorized 348 / 360 3.0 331.7 11.4X
+Native ORC Vectorized with copy 349 / 359 3.0 333.2 11.3X
+Hive built-in ORC 1057 / 1067 1.0 1008.0 3.7X
+
+
http://git-wip-us.apache.org/repos/asf/spark/blob/f246813a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcReadBenchmark.scala
----------------------------------------------------------------------
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcReadBenchmark.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcReadBenchmark.scala
index 49de007..0bb5e8c 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcReadBenchmark.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcReadBenchmark.scala
@@ -22,20 +22,26 @@ import java.io.File
import scala.util.Random
import org.apache.spark.SparkConf
-import org.apache.spark.benchmark.Benchmark
+import org.apache.spark.benchmark.{Benchmark, BenchmarkBase}
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.sql.catalyst.plans.SQLHelper
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.types._
-
/**
* Benchmark to measure ORC read performance.
+ * {{{
+ * To run this benchmark:
+ * 1. without sbt: bin/spark-submit --class <this class> <spark hive test jar>
+ * 2. build/sbt "hive/test:runMain <this class>"
+ * 3. generate result: SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "hive/test:runMain <this class>"
+ * Results will be written to "benchmarks/OrcReadBenchmark-results.txt".
+ * }}}
*
* This is in `sql/hive` module in order to compare `sql/core` and `sql/hive` ORC data sources.
*/
// scalastyle:off line.size.limit
-object OrcReadBenchmark extends SQLHelper {
+object OrcReadBenchmark extends BenchmarkBase with SQLHelper {
val conf = new SparkConf()
conf.set("orc.compression", "snappy")
@@ -69,7 +75,7 @@ object OrcReadBenchmark extends SQLHelper {
}
def numericScanBenchmark(values: Int, dataType: DataType): Unit = {
- val benchmark = new Benchmark(s"SQL Single ${dataType.sql} Column Scan", values)
+ val benchmark = new Benchmark(s"SQL Single ${dataType.sql} Column Scan", values, output = output)
withTempPath { dir =>
withTempTable("t1", "nativeOrcTable", "hiveOrcTable") {
@@ -98,59 +104,13 @@ object OrcReadBenchmark extends SQLHelper {
spark.sql("SELECT sum(id) FROM hiveOrcTable").collect()
}
- /*
- Java HotSpot(TM) 64-Bit Server VM 1.8.0_60-b27 on Mac OS X 10.13.1
- Intel(R) Core(TM) i7-4960HQ CPU @ 2.60GHz
-
- SQL Single TINYINT Column Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative
- ------------------------------------------------------------------------------------------------
- Native ORC MR 1135 / 1171 13.9 72.2 1.0X
- Native ORC Vectorized 152 / 163 103.4 9.7 7.5X
- Native ORC Vectorized with copy 149 / 162 105.4 9.5 7.6X
- Hive built-in ORC 1380 / 1384 11.4 87.7 0.8X
-
- SQL Single SMALLINT Column Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative
- ------------------------------------------------------------------------------------------------
- Native ORC MR 1182 / 1244 13.3 75.2 1.0X
- Native ORC Vectorized 145 / 156 108.7 9.2 8.2X
- Native ORC Vectorized with copy 148 / 158 106.4 9.4 8.0X
- Hive built-in ORC 1591 / 1636 9.9 101.2 0.7X
-
- SQL Single INT Column Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative
- ------------------------------------------------------------------------------------------------
- Native ORC MR 1271 / 1271 12.4 80.8 1.0X
- Native ORC Vectorized 206 / 212 76.3 13.1 6.2X
- Native ORC Vectorized with copy 200 / 213 78.8 12.7 6.4X
- Hive built-in ORC 1776 / 1787 8.9 112.9 0.7X
-
- SQL Single BIGINT Column Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative
- ------------------------------------------------------------------------------------------------
- Native ORC MR 1344 / 1355 11.7 85.4 1.0X
- Native ORC Vectorized 258 / 268 61.0 16.4 5.2X
- Native ORC Vectorized with copy 252 / 257 62.4 16.0 5.3X
- Hive built-in ORC 1818 / 1823 8.7 115.6 0.7X
-
- SQL Single FLOAT Column Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative
- ------------------------------------------------------------------------------------------------
- Native ORC MR 1333 / 1352 11.8 84.8 1.0X
- Native ORC Vectorized 310 / 324 50.7 19.7 4.3X
- Native ORC Vectorized with copy 312 / 320 50.4 19.9 4.3X
- Hive built-in ORC 1904 / 1918 8.3 121.0 0.7X
-
- SQL Single DOUBLE Column Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative
- ------------------------------------------------------------------------------------------------
- Native ORC MR 1408 / 1585 11.2 89.5 1.0X
- Native ORC Vectorized 359 / 368 43.8 22.8 3.9X
- Native ORC Vectorized with copy 364 / 371 43.2 23.2 3.9X
- Hive built-in ORC 1881 / 1954 8.4 119.6 0.7X
- */
benchmark.run()
}
}
}
def intStringScanBenchmark(values: Int): Unit = {
- val benchmark = new Benchmark("Int and String Scan", values)
+ val benchmark = new Benchmark("Int and String Scan", values, output = output)
withTempPath { dir =>
withTempTable("t1", "nativeOrcTable", "hiveOrcTable") {
@@ -181,24 +141,13 @@ object OrcReadBenchmark extends SQLHelper {
spark.sql("SELECT sum(c1), sum(length(c2)) FROM hiveOrcTable").collect()
}
- /*
- Java HotSpot(TM) 64-Bit Server VM 1.8.0_60-b27 on Mac OS X 10.13.1
- Intel(R) Core(TM) i7-4960HQ CPU @ 2.60GHz
-
- Int and String Scan: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative
- ------------------------------------------------------------------------------------------------
- Native ORC MR 2566 / 2592 4.1 244.7 1.0X
- Native ORC Vectorized 1098 / 1113 9.6 104.7 2.3X
- Native ORC Vectorized with copy 1527 / 1593 6.9 145.6 1.7X
- Hive built-in ORC 3561 / 3705 2.9 339.6 0.7X
- */
benchmark.run()
}
}
}
def partitionTableScanBenchmark(values: Int): Unit = {
- val benchmark = new Benchmark("Partitioned Table", values)
+ val benchmark = new Benchmark("Partitioned Table", values, output = output)
withTempPath { dir =>
withTempTable("t1", "nativeOrcTable", "hiveOrcTable") {
@@ -267,32 +216,13 @@ object OrcReadBenchmark extends SQLHelper {
spark.sql("SELECT sum(p), sum(id) FROM hiveOrcTable").collect()
}
- /*
- Java HotSpot(TM) 64-Bit Server VM 1.8.0_60-b27 on Mac OS X 10.13.1
- Intel(R) Core(TM) i7-4960HQ CPU @ 2.60GHz
-
- Partitioned Table: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative
- ------------------------------------------------------------------------------------------------
- Data only - Native ORC MR 1447 / 1457 10.9 92.0 1.0X
- Data only - Native ORC Vectorized 256 / 266 61.4 16.3 5.6X
- Data only - Native ORC Vectorized with copy 263 / 273 59.8 16.7 5.5X
- Data only - Hive built-in ORC 1960 / 1988 8.0 124.6 0.7X
- Partition only - Native ORC MR 1039 / 1043 15.1 66.0 1.4X
- Partition only - Native ORC Vectorized 48 / 53 326.6 3.1 30.1X
- Partition only - Native ORC Vectorized with copy 48 / 53 328.4 3.0 30.2X
- Partition only - Hive built-in ORC 1234 / 1242 12.7 78.4 1.2X
- Both columns - Native ORC MR 1465 / 1475 10.7 93.1 1.0X
- Both columns - Native ORC Vectorized 292 / 301 53.9 18.6 5.0X
- Both column - Native ORC Vectorized with copy 348 / 354 45.1 22.2 4.2X
- Both columns - Hive built-in ORC 2051 / 2060 7.7 130.4 0.7X
- */
benchmark.run()
}
}
}
def repeatedStringScanBenchmark(values: Int): Unit = {
- val benchmark = new Benchmark("Repeated String", values)
+ val benchmark = new Benchmark("Repeated String", values, output = output)
withTempPath { dir =>
withTempTable("t1", "nativeOrcTable", "hiveOrcTable") {
@@ -320,17 +250,6 @@ object OrcReadBenchmark extends SQLHelper {
spark.sql("SELECT sum(length(c1)) FROM hiveOrcTable").collect()
}
- /*
- Java HotSpot(TM) 64-Bit Server VM 1.8.0_60-b27 on Mac OS X 10.13.1
- Intel(R) Core(TM) i7-4960HQ CPU @ 2.60GHz
-
- Repeated String: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative
- ------------------------------------------------------------------------------------------------
- Native ORC MR 1271 / 1278 8.3 121.2 1.0X
- Native ORC Vectorized 200 / 212 52.4 19.1 6.4X
- Native ORC Vectorized with copy 342 / 347 30.7 32.6 3.7X
- Hive built-in ORC 1874 / 2105 5.6 178.7 0.7X
- */
benchmark.run()
}
}
@@ -347,7 +266,8 @@ object OrcReadBenchmark extends SQLHelper {
s"SELECT IF(RAND(1) < $fractionOfNulls, NULL, CAST(id as STRING)) AS c1, " +
s"IF(RAND(2) < $fractionOfNulls, NULL, CAST(id as STRING)) AS c2 FROM t1"))
- val benchmark = new Benchmark(s"String with Nulls Scan ($fractionOfNulls%)", values)
+ val benchmark =
+ new Benchmark(s"String with Nulls Scan ($fractionOfNulls%)", values, output = output)
benchmark.addCase("Native ORC MR") { _ =>
withSQLConf(SQLConf.ORC_VECTORIZED_READER_ENABLED.key -> "false") {
@@ -373,38 +293,13 @@ object OrcReadBenchmark extends SQLHelper {
"WHERE c1 IS NOT NULL AND c2 IS NOT NULL").collect()
}
- /*
- Java HotSpot(TM) 64-Bit Server VM 1.8.0_60-b27 on Mac OS X 10.13.1
- Intel(R) Core(TM) i7-4960HQ CPU @ 2.60GHz
-
- String with Nulls Scan (0.0%): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative
- ------------------------------------------------------------------------------------------------
- Native ORC MR 2394 / 2886 4.4 228.3 1.0X
- Native ORC Vectorized 699 / 729 15.0 66.7 3.4X
- Native ORC Vectorized with copy 959 / 1025 10.9 91.5 2.5X
- Hive built-in ORC 3899 / 3901 2.7 371.9 0.6X
-
- String with Nulls Scan (0.5%): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative
- ------------------------------------------------------------------------------------------------
- Native ORC MR 2234 / 2255 4.7 213.1 1.0X
- Native ORC Vectorized 854 / 869 12.3 81.4 2.6X
- Native ORC Vectorized with copy 1099 / 1128 9.5 104.8 2.0X
- Hive built-in ORC 2767 / 2793 3.8 263.9 0.8X
-
- String with Nulls Scan (0.95%): Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative
- ------------------------------------------------------------------------------------------------
- Native ORC MR 1166 / 1202 9.0 111.2 1.0X
- Native ORC Vectorized 338 / 345 31.1 32.2 3.5X
- Native ORC Vectorized with copy 418 / 428 25.1 39.9 2.8X
- Hive built-in ORC 1730 / 1761 6.1 164.9 0.7X
- */
benchmark.run()
}
}
}
def columnsBenchmark(values: Int, width: Int): Unit = {
- val benchmark = new Benchmark(s"Single Column Scan from $width columns", values)
+ val benchmark = new Benchmark(s"Single Column Scan from $width columns", values, output = output)
withTempPath { dir =>
withTempTable("t1", "nativeOrcTable", "hiveOrcTable") {
@@ -436,49 +331,36 @@ object OrcReadBenchmark extends SQLHelper {
spark.sql(s"SELECT sum(c$middle) FROM hiveOrcTable").collect()
}
- /*
- Java HotSpot(TM) 64-Bit Server VM 1.8.0_60-b27 on Mac OS X 10.13.1
- Intel(R) Core(TM) i7-4960HQ CPU @ 2.60GHz
-
- Single Column Scan from 100 columns: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative
- ------------------------------------------------------------------------------------------------
- Native ORC MR 1050 / 1053 1.0 1001.1 1.0X
- Native ORC Vectorized 95 / 101 11.0 90.9 11.0X
- Native ORC Vectorized with copy 95 / 102 11.0 90.9 11.0X
- Hive built-in ORC 348 / 358 3.0 331.8 3.0X
-
- Single Column Scan from 200 columns: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative
- ------------------------------------------------------------------------------------------------
- Native ORC MR 2099 / 2108 0.5 2002.1 1.0X
- Native ORC Vectorized 179 / 187 5.8 171.1 11.7X
- Native ORC Vectorized with copy 176 / 188 6.0 167.6 11.9X
- Hive built-in ORC 562 / 581 1.9 535.9 3.7X
-
- Single Column Scan from 300 columns: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative
- ------------------------------------------------------------------------------------------------
- Native ORC MR 3221 / 3246 0.3 3071.4 1.0X
- Native ORC Vectorized 312 / 322 3.4 298.0 10.3X
- Native ORC Vectorized with copy 306 / 320 3.4 291.6 10.5X
- Hive built-in ORC 815 / 824 1.3 777.3 4.0X
- */
benchmark.run()
}
}
}
- def main(args: Array[String]): Unit = {
- Seq(ByteType, ShortType, IntegerType, LongType, FloatType, DoubleType).foreach { dataType =>
- numericScanBenchmark(1024 * 1024 * 15, dataType)
+ override def benchmark(): Unit = {
+ runBenchmark("SQL Single Numeric Column Scan") {
+ Seq(ByteType, ShortType, IntegerType, LongType, FloatType, DoubleType).foreach { dataType =>
+ numericScanBenchmark(1024 * 1024 * 15, dataType)
+ }
+ }
+ runBenchmark("Int and String Scan") {
+ intStringScanBenchmark(1024 * 1024 * 10)
+ }
+ runBenchmark("Partitioned Table Scan") {
+ partitionTableScanBenchmark(1024 * 1024 * 15)
+ }
+ runBenchmark("Repeated String Scan") {
+ repeatedStringScanBenchmark(1024 * 1024 * 10)
+ }
+ runBenchmark("String with Nulls Scan") {
+ for (fractionOfNulls <- List(0.0, 0.50, 0.95)) {
+ stringWithNullsScanBenchmark(1024 * 1024 * 10, fractionOfNulls)
+ }
}
- intStringScanBenchmark(1024 * 1024 * 10)
- partitionTableScanBenchmark(1024 * 1024 * 15)
- repeatedStringScanBenchmark(1024 * 1024 * 10)
- for (fractionOfNulls <- List(0.0, 0.50, 0.95)) {
- stringWithNullsScanBenchmark(1024 * 1024 * 10, fractionOfNulls)
+ runBenchmark("Single Column Scan From Wide Columns") {
+ columnsBenchmark(1024 * 1024 * 1, 100)
+ columnsBenchmark(1024 * 1024 * 1, 200)
+ columnsBenchmark(1024 * 1024 * 1, 300)
}
- columnsBenchmark(1024 * 1024 * 1, 100)
- columnsBenchmark(1024 * 1024 * 1, 200)
- columnsBenchmark(1024 * 1024 * 1, 300)
}
}
// scalastyle:on line.size.limit
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org