You are viewing a plain text version of this content. The canonical link for it is here.
Posted to reviews@spark.apache.org by yucai <gi...@git.apache.org> on 2018/11/01 02:00:50 UTC
[GitHub] spark pull request #22847: [SPARK-25850][SQL] Make the split threshold for t...
Github user yucai commented on a diff in the pull request:
https://github.com/apache/spark/pull/22847#discussion_r229919857
--- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala ---
@@ -812,6 +812,17 @@ object SQLConf {
.intConf
.createWithDefault(65535)
+ val CODEGEN_METHOD_SPLIT_THRESHOLD = buildConf("spark.sql.codegen.methodSplitThreshold")
+ .internal()
+ .doc("The threshold of source code length without comment of a single Java function by " +
+ "codegen to be split. When the generated Java function source code exceeds this threshold" +
+ ", it will be split into multiple small functions. We can't know how many bytecode will " +
+ "be generated, so use the code length as metric. A function's bytecode should not go " +
+ "beyond 8KB, otherwise it will not be JITted; it also should not be too small, otherwise " +
+ "there will be many function calls.")
+ .intConf
--- End diff --
It seems that long alias names have no influence on the benchmark results.
```
[info] Java HotSpot(TM) 64-Bit Server VM 1.8.0_162-b12 on Mac OS X 10.13.6
[info] Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz
[info] projection on wide table: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative
[info] ------------------------------------------------------------------------------------------------
[info] split threshold 10 6512 / 6736 0.2 6210.4 1.0X
[info] split threshold 100 5730 / 6329 0.2 5464.9 1.1X
[info] split threshold 1024 3119 / 3184 0.3 2974.6 2.1X
[info] split threshold 2048 2981 / 3100 0.4 2842.9 2.2X
[info] split threshold 4096 3289 / 3379 0.3 3136.6 2.0X
[info] split threshold 8196 4307 / 4338 0.2 4108.0 1.5X
[info] split threshold 65536 29147 / 30212 0.0 27797.0 0.2X
```
The long prefix `averylongprefixrepeatedmultipletimes` does not appear in the **generated expression code**:
```
/* 047 */ private void createExternalRow_0_8(InternalRow i, Object[] values_0) {
/* 048 */
/* 049 */ // input[80, bigint, false]
/* 050 */ long value_81 = i.getLong(80);
/* 051 */ if (false) {
/* 052 */ values_0[80] = null;
/* 053 */ } else {
/* 054 */ values_0[80] = value_81;
/* 055 */ }
/* 056 */
/* 057 */ // input[81, bigint, false]
/* 058 */ long value_82 = i.getLong(81);
/* 059 */ if (false) {
/* 060 */ values_0[81] = null;
/* 061 */ } else {
/* 062 */ values_0[81] = value_82;
/* 063 */ }
/* 064 */
/* 065 */ // input[82, bigint, false]
/* 066 */ long value_83 = i.getLong(82);
/* 067 */ if (false) {
/* 068 */ values_0[82] = null;
/* 069 */ } else {
/* 070 */ values_0[82] = value_83;
/* 071 */ }
/* 072 */
...
```
My benchmark:
```
object WideTableBenchmark extends SqlBasedBenchmark {
override def runBenchmarkSuite(mainArgs: Array[String]): Unit = {
runBenchmark("projection on wide table") {
val N = 1 << 20
val df = spark.range(N)
val columns = (0 until 400).map{ i => s"id as averylongprefixrepeatedmultipletimes_id$i"}
val benchmark = new Benchmark("projection on wide table", N, output = output)
Seq("10", "100", "1024", "2048", "4096", "8196", "65536").foreach { n =>
benchmark.addCase(s"split threshold $n", numIters = 5) { iter =>
withSQLConf("spark.testing.codegen.splitThreshold" -> n) {
df.selectExpr(columns: _*).foreach(identity(_))
}
}
}
benchmark.run()
}
}
}
```
I will keep benchmarking with more complex expressions.
---
---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org
For additional commands, e-mail: reviews-help@spark.apache.org