You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by ya...@apache.org on 2020/04/16 08:55:20 UTC
[spark] branch branch-3.0 updated: [SPARK-30564][SQL] Revert
Block.length and use comment placeholders in HashAggregateExec
This is an automated email from the ASF dual-hosted git repository.
yamamuro pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.0 by this push:
new 020f3a3 [SPARK-30564][SQL] Revert Block.length and use comment placeholders in HashAggregateExec
020f3a3 is described below
commit 020f3a33dd711d05337bb42d5f65708a4aec2daa
Author: Peter Toth <pe...@gmail.com>
AuthorDate: Thu Apr 16 17:52:22 2020 +0900
[SPARK-30564][SQL] Revert Block.length and use comment placeholders in HashAggregateExec
### What changes were proposed in this pull request?
SPARK-21870 (cb0cddf#diff-06dc5de6163687b7810aa76e7e152a76R146-R149) caused a significant performance regression in cases where the source code size is fairly large as `HashAggregateExec` uses `Block.length` to decide on splitting the code. The change in `length` makes sense as the comment and extra new lines shouldn't be taken into account when deciding on splitting, but the regular expression based approach is very slow and adds a big relative overhead to cases where the execution is [...]
This PR:
- restores `Block.length` to its original form
- places comments in `HashAggregateExec` with `CodegenContext.registerComment` so as to appear only when comments are enabled (`spark.sql.codegen.comments=true`)
Before this PR:
```
deeply nested struct field r/w: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
250 deep x 400 rows (read in-mem) 1137 1143 8 0.1 11368.3 0.0X
```
After this PR:
```
deeply nested struct field r/w: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------------------------------
250 deep x 400 rows (read in-mem) 167 180 7 0.6 1674.3 0.1X
```
### Why are the changes needed?
To fix performance regression.
### Does this PR introduce any user-facing change?
No.
### How was this patch tested?
Existing UTs.
Closes #28083 from peter-toth/SPARK-30564-use-comment-placeholders.
Authored-by: Peter Toth <pe...@gmail.com>
Signed-off-by: Takeshi Yamamuro <ya...@apache.org>
(cherry picked from commit 7ad6ba36f28b7a5ca548950dec6afcd61e5d68b9)
Signed-off-by: Takeshi Yamamuro <ya...@apache.org>
---
.../spark/sql/catalyst/expressions/codegen/javaCode.scala | 8 ++++----
.../spark/sql/execution/aggregate/HashAggregateExec.scala | 14 +++++++-------
2 files changed, 11 insertions(+), 11 deletions(-)
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/javaCode.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/javaCode.scala
index dff2589..1c59c3c 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/javaCode.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/javaCode.scala
@@ -143,10 +143,10 @@ trait Block extends TreeNode[Block] with JavaCode {
case _ => code.trim
}
- def length: Int = {
- // Returns a code length without comments
- CodeFormatter.stripExtraNewLinesAndComments(toString).length
- }
+ // We could remove comments, extra whitespaces and newlines when calculating length as it is used
+ // only for codegen method splitting, but SPARK-30564 showed that this is a performance critical
+ // function so we decided not to do so.
+ def length: Int = toString.length
def isEmpty: Boolean = toString.isEmpty
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashAggregateExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashAggregateExec.scala
index 7a26fd7..87a4081 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashAggregateExec.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashAggregateExec.scala
@@ -367,10 +367,10 @@ case class HashAggregateExec(
""".stripMargin
}
code"""
- |// do aggregate for ${aggNames(i)}
- |// evaluate aggregate function
+ |${ctx.registerComment(s"do aggregate for ${aggNames(i)}")}
+ |${ctx.registerComment("evaluate aggregate function")}
|${evaluateVariables(bufferEvalsForOneFunc)}
- |// update aggregation buffers
+ |${ctx.registerComment("update aggregation buffers")}
|${updates.mkString("\n").trim}
""".stripMargin
}
@@ -975,9 +975,9 @@ case class HashAggregateExec(
CodeGenerator.updateColumn(unsafeRowBuffer, dt, bufferOffset + j, ev, nullable)
}
code"""
- |// evaluate aggregate function for ${aggNames(i)}
+ |${ctx.registerComment(s"evaluate aggregate function for ${aggNames(i)}")}
|${evaluateVariables(rowBufferEvalsForOneFunc)}
- |// update unsafe row buffer
+ |${ctx.registerComment("update unsafe row buffer")}
|${updateRowBuffers.mkString("\n").trim}
""".stripMargin
}
@@ -1030,9 +1030,9 @@ case class HashAggregateExec(
isVectorized = true)
}
code"""
- |// evaluate aggregate function for ${aggNames(i)}
+ |${ctx.registerComment(s"evaluate aggregate function for ${aggNames(i)}")}
|${evaluateVariables(fastRowEvalsForOneFunc)}
- |// update fast row
+ |${ctx.registerComment("update fast row")}
|${updateRowBuffer.mkString("\n").trim}
""".stripMargin
}
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org