Posted to commits@spark.apache.org by ya...@apache.org on 2020/04/16 08:55:20 UTC

[spark] branch branch-3.0 updated: [SPARK-30564][SQL] Revert Block.length and use comment placeholders in HashAggregateExec

This is an automated email from the ASF dual-hosted git repository.

yamamuro pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.0 by this push:
     new 020f3a3  [SPARK-30564][SQL] Revert Block.length and use comment placeholders in HashAggregateExec
020f3a3 is described below

commit 020f3a33dd711d05337bb42d5f65708a4aec2daa
Author: Peter Toth <pe...@gmail.com>
AuthorDate: Thu Apr 16 17:52:22 2020 +0900

    [SPARK-30564][SQL] Revert Block.length and use comment placeholders in HashAggregateExec
    
    ### What changes were proposed in this pull request?
    SPARK-21870 (cb0cddf#diff-06dc5de6163687b7810aa76e7e152a76R146-R149) caused a significant performance regression in cases where the generated source code is fairly large, because `HashAggregateExec` uses `Block.length` to decide on splitting the code. The change to `length` made sense in that comments and extra new lines shouldn't be taken into account when deciding on splitting, but the regular-expression-based approach is very slow and adds a big relative overhead to cases where the execution is [...]
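    The cost is easy to see with a minimal, self-contained Scala sketch (illustrative only, not the actual Spark `CodeFormatter` code): the regex pass over the generated source is paid on every `length` call, while `String.length` is constant time. The regex, string size and call count below are made up for illustration.
    ```
    object LengthCostSketch {
      // Illustrative stand-in for the regex-based stripping; the real
      // CodeFormatter.stripExtraNewLinesAndComments is more involved.
      private val commentOrBlankLine = """(?m)^\s*//.*\n?|^\s*\n""".r

      def strippedLength(code: String): Int =
        commentOrBlankLine.replaceAllIn(code, "").length // re-scans the whole source per call

      def plainLength(code: String): Int = code.length   // O(1) per call

      def main(args: Array[String]): Unit = {
        // A large chunk of "generated" code, roughly mimicking deeply nested codegen output.
        val generated = (1 to 5000).map(i => s"// comment $i\nint value$i = $i;\n").mkString
        def time(body: => Unit): Double = {
          val t0 = System.nanoTime(); body; (System.nanoTime() - t0) / 1e6
        }
        // The splitting heuristic queries lengths repeatedly while assembling the plan's code.
        val slowMs = time((1 to 200).foreach(_ => strippedLength(generated)))
        val fastMs = time((1 to 200).foreach(_ => plainLength(generated)))
        println(f"regex-stripped length: $slowMs%.1f ms, plain length: $fastMs%.1f ms")
      }
    }
    ```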
    This PR:
    - restores `Block.length` to its original form
    - places comments in `HashAggregateExec` with `CodegenContext.registerComment` so that they appear only when codegen comments are enabled (`spark.sql.codegen.comments=true`); see the sketch below
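    
    A sketch of the placeholder pattern, to be tried in `spark-shell` where catalyst is on the classpath (the aggregate name and code body are made up): `registerComment` returns an empty block when `spark.sql.codegen.comments` is disabled, so the comment adds nothing to the generated source or to `Block.length`; when enabled it returns a short placeholder that is swapped for the real comment text when the code is later formatted.
    ```
    import org.apache.spark.sql.catalyst.expressions.codegen.CodegenContext
    import org.apache.spark.sql.catalyst.expressions.codegen.Block._

    val ctx = new CodegenContext
    val aggName = "sum"  // illustrative stand-in for aggNames(i)
    val body =
      code"""
         |${ctx.registerComment(s"do aggregate for $aggName")}
         |agg_sum += value;
       """.stripMargin
    // With spark.sql.codegen.comments=false (the default) the first line stays empty,
    // so the comment never inflates the code that the splitting heuristic measures.
    println(body)
    ```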
    
    Before this PR:
    ```
    deeply nested struct field r/w:           Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
    ------------------------------------------------------------------------------------------------------------------------
    250 deep x 400 rows (read in-mem)                  1137           1143           8          0.1       11368.3       0.0X
    ```
    
    After this PR:
    ```
    deeply nested struct field r/w:           Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
    ------------------------------------------------------------------------------------------------------------------------
    250 deep x 400 rows (read in-mem)                   167            180           7          0.6        1674.3       0.1X
    ```
    ### Why are the changes needed?
    To fix a performance regression.
    
    ### Does this PR introduce any user-facing change?
    No.
    
    ### How was this patch tested?
    Existing UTs.
    
    Closes #28083 from peter-toth/SPARK-30564-use-comment-placeholders.
    
    Authored-by: Peter Toth <pe...@gmail.com>
    Signed-off-by: Takeshi Yamamuro <ya...@apache.org>
    
    (cherry picked from commit 7ad6ba36f28b7a5ca548950dec6afcd61e5d68b9)
    
    Signed-off-by: Takeshi Yamamuro <ya...@apache.org>
---
 .../spark/sql/catalyst/expressions/codegen/javaCode.scala  |  8 ++++----
 .../spark/sql/execution/aggregate/HashAggregateExec.scala  | 14 +++++++-------
 2 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/javaCode.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/javaCode.scala
index dff2589..1c59c3c 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/javaCode.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/javaCode.scala
@@ -143,10 +143,10 @@ trait Block extends TreeNode[Block] with JavaCode {
     case _ => code.trim
   }
 
-  def length: Int = {
-    // Returns a code length without comments
-    CodeFormatter.stripExtraNewLinesAndComments(toString).length
-  }
+  // We could remove comments, extra whitespaces and newlines when calculating length as it is used
+  // only for codegen method splitting, but SPARK-30564 showed that this is a performance critical
+  // function so we decided not to do so.
+  def length: Int = toString.length
 
   def isEmpty: Boolean = toString.isEmpty
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashAggregateExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashAggregateExec.scala
index 7a26fd7..87a4081 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashAggregateExec.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/aggregate/HashAggregateExec.scala
@@ -367,10 +367,10 @@ case class HashAggregateExec(
          """.stripMargin
       }
       code"""
-         |// do aggregate for ${aggNames(i)}
-         |// evaluate aggregate function
+         |${ctx.registerComment(s"do aggregate for ${aggNames(i)}")}
+         |${ctx.registerComment("evaluate aggregate function")}
          |${evaluateVariables(bufferEvalsForOneFunc)}
-         |// update aggregation buffers
+         |${ctx.registerComment("update aggregation buffers")}
          |${updates.mkString("\n").trim}
        """.stripMargin
     }
@@ -975,9 +975,9 @@ case class HashAggregateExec(
           CodeGenerator.updateColumn(unsafeRowBuffer, dt, bufferOffset + j, ev, nullable)
         }
         code"""
-           |// evaluate aggregate function for ${aggNames(i)}
+           |${ctx.registerComment(s"evaluate aggregate function for ${aggNames(i)}")}
            |${evaluateVariables(rowBufferEvalsForOneFunc)}
-           |// update unsafe row buffer
+           |${ctx.registerComment("update unsafe row buffer")}
            |${updateRowBuffers.mkString("\n").trim}
          """.stripMargin
       }
@@ -1030,9 +1030,9 @@ case class HashAggregateExec(
                 isVectorized = true)
             }
             code"""
-               |// evaluate aggregate function for ${aggNames(i)}
+               |${ctx.registerComment(s"evaluate aggregate function for ${aggNames(i)}")}
                |${evaluateVariables(fastRowEvalsForOneFunc)}
-               |// update fast row
+               |${ctx.registerComment("update fast row")}
                |${updateRowBuffer.mkString("\n").trim}
              """.stripMargin
           }

