Posted to commits@spark.apache.org by li...@apache.org on 2019/01/18 06:43:54 UTC

[spark] branch master updated: [SPARK-26659][SQL] Fix duplicate cmd.nodeName in the explain output of DataWritingCommandExec

This is an automated email from the ASF dual-hosted git repository.

lixiao pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new e341864  [SPARK-26659][SQL] Fix duplicate cmd.nodeName in the explain output of DataWritingCommandExec
e341864 is described below

commit e3418649dcb50f2a2fb977560d87a94c81516198
Author: Kris Mok <kr...@databricks.com>
AuthorDate: Thu Jan 17 22:43:39 2019 -0800

    [SPARK-26659][SQL] Fix duplicate cmd.nodeName in the explain output of DataWritingCommandExec
    
    ## What changes were proposed in this pull request?
    
    `DataWritingCommandExec` prints `cmd.nodeName` twice in its explain output. For example, running the query `spark.sql("create table foo stored as parquet as select id, id % 10 as cat1, id % 20 as cat2 from range(10)")` yields:
    ```
    Execute OptimizedCreateHiveTableAsSelectCommand OptimizedCreateHiveTableAsSelectCommand [Database:default, TableName: foo, InsertIntoHiveTable]
    +- *(1) Project [id#2L, (id#2L % 10) AS cat1#0L, (id#2L % 20) AS cat2#1L]
       +- *(1) Range (0, 10, step=1, splits=8)
    ```
    After the fix, the output goes back to normal:
    ```
    Execute OptimizedCreateHiveTableAsSelectCommand [Database:default, TableName: foo, InsertIntoHiveTable]
    +- *(1) Project [id#2L, (id#2L % 10) AS cat1#0L, (id#2L % 20) AS cat2#1L]
       +- *(1) Range (0, 10, step=1, splits=8)
    ```
    
    This duplication was introduced when the specialized `DataWritingCommandExec` was created to take the place of `ExecutedCommandExec`.
    
    The former is a `UnaryExecNode` whose `children` include the physical plan of the query, so the `cmd` is picked up via `TreeNode.stringArgs` into the argument string. The duplication arises because `DataWritingCommandExec.nodeName` is `s"Execute ${cmd.nodeName}"`, while the argument string is `cmd.simpleString()`, which also includes `cmd.nodeName`.
    
    The latter didn't have this problem because it's a `LeafExecNode` with no children, and it declares the `cmd` as part of its `innerChildren`, which are excluded from the argument string.
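    
    The mechanism is easy to see in a minimal standalone sketch (the class and method names below are simplified stand-ins for illustration, not Spark's actual `TreeNode` API):
    ```
    // Simplified model of how a plan node renders its one-line description.
    abstract class SketchNode {
      def nodeName: String
      def argString: String
      // As in Spark, the description is the node name followed by the args.
      def simpleString: String = s"$nodeName $argString"
    }

    class SketchCommand extends SketchNode {
      override def nodeName: String = "OptimizedCreateHiveTableAsSelectCommand"
      override def argString: String = "[Database:default, TableName: foo, InsertIntoHiveTable]"
    }

    class SketchExec(cmd: SketchCommand) extends SketchNode {
      override def nodeName: String = "Execute " + cmd.nodeName
      // Before the fix: the default argString renders the whole cmd, i.e.
      // cmd.simpleString, which repeats cmd.nodeName.
      override def argString: String = cmd.simpleString
      // After the fix: delegating to cmd.argString drops the duplicate name.
      def fixedSimpleString: String = s"$nodeName ${cmd.argString}"
    }

    object SketchDemo extends App {
      val exec = new SketchExec(new SketchCommand)
      println(exec.simpleString)      // "Execute Optimized... Optimized... [...]"
      println(exec.fixedSimpleString) // "Execute Optimized... [...]"
    }
    ```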
    
    ## How was this patch tested?
    
    Manually tested by running the example above in a local Spark shell.
    Also added a new test case in `ExplainSuite`.
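    
    For reference, the manual check amounts to explaining the CTAS query in spark-shell and confirming the command's node name appears only once (illustrative session; requires a Hive-enabled build for `stored as parquet`):
    ```
    // With the fix, the header line of the printed plan reads
    // "Execute OptimizedCreateHiveTableAsSelectCommand [...]", with the
    // command name printed exactly once.
    val df = spark.sql("create table foo stored as parquet as " +
      "select id, id % 10 as cat1, id % 20 as cat2 from range(10)")
    df.explain()
    ```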
    
    Closes #23579 from rednaxelafx/fix-explain.
    
    Authored-by: Kris Mok <kr...@databricks.com>
    Signed-off-by: gatorsmile <ga...@gmail.com>
---
 .../spark/sql/execution/command/commands.scala     |  3 +++
 .../scala/org/apache/spark/sql/ExplainSuite.scala  | 28 ++++++++++++++++++----
 2 files changed, 26 insertions(+), 5 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/commands.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/commands.scala
index 754a331..a1f2785 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/commands.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/commands.scala
@@ -110,6 +110,9 @@ case class DataWritingCommandExec(cmd: DataWritingCommand, child: SparkPlan)
 
   override def nodeName: String = "Execute " + cmd.nodeName
 
+  // override the default one, otherwise the `cmd.nodeName` will appear twice from simpleString
+  override def argString(maxFields: Int): String = cmd.argString(maxFields)
+
   override def executeCollect(): Array[InternalRow] = sideEffectResult.toArray
 
   override def executeToIterator: Iterator[InternalRow] = sideEffectResult.toIterator
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/ExplainSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/ExplainSuite.scala
index ce47592..ec68828 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/ExplainSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/ExplainSuite.scala
@@ -25,16 +25,25 @@ class ExplainSuite extends QueryTest with SharedSQLContext {
   import testImplicits._
 
   /**
-   * Runs the plan and makes sure the plans contains all of the keywords.
+   * Get the explain from a DataFrame and run the specified action on it.
    */
-  private def checkKeywordsExistsInExplain(df: DataFrame, keywords: String*): Unit = {
+  private def withNormalizedExplain(df: DataFrame, extended: Boolean)(f: String => Unit) = {
     val output = new java.io.ByteArrayOutputStream()
     Console.withOut(output) {
-      df.explain(extended = true)
+      df.explain(extended = extended)
     }
     val normalizedOutput = output.toString.replaceAll("#\\d+", "#x")
-    for (key <- keywords) {
-      assert(normalizedOutput.contains(key))
+    f(normalizedOutput)
+  }
+
+  /**
+   * Runs the plan and makes sure the plans contains all of the keywords.
+   */
+  private def checkKeywordsExistsInExplain(df: DataFrame, keywords: String*): Unit = {
+    withNormalizedExplain(df, extended = true) { normalizedOutput =>
+      for (key <- keywords) {
+        assert(normalizedOutput.contains(key))
+      }
     }
   }
 
@@ -182,6 +191,15 @@ class ExplainSuite extends QueryTest with SharedSQLContext {
         "id#xL AS nullif(`id`, 'x')#xL, coalesce(cast(id#xL as string), x) AS nvl(`id`, 'x')#x, " +
         "x AS nvl2(`id`, 'x', 'y')#x]")
   }
+
+  test("SPARK-26659: explain of DataWritingCommandExec should not contain duplicate cmd.nodeName") {
+    withTable("temptable") {
+      val df = sql("create table temptable using parquet as select * from range(2)")
+      withNormalizedExplain(df, extended = false) { normalizedOutput =>
+        assert("Create\\w*?TableAsSelectCommand".r.findAllMatchIn(normalizedOutput).length == 1)
+      }
+    }
+  }
 }
 
 case class ExplainSingleData(id: Int)

