Posted to commits@spark.apache.org by li...@apache.org on 2019/01/18 06:43:54 UTC
[spark] branch master updated: [SPARK-26659][SQL] Fix duplicate cmd.nodeName in the explain output of DataWritingCommandExec
This is an automated email from the ASF dual-hosted git repository.
lixiao pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new e341864 [SPARK-26659][SQL] Fix duplicate cmd.nodeName in the explain output of DataWritingCommandExec
e341864 is described below
commit e3418649dcb50f2a2fb977560d87a94c81516198
Author: Kris Mok <kr...@databricks.com>
AuthorDate: Thu Jan 17 22:43:39 2019 -0800
[SPARK-26659][SQL] Fix duplicate cmd.nodeName in the explain output of DataWritingCommandExec
## What changes were proposed in this pull request?
`DataWritingCommandExec` generates `cmd.nodeName` twice in its explain output. For example, running the query `spark.sql("create table foo stored as parquet as select id, id % 10 as cat1, id % 20 as cat2 from range(10)")` produces:
```
Execute OptimizedCreateHiveTableAsSelectCommand OptimizedCreateHiveTableAsSelectCommand [Database:default, TableName: foo, InsertIntoHiveTable]
+- *(1) Project [id#2L, (id#2L % 10) AS cat1#0L, (id#2L % 20) AS cat2#1L]
+- *(1) Range (0, 10, step=1, splits=8)
```
After the fix, it'll go back to normal:
```
Execute OptimizedCreateHiveTableAsSelectCommand [Database:default, TableName: foo, InsertIntoHiveTable]
+- *(1) Project [id#2L, (id#2L % 10) AS cat1#0L, (id#2L % 20) AS cat2#1L]
+- *(1) Range (0, 10, step=1, splits=8)
```
This duplication was introduced when the specialized `DataWritingCommandExec` was created to take the place of `ExecutedCommandExec`.
`DataWritingCommandExec` is a `UnaryExecNode` whose `children` contain the physical plan of the query, so the `cmd` is picked up via `TreeNode.stringArgs` into the argument string. The duplication arises because `DataWritingCommandExec.nodeName` is `s"Execute ${cmd.nodeName}"`, while the argument string is `cmd.simpleString()`, which also includes `cmd.nodeName`.
`ExecutedCommandExec` didn't have this problem because it is a `LeafExecNode` with no children, and it declares the `cmd` as part of its `innerChildren`, which is excluded from the argument string. The toy sketch below illustrates the mechanism.
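For readers unfamiliar with how `TreeNode` renders a node's one-line description, here is a minimal, self-contained sketch of the mechanism. These are toy classes, not Spark's real `TreeNode`/`SparkPlan` API (the real methods take extra parameters such as `maxFields`):
```
// Toy model of TreeNode's one-line rendering: simpleString ~ "nodeName argString".
object DuplicateNodeNameSketch {
  // Stand-in for the logical write command (e.g. a CTAS command).
  case class Cmd(nodeName: String, args: String) {
    def argString: String = args
    // A command's simpleString starts with its own nodeName.
    def simpleString: String = s"$nodeName $argString"
  }

  // Stand-in for DataWritingCommandExec before the fix: `cmd` leaks into
  // the argument string via stringArgs, bringing cmd.nodeName with it.
  class ExecBefore(cmd: Cmd) {
    def nodeName: String = s"Execute ${cmd.nodeName}"
    def argString: String = cmd.simpleString // includes cmd.nodeName again
    def simpleString: String = s"$nodeName $argString"
  }

  // After the fix: argString delegates to cmd.argString, dropping the duplicate name.
  class ExecAfter(cmd: Cmd) extends ExecBefore(cmd) {
    override def argString: String = cmd.argString
  }

  def main(args: Array[String]): Unit = {
    val cmd = Cmd("OptimizedCreateHiveTableAsSelectCommand",
      "[Database:default, TableName: foo, InsertIntoHiveTable]")
    println(new ExecBefore(cmd).simpleString) // node name printed twice
    println(new ExecAfter(cmd).simpleString)  // node name printed once
  }
}
```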
## How was this patch tested?
Manually tested by running the example above in a local Spark shell.
Also added a new test case in `ExplainSuite`; a reproduction along the lines of that test is sketched below.
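For reference, the check can be reproduced in a Hive-enabled `spark-shell` roughly as follows (the exact command class shown in the output depends on the session's catalog configuration):
```
// In a Hive-enabled spark-shell; `spark` is the session provided by the shell.
// Note: spark.sql(...) runs the CTAS eagerly, but the returned DataFrame can
// still be explained, which is what the new ExplainSuite test does.
val df = spark.sql(
  "create table foo stored as parquet as select id, id % 10 as cat1, id % 20 as cat2 from range(10)")
df.explain()
// Before the fix, the first line named the command twice:
//   Execute OptimizedCreateHiveTableAsSelectCommand OptimizedCreateHiveTableAsSelectCommand [...]
// After the fix, it is named once:
//   Execute OptimizedCreateHiveTableAsSelectCommand [...]
```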
Closes #23579 from rednaxelafx/fix-explain.
Authored-by: Kris Mok <kr...@databricks.com>
Signed-off-by: gatorsmile <ga...@gmail.com>
---
.../spark/sql/execution/command/commands.scala | 3 +++
.../scala/org/apache/spark/sql/ExplainSuite.scala | 28 ++++++++++++++++++----
2 files changed, 26 insertions(+), 5 deletions(-)
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/commands.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/commands.scala
index 754a331..a1f2785 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/commands.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/commands.scala
@@ -110,6 +110,9 @@ case class DataWritingCommandExec(cmd: DataWritingCommand, child: SparkPlan)
override def nodeName: String = "Execute " + cmd.nodeName
+ // override the default one, otherwise the `cmd.nodeName` will appear twice from simpleString
+ override def argString(maxFields: Int): String = cmd.argString(maxFields)
+
override def executeCollect(): Array[InternalRow] = sideEffectResult.toArray
override def executeToIterator: Iterator[InternalRow] = sideEffectResult.toIterator
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/ExplainSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/ExplainSuite.scala
index ce47592..ec68828 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/ExplainSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/ExplainSuite.scala
@@ -25,16 +25,25 @@ class ExplainSuite extends QueryTest with SharedSQLContext {
import testImplicits._
/**
- * Runs the plan and makes sure the plans contains all of the keywords.
+ * Get the explain from a DataFrame and run the specified action on it.
*/
- private def checkKeywordsExistsInExplain(df: DataFrame, keywords: String*): Unit = {
+ private def withNormalizedExplain(df: DataFrame, extended: Boolean)(f: String => Unit) = {
val output = new java.io.ByteArrayOutputStream()
Console.withOut(output) {
- df.explain(extended = true)
+ df.explain(extended = extended)
}
val normalizedOutput = output.toString.replaceAll("#\\d+", "#x")
- for (key <- keywords) {
- assert(normalizedOutput.contains(key))
+ f(normalizedOutput)
+ }
+
+ /**
+ * Runs the plan and makes sure the plans contains all of the keywords.
+ */
+ private def checkKeywordsExistsInExplain(df: DataFrame, keywords: String*): Unit = {
+ withNormalizedExplain(df, extended = true) { normalizedOutput =>
+ for (key <- keywords) {
+ assert(normalizedOutput.contains(key))
+ }
}
}
@@ -182,6 +191,15 @@ class ExplainSuite extends QueryTest with SharedSQLContext {
"id#xL AS nullif(`id`, 'x')#xL, coalesce(cast(id#xL as string), x) AS nvl(`id`, 'x')#x, " +
"x AS nvl2(`id`, 'x', 'y')#x]")
}
+
+ test("SPARK-26659: explain of DataWritingCommandExec should not contain duplicate cmd.nodeName") {
+ withTable("temptable") {
+ val df = sql("create table temptable using parquet as select * from range(2)")
+ withNormalizedExplain(df, extended = false) { normalizedOutput =>
+ assert("Create\\w*?TableAsSelectCommand".r.findAllMatchIn(normalizedOutput).length == 1)
+ }
+ }
+ }
}
case class ExplainSingleData(id: Int)