You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by we...@apache.org on 2022/06/06 12:59:13 UTC
[spark] branch master updated: [SPARK-39376][SQL] Hide duplicated columns in star expansion of subquery alias from NATURAL/USING JOIN
This is an automated email from the ASF dual-hosted git repository.
wenchen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 18ca369f019 [SPARK-39376][SQL] Hide duplicated columns in star expansion of subquery alias from NATURAL/USING JOIN
18ca369f019 is described below
commit 18ca369f01905b421a658144e23b5a4e60702655
Author: Karen Feng <ka...@databricks.com>
AuthorDate: Mon Jun 6 20:58:23 2022 +0800
[SPARK-39376][SQL] Hide duplicated columns in star expansion of subquery alias from NATURAL/USING JOIN
### What changes were proposed in this pull request?
Follows up from https://github.com/apache/spark/pull/31666. That earlier PR introduced a bug where the qualified star expansion of a subquery alias containing a NATURAL/USING JOIN outputs duplicated columns.
### Why are the changes needed?
Duplicated, hidden columns should not be output from a star expansion.
### Does this PR introduce _any_ user-facing change?
The query
```
val df1 = Seq((3, 8)).toDF("a", "b")
val df2 = Seq((8, 7)).toDF("b", "d")
val joinDF = df1.join(df2, "b")
joinDF.alias("r").select("r.*")
```
now outputs a single column `b` instead of two (duplicate) `b` columns.
### How was this patch tested?
UTs
Closes #36763 from karenfeng/SPARK-39376.
Authored-by: Karen Feng <ka...@databricks.com>
Signed-off-by: Wenchen Fan <we...@databricks.com>
---
.../plans/logical/basicLogicalOperators.scala | 3 ++-
.../org/apache/spark/sql/DataFrameJoinSuite.scala | 22 ++++++++++++++++++++++
2 files changed, 24 insertions(+), 1 deletion(-)
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala
index b4c6e19d0bc..677bdf27336 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala
@@ -1369,7 +1369,8 @@ case class SubqueryAlias(
override def metadataOutput: Seq[Attribute] = {
val qualifierList = identifier.qualifier :+ alias
- child.metadataOutput.map(_.withQualifier(qualifierList))
+ val nonHiddenMetadataOutput = child.metadataOutput.filter(!_.supportsQualifiedStar)
+ nonHiddenMetadataOutput.map(_.withQualifier(qualifierList))
}
override def maxRows: Option[Long] = child.maxRows
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameJoinSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameJoinSuite.scala
index 5286a70674e..de900fffb34 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameJoinSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameJoinSuite.scala
@@ -554,4 +554,26 @@ class DataFrameJoinSuite extends QueryTest
)
}
}
+
+ test("SPARK-39376: Hide duplicated columns in star expansion of subquery alias from USING JOIN") {
+ val joinDf = testData2.as("testData2").join(
+ testData3.as("testData3"), usingColumns = Seq("a"), joinType = "fullouter")
+ val equivalentQueries = Seq(
+ joinDf.select($"*"),
+ joinDf.as("r").select($"*"),
+ joinDf.as("r").select($"r.*")
+ )
+ equivalentQueries.foreach { query =>
+ checkAnswer(query,
+ Seq(
+ Row(1, 1, null),
+ Row(1, 2, null),
+ Row(2, 1, 2),
+ Row(2, 2, 2),
+ Row(3, 1, null),
+ Row(3, 2, null)
+ )
+ )
+ }
+ }
}
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org