You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by we...@apache.org on 2024/01/12 02:23:24 UTC
(spark) branch branch-3.5 updated: [SPARK-46640][SQL] Fix RemoveRedundantAlias by excluding subquery attributes
This is an automated email from the ASF dual-hosted git repository.
wenchen pushed a commit to branch branch-3.5
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.5 by this push:
new 8a0f64274f44 [SPARK-46640][SQL] Fix RemoveRedundantAlias by excluding subquery attributes
8a0f64274f44 is described below
commit 8a0f64274f44dd17a3e1f034c9f1f20a61ff0549
Author: Nikhil Sheoran <12...@users.noreply.github.com>
AuthorDate: Fri Jan 12 10:20:49 2024 +0800
[SPARK-46640][SQL] Fix RemoveRedundantAlias by excluding subquery attributes
- In `RemoveRedundantAliases`, we have an `excluded` AttributeSet argument denoting the references for which we should not remove aliases. For a query with subquery expressions, adding the attributes references by the subquery in the `excluded` set prevents rewrites that might remove presumedly redundant aliases. (Changes in RemoveRedundantAlias)
- Added a configuration flag to disable this fix, if not needed.
- Added a unit test with Filter exists subquery expression to show how the alias would have been removed.
- `RemoveRedundantAliases` does not take into account the outer attributes of a `SubqueryExpression` when considering redundant aliases, potentially removing them if it thinks they are redundant.
- This can cause scenarios where a subquery expression has conditions like `a#x = a#x` i.e. both the attribute names and the expression ID(s) are the same. This can then lead to conflicting expression ID(s) error.
- For example, in the query example below, the `RemoveRedundantAliases` would remove the alias `a#0 as a#1` and replace `a#1` with `a#0` in the Filter exists subquery expression which would create an issue if the subquery expression had an attribute with reference `a#0` (possible due to different scan relation instances possibly having the same attribute ID(s) (Ref: #40662)
```
Filter exists [a#1 && (a#1 = b#2)]
: +- LocalRelation <empty>, [b#2]
+- Project [a#0 AS a#1]
+- LocalRelation <empty>, [a#0]
```
becomes
```
Filter exists [a#0 && (a#0 = b#2)]
: +- LocalRelation <empty>, [b#2]
+- LocalRelation <empty>, [a#0]
```
- The changes are needed to fix this bug.
No
- Added a unit test with Filter exists subquery expression to show how the alias would have been removed.
No
Closes #44645 from nikhilsheoran-db/SPARK-46640.
Authored-by: Nikhil Sheoran <12...@users.noreply.github.com>
Signed-off-by: Wenchen Fan <we...@databricks.com>
(cherry picked from commit bbeb8d7417bafa09ad5202347175a47b3217e27f)
Signed-off-by: Wenchen Fan <we...@databricks.com>
---
.../spark/sql/catalyst/optimizer/Optimizer.scala | 12 +++++-
.../org/apache/spark/sql/internal/SQLConf.scala | 9 ++++
.../RemoveRedundantAliasAndProjectSuite.scala | 48 ++++++++++++++++++++++
3 files changed, 68 insertions(+), 1 deletion(-)
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
index ec5f00d34cd8..df17840d567e 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
@@ -576,10 +576,20 @@ object RemoveRedundantAliases extends Rule[LogicalPlan] {
}
case _ =>
+ val subQueryAttributes = if (conf.getConf(SQLConf
+ .EXCLUDE_SUBQUERY_EXP_REFS_FROM_REMOVE_REDUNDANT_ALIASES)) {
+ // Collect the references for all the subquery expressions in the plan.
+ AttributeSet.fromAttributeSets(plan.expressions.collect {
+ case e: SubqueryExpression => e.references
+ })
+ } else {
+ AttributeSet.empty
+ }
+
// Remove redundant aliases in the subtree(s).
val currentNextAttrPairs = mutable.Buffer.empty[(Attribute, Attribute)]
val newNode = plan.mapChildren { child =>
- val newChild = removeRedundantAliases(child, excluded)
+ val newChild = removeRedundantAliases(child, excluded ++ subQueryAttributes)
currentNextAttrPairs ++= createAttributeMapping(child, newChild)
newChild
}
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
index de4a89667aff..2e41374035c8 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
@@ -4352,6 +4352,15 @@ object SQLConf {
.checkValue(_ >= 0, "The threshold of cached local relations must not be negative")
.createWithDefault(64 * 1024 * 1024)
+ val EXCLUDE_SUBQUERY_EXP_REFS_FROM_REMOVE_REDUNDANT_ALIASES =
+ buildConf("spark.sql.optimizer.excludeSubqueryRefsFromRemoveRedundantAliases.enabled")
+ .internal()
+ .doc("When true, exclude the references from the subquery expressions (in, exists, etc.) " +
+ s"while removing redundant aliases.")
+ .version("4.0.0")
+ .booleanConf
+ .createWithDefault(true)
+
val LEGACY_PERCENTILE_DISC_CALCULATION = buildConf("spark.sql.legacy.percentileDiscCalculation")
.internal()
.doc("If true, the old bogus percentile_disc calculation is used. The old calculation " +
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/RemoveRedundantAliasAndProjectSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/RemoveRedundantAliasAndProjectSuite.scala
index cd19e5062ae1..8a0a0466ca74 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/RemoveRedundantAliasAndProjectSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/RemoveRedundantAliasAndProjectSuite.scala
@@ -23,6 +23,7 @@ import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.plans.PlanTest
import org.apache.spark.sql.catalyst.plans.logical._
import org.apache.spark.sql.catalyst.rules._
+import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.types.MetadataBuilder
class RemoveRedundantAliasAndProjectSuite extends PlanTest {
@@ -130,4 +131,51 @@ class RemoveRedundantAliasAndProjectSuite extends PlanTest {
correlated = false)
comparePlans(optimized, expected)
}
+
+ test("SPARK-46640: do not remove outer references from a subquery expression") {
+ val a = $"a".int
+ val a_alias = Alias(a, "a")()
+ val a_alias_attr = a_alias.toAttribute
+ val b = $"b".int
+
+ // The original input query
+ // Filter exists [a#1 && (a#1 = b#2)]
+ // : +- LocalRelation <empty>, [b#2]
+ // +- Project [a#0 AS a#1]
+ // +- LocalRelation <empty>, [a#0]
+ val query = Filter(
+ Exists(
+ LocalRelation(b),
+ outerAttrs = Seq(a_alias_attr),
+ joinCond = Seq(EqualTo(a_alias_attr, b))
+ ),
+ Project(Seq(a_alias), LocalRelation(a))
+ )
+
+ // The alias would not be removed if excluding subquery references is enabled.
+ val expectedWhenExcluded = query
+
+ // The alias would have been removed if excluding subquery references is disabled.
+ // Filter exists [a#0 && (a#0 = b#2)]
+ // : +- LocalRelation <empty>, [b#2]
+ // +- LocalRelation <empty>, [a#0]
+ val expectedWhenNotExcluded = Filter(
+ Exists(
+ LocalRelation(b),
+ outerAttrs = Seq(a),
+ joinCond = Seq(EqualTo(a, b))
+ ),
+ LocalRelation(a)
+ )
+
+ withSQLConf(SQLConf.EXCLUDE_SUBQUERY_EXP_REFS_FROM_REMOVE_REDUNDANT_ALIASES.key -> "true") {
+ val optimized = Optimize.execute(query)
+ comparePlans(optimized, expectedWhenExcluded)
+ }
+
+ withSQLConf(SQLConf.EXCLUDE_SUBQUERY_EXP_REFS_FROM_REMOVE_REDUNDANT_ALIASES.key -> "false") {
+ val optimized = Optimize.execute(query)
+ comparePlans(optimized, expectedWhenNotExcluded)
+ }
+ }
}
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org