You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by we...@apache.org on 2024/01/12 02:21:04 UTC
(spark) branch master updated: [SPARK-46640][SQL] Fix RemoveRedundantAlias by excluding subquery attributes

This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new bbeb8d7417ba [SPARK-46640][SQL] Fix RemoveRedundantAlias by excluding subquery attributes
bbeb8d7417ba is described below

commit bbeb8d7417bafa09ad5202347175a47b3217e27f
Author: Nikhil Sheoran <12...@users.noreply.github.com>
AuthorDate: Fri Jan 12 10:20:49 2024 +0800

    [SPARK-46640][SQL] Fix RemoveRedundantAlias by excluding subquery attributes
    
    ### What changes were proposed in this pull request?
    - In `RemoveRedundantAliases`, we have an `excluded` AttributeSet argument denoting the references for which we should not remove aliases. For a query with subquery expressions, adding the attributes references by the subquery in the `excluded` set prevents rewrites that might remove presumedly redundant aliases. (Changes in RemoveRedundantAlias)
    - Added a configuration flag to disable this fix, if not needed.
    - Added a unit test with Filter exists subquery expression to show how the alias would have been removed.
    
    ### Why are the changes needed?
    - `RemoveRedundantAliases` does not take into account the outer attributes of a `SubqueryExpression` when considering redundant aliases, potentially removing them if it thinks they are redundant.
    - This can cause scenarios where a subquery expression has conditions like `a#x = a#x` i.e. both the attribute names and the expression ID(s) are the same. This can then lead to conflicting expression ID(s) error.
    - For example, in the query example below, the `RemoveRedundantAliases` would remove the alias `a#0 as a#1` and replace `a#1` with `a#0` in the Filter exists subquery expression which would create an issue if the subquery expression had an attribute with reference `a#0` (possible due to different scan relation instances possibly having the same attribute ID(s) (Ref: #40662)
    ```
    Filter exists [a#1 && (a#1 = b#2)]
    :  +- LocalRelation <empty>, [b#2]
      +- Project [a#0 AS a#1]
      +- LocalRelation <empty>, [a#0]
    ```
    becomes
    ```
    Filter exists [a#0 && (a#0 = b#2)]
    :  +- LocalRelation <empty>, [b#2]
      +- LocalRelation <empty>, [a#0]
    ```
    - The changes are needed to fix this bug.
    
    ### Does this PR introduce _any_ user-facing change?
    No
    
    ### How was this patch tested?
    - Added a unit test with Filter exists subquery expression to show how the alias would have been removed.
    
    ### Was this patch authored or co-authored using generative AI tooling?
    No
    
    Closes #44645 from nikhilsheoran-db/SPARK-46640.
    
    Authored-by: Nikhil Sheoran <12...@users.noreply.github.com>
    Signed-off-by: Wenchen Fan <we...@databricks.com>
---
 .../spark/sql/catalyst/optimizer/Optimizer.scala   | 12 +++++-
 .../org/apache/spark/sql/internal/SQLConf.scala    |  9 ++++
 .../RemoveRedundantAliasAndProjectSuite.scala      | 48 ++++++++++++++++++++++
 3 files changed, 68 insertions(+), 1 deletion(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
index 61791b35df85..8fcc7c7c26b4 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
@@ -584,10 +584,20 @@ object RemoveRedundantAliases extends Rule[LogicalPlan] {
         }
 
       case _ =>
+        val subQueryAttributes = if (conf.getConf(SQLConf
+          .EXCLUDE_SUBQUERY_EXP_REFS_FROM_REMOVE_REDUNDANT_ALIASES)) {
+          // Collect the references for all the subquery expressions in the plan.
+          AttributeSet.fromAttributeSets(plan.expressions.collect {
+            case e: SubqueryExpression => e.references
+          })
+        } else {
+          AttributeSet.empty
+        }
+
         // Remove redundant aliases in the subtree(s).
         val currentNextAttrPairs = mutable.Buffer.empty[(Attribute, Attribute)]
         val newNode = plan.mapChildren { child =>
-          val newChild = removeRedundantAliases(child, excluded)
+          val newChild = removeRedundantAliases(child, excluded ++ subQueryAttributes)
           currentNextAttrPairs ++= createAttributeMapping(child, newChild)
           newChild
         }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
index 1928e74363cb..743a2e20c885 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
@@ -4513,6 +4513,15 @@ object SQLConf {
       .booleanConf
       .createWithDefault(true)
 
+  val EXCLUDE_SUBQUERY_EXP_REFS_FROM_REMOVE_REDUNDANT_ALIASES =
+    buildConf("spark.sql.optimizer.excludeSubqueryRefsFromRemoveRedundantAliases.enabled")
+      .internal()
+      .doc("When true, exclude the references from the subquery expressions (in, exists, etc.) " +
+        s"while removing redundant aliases.")
+      .version("4.0.0")
+      .booleanConf
+      .createWithDefault(true)
+
   val TIME_TRAVEL_TIMESTAMP_KEY =
     buildConf("spark.sql.timeTravelTimestampKey")
       .doc("The option name to specify the time travel timestamp when reading a table.")
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/RemoveRedundantAliasAndProjectSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/RemoveRedundantAliasAndProjectSuite.scala
index cd19e5062ae1..8a0a0466ca74 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/RemoveRedundantAliasAndProjectSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/RemoveRedundantAliasAndProjectSuite.scala
@@ -23,6 +23,7 @@ import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.plans.PlanTest
 import org.apache.spark.sql.catalyst.plans.logical._
 import org.apache.spark.sql.catalyst.rules._
+import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.types.MetadataBuilder
 
 class RemoveRedundantAliasAndProjectSuite extends PlanTest {
@@ -130,4 +131,51 @@ class RemoveRedundantAliasAndProjectSuite extends PlanTest {
       correlated = false)
     comparePlans(optimized, expected)
   }
+
+  test("SPARK-46640: do not remove outer references from a subquery expression") {
+    val a = $"a".int
+    val a_alias = Alias(a, "a")()
+    val a_alias_attr = a_alias.toAttribute
+    val b = $"b".int
+
+    // The original input query
+    //  Filter exists [a#1 && (a#1 = b#2)]
+    //  :  +- LocalRelation <empty>, [b#2]
+    //    +- Project [a#0 AS a#1]
+    //    +- LocalRelation <empty>, [a#0]
+    val query = Filter(
+      Exists(
+        LocalRelation(b),
+        outerAttrs = Seq(a_alias_attr),
+        joinCond = Seq(EqualTo(a_alias_attr, b))
+      ),
+      Project(Seq(a_alias), LocalRelation(a))
+    )
+
+    // The alias would not be removed if excluding subquery references is enabled.
+    val expectedWhenExcluded = query
+
+    // The alias would have been removed if excluding subquery references is disabled.
+    //  Filter exists [a#0 && (a#0 = b#2)]
+    //  :  +- LocalRelation <empty>, [b#2]
+    //    +- LocalRelation <empty>, [a#0]
+    val expectedWhenNotExcluded = Filter(
+      Exists(
+        LocalRelation(b),
+        outerAttrs = Seq(a),
+        joinCond = Seq(EqualTo(a, b))
+      ),
+      LocalRelation(a)
+    )
+
+    withSQLConf(SQLConf.EXCLUDE_SUBQUERY_EXP_REFS_FROM_REMOVE_REDUNDANT_ALIASES.key -> "true") {
+      val optimized = Optimize.execute(query)
+      comparePlans(optimized, expectedWhenExcluded)
+    }
+
+    withSQLConf(SQLConf.EXCLUDE_SUBQUERY_EXP_REFS_FROM_REMOVE_REDUNDANT_ALIASES.key -> "false") {
+      val optimized = Optimize.execute(query)
+      comparePlans(optimized, expectedWhenNotExcluded)
+    }
+  }
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org