You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by vi...@apache.org on 2021/07/06 07:10:35 UTC

[spark] branch branch-3.2 updated: [SPARK-35972][SQL] When replace ExtractValue in NestedColumnAliasing we should use semanticEquals

This is an automated email from the ASF dual-hosted git repository.

viirya pushed a commit to branch branch-3.2
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.2 by this push:
     new c2ef235  [SPARK-35972][SQL] When replace ExtractValue in NestedColumnAliasing we should use semanticEquals
c2ef235 is described below

commit c2ef2354198ae156c3c442547669b6fd4f1c148a
Author: Angerszhuuuu <an...@gmail.com>
AuthorDate: Tue Jul 6 00:09:34 2021 -0700

    [SPARK-35972][SQL] When replace ExtractValue in NestedColumnAliasing we should use semanticEquals
    
    ### What changes were proposed in this pull request?
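    Ideally, in a SQL query, nested column accesses should resolve to a GetStructField with a non-None name. However, some code paths can create a GetStructField whose name is None, such as UnresolvedStar.expand and the Dataset encoders.
    The current `nestedFieldToAlias` lookup does not match such expressions, which causes the job to fail. This change keys the map by each field's canonicalized form and looks up `f.canonicalized`, so that semantically equal ExtractValue expressions are replaced by the same alias.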
    
    ### Why are the changes needed?
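    This fixes a bug: without it, a GetStructField with a None name is not matched in `nestedFieldToAlias`, which causes the job to fail.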
    
    ### Does this PR introduce _any_ user-facing change?
    No
    
    ### How was this patch tested?
    Added a unit test.
    
    Closes #33183 from AngersZhuuuu/SPARK-35972.
    
    Authored-by: Angerszhuuuu <an...@gmail.com>
    Signed-off-by: Liang-Chi Hsieh <vi...@gmail.com>
    (cherry picked from commit 87282f04bf8810882187f6759d4a675190ca9b0b)
    Signed-off-by: Liang-Chi Hsieh <vi...@gmail.com>
---
 .../catalyst/optimizer/NestedColumnAliasing.scala  | 15 +++++++------
 .../optimizer/NestedColumnAliasingSuite.scala      | 25 ++++++++++++++++++++++
 2 files changed, 33 insertions(+), 7 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/NestedColumnAliasing.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/NestedColumnAliasing.scala
index 9ea3c1d..9facae3b 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/NestedColumnAliasing.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/NestedColumnAliasing.scala
@@ -144,7 +144,8 @@ object NestedColumnAliasing {
         attr -> evAliasSeq
       }
 
-    val nestedFieldToAlias = attributeToExtractValuesAndAliases.values.flatten.toMap
+    val nestedFieldToAlias = attributeToExtractValuesAndAliases.values.flatten
+      .map { case (field, alias) => field.canonicalized -> alias }.toMap
 
     // A reference attribute can have multiple aliases for nested fields.
     val attrToAliases =
@@ -167,10 +168,10 @@ object NestedColumnAliasing {
    */
   def getNewProjectList(
       projectList: Seq[NamedExpression],
-      nestedFieldToAlias: Map[ExtractValue, Alias]): Seq[NamedExpression] = {
+      nestedFieldToAlias: Map[Expression, Alias]): Seq[NamedExpression] = {
     projectList.map(_.transform {
-      case f: ExtractValue if nestedFieldToAlias.contains(f) =>
-        nestedFieldToAlias(f).toAttribute
+      case f: ExtractValue if nestedFieldToAlias.contains(f.canonicalized) =>
+        nestedFieldToAlias(f.canonicalized).toAttribute
     }.asInstanceOf[NamedExpression])
   }
 
@@ -180,13 +181,13 @@ object NestedColumnAliasing {
    */
   def replaceWithAliases(
       plan: LogicalPlan,
-      nestedFieldToAlias: Map[ExtractValue, Alias],
+      nestedFieldToAlias: Map[Expression, Alias],
       attrToAliases: AttributeMap[Seq[Alias]]): LogicalPlan = {
     plan.withNewChildren(plan.children.map { plan =>
       Project(plan.output.flatMap(a => attrToAliases.getOrElse(a, Seq(a))), plan)
     }).transformExpressions {
-      case f: ExtractValue if nestedFieldToAlias.contains(f) =>
-        nestedFieldToAlias(f).toAttribute
+      case f: ExtractValue if nestedFieldToAlias.contains(f.canonicalized) =>
+        nestedFieldToAlias(f.canonicalized).toAttribute
     }
   }
 
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/NestedColumnAliasingSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/NestedColumnAliasingSuite.scala
index 19aec2a..e49e028 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/NestedColumnAliasingSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/NestedColumnAliasingSuite.scala
@@ -738,6 +738,31 @@ class NestedColumnAliasingSuite extends SchemaPruningTest {
     val optimized = Optimize.execute(query)
     comparePlans(optimized, query)
   }
+
+  test("SPARK-35972: NestedColumnAliasing should consider semantic equality") {
+    val dataType = new StructType()
+      .add(StructField("itemid", StringType))
+      .add(StructField("search_params", StructType(Seq(
+        StructField("col1", StringType),
+        StructField("col2", StringType)
+      ))))
+    val relation = LocalRelation('struct_data.struct(dataType))
+    val plan = relation
+      .repartition(100)
+      .select(
+        GetStructField('struct_data, 1, None).as("value"),
+        $"struct_data.search_params.col1".as("col1"),
+        $"struct_data.search_params.col2".as("col2")).analyze
+    val query = Optimize.execute(plan)
+    val optimized = relation
+      .select(GetStructField('struct_data, 1, None).as("_extract_search_params"))
+      .repartition(100)
+      .select(
+        $"_extract_search_params".as("value"),
+        $"_extract_search_params.col1".as("col1"),
+        $"_extract_search_params.col2".as("col2")).analyze
+    comparePlans(optimized, query)
+  }
 }
 
 object NestedColumnAliasingSuite {

---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org