You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by vi...@apache.org on 2021/07/06 07:10:04 UTC
[spark] branch master updated: [SPARK-35972][SQL] When replace
ExtractValue in NestedColumnAliasing we should use semanticEquals
This is an automated email from the ASF dual-hosted git repository.
viirya pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 87282f0 [SPARK-35972][SQL] When replace ExtractValue in NestedColumnAliasing we should use semanticEquals
87282f0 is described below
commit 87282f04bf8810882187f6759d4a675190ca9b0b
Author: Angerszhuuuu <an...@gmail.com>
AuthorDate: Tue Jul 6 00:09:34 2021 -0700
[SPARK-35972][SQL] When replace ExtractValue in NestedColumnAliasing we should use semanticEquals
### What changes were proposed in this pull request?
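Ideally, in a SQL query, nested column accesses should resolve to `GetStructField` with a non-None name. However, there are places that can create `GetStructField` with a None name, such as `UnresolvedStar.expand`, the Dataset encoders, etc.
The current `nestedFieldToAlias` map is keyed by the exact `ExtractValue` expression, so it cannot match such expressions even when they are semantically equal, and the job fails. This PR keys the map by the canonicalized expression and looks entries up with `f.canonicalized`, so that semantically equal `ExtractValue`s are replaced by the same alias.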
### Why are the changes needed?
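Bug fix: without this change, `NestedColumnAliasing` does not replace an `ExtractValue` that is only semantically (not structurally) equal to an aliased nested field, which causes the job to fail.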
### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?
Added a unit test to `NestedColumnAliasingSuite`.
Closes #33183 from AngersZhuuuu/SPARK-35972.
Authored-by: Angerszhuuuu <an...@gmail.com>
Signed-off-by: Liang-Chi Hsieh <vi...@gmail.com>
---
.../catalyst/optimizer/NestedColumnAliasing.scala | 15 +++++++------
.../optimizer/NestedColumnAliasingSuite.scala | 25 ++++++++++++++++++++++
2 files changed, 33 insertions(+), 7 deletions(-)
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/NestedColumnAliasing.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/NestedColumnAliasing.scala
index 9ea3c1d..9facae3b 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/NestedColumnAliasing.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/NestedColumnAliasing.scala
@@ -144,7 +144,8 @@ object NestedColumnAliasing {
attr -> evAliasSeq
}
- val nestedFieldToAlias = attributeToExtractValuesAndAliases.values.flatten.toMap
+ val nestedFieldToAlias = attributeToExtractValuesAndAliases.values.flatten
+ .map { case (field, alias) => field.canonicalized -> alias }.toMap
// A reference attribute can have multiple aliases for nested fields.
val attrToAliases =
@@ -167,10 +168,10 @@ object NestedColumnAliasing {
*/
def getNewProjectList(
projectList: Seq[NamedExpression],
- nestedFieldToAlias: Map[ExtractValue, Alias]): Seq[NamedExpression] = {
+ nestedFieldToAlias: Map[Expression, Alias]): Seq[NamedExpression] = {
projectList.map(_.transform {
- case f: ExtractValue if nestedFieldToAlias.contains(f) =>
- nestedFieldToAlias(f).toAttribute
+ case f: ExtractValue if nestedFieldToAlias.contains(f.canonicalized) =>
+ nestedFieldToAlias(f.canonicalized).toAttribute
}.asInstanceOf[NamedExpression])
}
@@ -180,13 +181,13 @@ object NestedColumnAliasing {
*/
def replaceWithAliases(
plan: LogicalPlan,
- nestedFieldToAlias: Map[ExtractValue, Alias],
+ nestedFieldToAlias: Map[Expression, Alias],
attrToAliases: AttributeMap[Seq[Alias]]): LogicalPlan = {
plan.withNewChildren(plan.children.map { plan =>
Project(plan.output.flatMap(a => attrToAliases.getOrElse(a, Seq(a))), plan)
}).transformExpressions {
- case f: ExtractValue if nestedFieldToAlias.contains(f) =>
- nestedFieldToAlias(f).toAttribute
+ case f: ExtractValue if nestedFieldToAlias.contains(f.canonicalized) =>
+ nestedFieldToAlias(f.canonicalized).toAttribute
}
}
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/NestedColumnAliasingSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/NestedColumnAliasingSuite.scala
index 19aec2a..e49e028 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/NestedColumnAliasingSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/NestedColumnAliasingSuite.scala
@@ -738,6 +738,31 @@ class NestedColumnAliasingSuite extends SchemaPruningTest {
val optimized = Optimize.execute(query)
comparePlans(optimized, query)
}
+
+ test("SPARK-35972: NestedColumnAliasing should consider semantic equality") {
+ val dataType = new StructType()
+ .add(StructField("itemid", StringType))
+ .add(StructField("search_params", StructType(Seq(
+ StructField("col1", StringType),
+ StructField("col2", StringType)
+ ))))
+ val relation = LocalRelation('struct_data.struct(dataType))
+ val plan = relation
+ .repartition(100)
+ .select(
+ GetStructField('struct_data, 1, None).as("value"),
+ $"struct_data.search_params.col1".as("col1"),
+ $"struct_data.search_params.col2".as("col2")).analyze
+ val query = Optimize.execute(plan)
+ val optimized = relation
+ .select(GetStructField('struct_data, 1, None).as("_extract_search_params"))
+ .repartition(100)
+ .select(
+ $"_extract_search_params".as("value"),
+ $"_extract_search_params.col1".as("col1"),
+ $"_extract_search_params.col2".as("col2")).analyze
+ comparePlans(optimized, query)
+ }
}
object NestedColumnAliasingSuite {
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org