You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by vi...@apache.org on 2022/03/07 20:05:58 UTC

[spark] branch master updated: [SPARK-38285][SQL] Avoid generator pruning for invalid extractor

This is an automated email from the ASF dual-hosted git repository.

viirya pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new 71991f7  [SPARK-38285][SQL] Avoid generator pruning for invalid extractor
71991f7 is described below

commit 71991f75ff441e80a52cb71f66f46bfebdb05671
Author: Liang-Chi Hsieh <vi...@gmail.com>
AuthorDate: Mon Mar 7 12:04:24 2022 -0800

    [SPARK-38285][SQL] Avoid generator pruning for invalid extractor
    
    ### What changes were proposed in this pull request?
    
    This fixes a bug in generator nested column pruning. The bug happens when the extractor pattern on the generator output is like `GetArrayStructFields(GetStructField(...), ...)`. When the input to the generator is an array, replacing the generator output with the extractor (as the pruning logic does) turns the expression into `GetArrayStructFields(GetArrayStructFields(...), ...)`, which is not a valid extractor.
    
    ### Why are the changes needed?
    
    To fix a bug in generator nested column pruning.
    
    ### Does this PR introduce _any_ user-facing change?
    
    Yes, fixing a user-facing bug.
    
    ### How was this patch tested?
    
    Added unit test.
    
    Closes #35749 from viirya/SPARK-38285.
    
    Authored-by: Liang-Chi Hsieh <vi...@gmail.com>
    Signed-off-by: Liang-Chi Hsieh <vi...@gmail.com>
---
 .../catalyst/optimizer/NestedColumnAliasing.scala    | 11 +++++++++++
 .../scala/org/apache/spark/sql/DataFrameSuite.scala  | 20 ++++++++++++++++++++
 2 files changed, 31 insertions(+)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/NestedColumnAliasing.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/NestedColumnAliasing.scala
index c8c67f5..a2ee950 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/NestedColumnAliasing.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/NestedColumnAliasing.scala
@@ -372,6 +372,17 @@ object GeneratorNestedColumnAliasing {
                 e.withNewChildren(Seq(extractor))
             }
 
+            // If, after replacing the generator expression with the nested extractor, there
+            // is an invalid extractor pattern like
+            // `GetArrayStructFields(GetArrayStructFields(...), ...)`, we cannot do
+            // pruning and must fall back to the original query plan.
+            val invalidExtractor = rewrittenG.generator.children.head.collect {
+              case GetArrayStructFields(_: GetArrayStructFields, _, _, _, _) => true
+            }
+            if (invalidExtractor.nonEmpty) {
+              return Some(pushedThrough)
+            }
+
             // As we change the child of the generator, its output data type must be updated.
             val updatedGeneratorOutput = rewrittenG.generatorOutput
               .zip(rewrittenG.generator.elementSchema.toAttributes)
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
index c7d05df..3eb9764 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala
@@ -3107,6 +3107,26 @@ class DataFrameSuite extends QueryTest
 
     assert(res.collect.length == 2)
   }
+
+  test("SPARK-38285: Fix ClassCastException: GenericArrayData cannot be cast to InternalRow") {
+    withTempView("v1") {
+      val sqlText =
+        """
+          |CREATE OR REPLACE TEMP VIEW v1 AS
+          |SELECT * FROM VALUES
+          |(array(
+          |  named_struct('s', 'string1', 'b', array(named_struct('e', 'string2'))),
+          |  named_struct('s', 'string4', 'b', array(named_struct('e', 'string5')))
+          |  )
+          |)
+          |v1(o);
+          |""".stripMargin
+      sql(sqlText)
+
+      val df = sql("SELECT eo.b.e FROM (SELECT explode(o) AS eo FROM v1)")
+      checkAnswer(df, Row(Seq("string2")) :: Row(Seq("string5")) :: Nil)
+    }
+  }
 }
 
 case class GroupByKey(a: Int, b: Int)

---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org