You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by gu...@apache.org on 2022/06/16 00:39:56 UTC
[spark] branch branch-3.2 updated: [SPARK-39061][SQL] Set nullable correctly for `Inline` output attributes

This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch branch-3.2
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.2 by this push:
     new f23a5441d49 [SPARK-39061][SQL] Set nullable correctly for `Inline` output attributes
f23a5441d49 is described below

commit f23a5441d49c74331102d59fb6ee71617163014d
Author: Bruce Robbins <be...@gmail.com>
AuthorDate: Thu Jun 16 09:38:50 2022 +0900

    [SPARK-39061][SQL] Set nullable correctly for `Inline` output attributes
    
    Change `Inline#elementSchema` to make each struct field nullable when the containing array has a null element.
    
    This query returns incorrect results (the last row should be `NULL NULL`):
    ```
    spark-sql> select inline(array(named_struct('a', 1, 'b', 2), null));
    1       2
    -1      -1
    Time taken: 4.053 seconds, Fetched 2 row(s)
    spark-sql>
    ```
    And this query gets a NullPointerException:
    ```
    spark-sql> select inline(array(named_struct('a', '1', 'b', '2'), null));
    22/04/28 16:51:54 ERROR Executor: Exception in task 0.0 in stage 2.0 (TID 2)
    java.lang.NullPointerException: null
            at org.apache.spark.sql.catalyst.expressions.codegen.UnsafeWriter.write(UnsafeWriter.java:110) ~[spark-catalyst_2.12-3.4.0-SNAPSHOT.jar:3.4.0-SNAPSHOT]
            at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.generate_doConsume_0$(Unknown Source) ~[?:?]
            at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source) ~[?:?]
            at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(Buffere
    ```
    When an array of structs is created by `CreateArray`, and no struct field contains a literal null value, the schema for the struct will have non-nullable fields, even if the array itself has a null entry (as in the example above). As a result, the output attributes for the generator will be non-nullable.
    
    When the output attributes for `Inline` are non-nullable, `GenerateUnsafeProjection#writeExpressionsToBuffer` generates incorrect code for null structs.
    
    In more detail, the issue is this: `GenerateExec#codeGenCollection` generates code that will check if the struct instance (i.e., array element) is null and, if so, set a boolean for each struct field to indicate that the field contains a null. However, unless the generator's output attributes are nullable, `GenerateUnsafeProjection#writeExpressionsToBuffer` will not generate any code to check those booleans. Instead it will generate code to write out whatever is in the variables that  [...]
    
    Arrays of structs from file sources do not have this issue. In that case, each `StructField` will have nullable=true due to [this](https://github.com/apache/spark/blob/fe85d7912f86c3e337aa93b23bfa7e7e01c0a32e/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSource.scala#L417).
    
    (Note: the eval path for `Inline` has a different bug with null array elements that occurs even when `nullable` is set correctly in the schema, but I will address that in a separate PR).
    
    No.
    
    New unit test.
    
    Closes #36883 from bersprockets/inline_struct_nullability_issue.
    
    Authored-by: Bruce Robbins <be...@gmail.com>
    Signed-off-by: Hyukjin Kwon <gu...@apache.org>
    (cherry picked from commit fc385dafabe3c609b38b81deaaf36e5eb6ee341b)
    Signed-off-by: Hyukjin Kwon <gu...@apache.org>
---
 .../sql/catalyst/expressions/generators.scala      |  3 ++-
 .../apache/spark/sql/GeneratorFunctionSuite.scala  | 25 ++++++++++++++++++++++
 2 files changed, 27 insertions(+), 1 deletion(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/generators.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/generators.scala
index 5214bff3543..1079f0a333d 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/generators.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/generators.scala
@@ -444,7 +444,8 @@ case class Inline(child: Expression) extends UnaryExpression with CollectionGene
   }
 
   override def elementSchema: StructType = child.dataType match {
-    case ArrayType(st: StructType, _) => st
+    case ArrayType(st: StructType, false) => st
+    case ArrayType(st: StructType, true) => st.asNullable
   }
 
   override def collectionType: DataType = child.dataType
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/GeneratorFunctionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/GeneratorFunctionSuite.scala
index a4f4f2ead78..d41e87f516f 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/GeneratorFunctionSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/GeneratorFunctionSuite.scala
@@ -364,6 +364,31 @@ class GeneratorFunctionSuite extends QueryTest with SharedSparkSession {
       df.select(Stream(explode(array(min($"v"), max($"v"))), sum($"v")): _*),
       Row(1, 6) :: Row(3, 6) :: Nil)
   }
+
+  test("SPARK-39061: inline should handle null struct") {
+    val df = sql(
+      """select * from values
+        |(
+        |  1,
+        |  array(
+        |    named_struct('c1', 0, 'c2', 1),
+        |    null,
+        |    named_struct('c1', 2, 'c2', 3),
+        |    null
+        |  )
+        |)
+        |as tbl(a, b)
+         """.stripMargin)
+    df.createOrReplaceTempView("t1")
+
+    checkAnswer(
+      sql("select inline(b) from t1"),
+      Row(0, 1) :: Row(null, null) :: Row(2, 3) :: Row(null, null) :: Nil)
+
+    checkAnswer(
+      sql("select a, inline(b) from t1"),
+      Row(1, 0, 1) :: Row(1, null, null) :: Row(1, 2, 3) :: Row(1, null, null) :: Nil)
+  }
 }
 
 case class EmptyGenerator() extends Generator with LeafLike[Expression] {


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org