You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by gu...@apache.org on 2023/02/13 06:50:02 UTC
[spark] branch master updated: [SPARK-42401][SQL] Set `containsNull` correctly in the data type for array_insert/array_append

This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new 718b6b7ed82 [SPARK-42401][SQL] Set `containsNull` correctly in the data type for array_insert/array_append
718b6b7ed82 is described below

commit 718b6b7ed8277d5f6577367ab0d49f60f9777df7
Author: Bruce Robbins <be...@gmail.com>
AuthorDate: Mon Feb 13 15:49:46 2023 +0900

    [SPARK-42401][SQL] Set `containsNull` correctly in the data type for array_insert/array_append
    
    ### What changes were proposed in this pull request?
    
    In the `DataType` instance returned by `ArrayInsert#dataType` and `ArrayAppend#dataType`, set `containsNull` to true if either
    
    - the input array has `containsNull` set to true
    - the expression to be inserted/appended is nullable.
    
    ### Why are the changes needed?
    
    The following two queries return the wrong answer:
    ```
    spark-sql> select array_insert(array(1, 2, 3, 4), 5, cast(null as int));
    [1,2,3,4,0] <== should be [1,2,3,4,null]
    Time taken: 3.879 seconds, Fetched 1 row(s)
    spark-sql> select array_append(array(1, 2, 3, 4), cast(null as int));
    [1,2,3,4,0] <== should be [1,2,3,4,null]
    Time taken: 0.068 seconds, Fetched 1 row(s)
    spark-sql>
    ```
    The following two queries throw a `NullPointerException`:
    ```
    spark-sql> select array_insert(array('1', '2', '3', '4'), 5, cast(null as string));
    23/02/10 11:24:59 ERROR Executor: Exception in task 0.0 in stage 2.0 (TID 2)
    java.lang.NullPointerException
            at org.apache.spark.sql.catalyst.expressions.codegen.UnsafeWriter.write(UnsafeWriter.java:110)
            at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.project_doConsume_0$(Unknown Source)
            at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
    ...
    spark-sql> select array_append(array('1', '2', '3', '4'), cast(null as string));
    23/02/10 11:25:10 ERROR Executor: Exception in task 0.0 in stage 3.0 (TID 3)
    java.lang.NullPointerException
            at org.apache.spark.sql.catalyst.expressions.codegen.UnsafeWriter.write(UnsafeWriter.java:110)
            at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.project_doConsume_0$(Unknown Source)
            at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
    ...
    spark-sql>
    ```
    The bug arises because both `ArrayInsert` and `ArrayAppend` use the first child's data type as the function's data type. That is, it uses the first child's `containsNull` setting, regardless of whether the insert/append operation might produce an array containing a null value.
    
    ### Does this PR introduce _any_ user-facing change?
    
    No.
    
    ### How was this patch tested?
    
    New unit tests.
    
    Closes #39970 from bersprockets/array_insert_anomaly.
    
    Authored-by: Bruce Robbins <be...@gmail.com>
    Signed-off-by: Hyukjin Kwon <gu...@apache.org>
---
 .../sql/catalyst/expressions/collectionOperations.scala    |  4 ++--
 .../catalyst/expressions/CollectionExpressionsSuite.scala  | 14 ++++++++++++++
 .../org/apache/spark/sql/DataFrameFunctionsSuite.scala     | 14 ++++++++++++++
 3 files changed, 30 insertions(+), 2 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala
index 92a3127d438..53d8ff160c0 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala
@@ -4840,7 +4840,7 @@ case class ArrayInsert(srcArrayExpr: Expression, posExpr: Expression, itemExpr:
   override def third: Expression = itemExpr
 
   override def prettyName: String = "array_insert"
-  override def dataType: DataType = first.dataType
+  override def dataType: DataType = if (third.nullable) first.dataType.asNullable else first.dataType
   override def nullable: Boolean = first.nullable | second.nullable
 
   @transient private lazy val elementType: DataType =
@@ -5024,7 +5024,7 @@ case class ArrayAppend(left: Expression, right: Expression)
    * Returns the [[DataType]] of the result of evaluating this expression. It is invalid to query
    * the dataType of an unresolved expression (i.e., when `resolved` == false).
    */
-  override def dataType: DataType = left.dataType
+  override def dataType: DataType = if (right.nullable) left.dataType.asNullable else left.dataType
   protected def withNewChildrenInternal(newLeft: Expression, newRight: Expression): ArrayAppend =
     copy(left = newLeft, right = newRight)
 
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollectionExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollectionExpressionsSuite.scala
index 5917d84df1e..64b9c18605d 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollectionExpressionsSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollectionExpressionsSuite.scala
@@ -2752,4 +2752,18 @@ class CollectionExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper
     )
 
   }
+
+  test("SPARK-42401: Array insert of null value") {
+    val a = Literal.create(Seq("b", "a", "c"), ArrayType(StringType, false))
+    checkEvaluation(ArrayInsert(
+      a, Literal(2), Literal.create(null, StringType)), Seq("b", null, "a", "c")
+    )
+  }
+
+  test("SPARK-42401: Array append of null value") {
+    val a = Literal.create(Seq("b", "a", "c"), ArrayType(StringType, false))
+    checkEvaluation(ArrayAppend(
+      a, Literal.create(null, StringType)), Seq("b", "a", "c", null)
+    )
+  }
 }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
index 6ed8299976c..94f813a2c6b 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
@@ -5431,6 +5431,20 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSparkSession {
       Seq(Row(Seq(1, 2, 3, null, null)))
     )
   }
+
+  test("SPARK-42401: array_insert - insert null") {
+    checkAnswer(
+      sql("select array_insert(array('b', 'a', 'c'), 2, cast(null as string))"),
+      Seq(Row(Seq("b", null, "a", "c")))
+    )
+  }
+
+  test("SPARK-42401: array_append - append null") {
+    checkAnswer(
+      sql("select array_append(array('b', 'a', 'c'), cast(null as string))"),
+      Seq(Row(Seq("b", "a", "c", null)))
+    )
+  }
 }
 
 object DataFrameFunctionsSuite {


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org