You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by gu...@apache.org on 2023/02/15 11:38:54 UTC
[spark] branch branch-3.4 updated: [SPARK-42401][SQL][FOLLOWUP] Always set `containsNull=true` for `array_insert`

This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch branch-3.4
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.4 by this push:
     new 9e85162926a [SPARK-42401][SQL][FOLLOWUP] Always set `containsNull=true` for `array_insert`
9e85162926a is described below

commit 9e85162926a852b19aa1338e4ef3c8c33d02be0e
Author: Bruce Robbins <be...@gmail.com>
AuthorDate: Wed Feb 15 20:38:30 2023 +0900

    [SPARK-42401][SQL][FOLLOWUP] Always set `containsNull=true` for `array_insert`
    
    ### What changes were proposed in this pull request?
    
    Always set `containsNull=true` in the data type returned by `ArrayInsert#dataType`.
    
    ### Why are the changes needed?
    
    PR #39970 fixed an issue where the data type for `array_insert` did not always have `containsNull=true` when the user was explicitly inserting a nullable value into the array. However, that fix does not handle the case where `array_insert` implicitly inserts null values into the array (e.g., when the insertion position is out-of-range):
    ```
    spark-sql> select array_insert(array('1', '2', '3', '4'), -6, '5');
    23/02/14 16:10:19 ERROR Executor: Exception in task 0.0 in stage 0.0 (TID 0)
    java.lang.NullPointerException
            at org.apache.spark.sql.catalyst.expressions.codegen.UnsafeWriter.write(UnsafeWriter.java:110)
            at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.project_doConsume_0$(Unknown Source)
            at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
            at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
    ```
    Because we can't know at planning time whether the insertion position will be out of range, we should always set `containsNull=true` on the data type for `array_insert`.
    
    ### Does this PR introduce _any_ user-facing change?
    
    No.
    
    ### How was this patch tested?
    
    New unit tests.
    
    Closes #40026 from bersprockets/array_insert_null_anytime.
    
    Authored-by: Bruce Robbins <be...@gmail.com>
    Signed-off-by: Hyukjin Kwon <gu...@apache.org>
    (cherry picked from commit fb5b44f46b35562330e5e89133a0bca8e0bee36b)
    Signed-off-by: Hyukjin Kwon <gu...@apache.org>
---
 .../spark/sql/catalyst/expressions/collectionOperations.scala    | 2 +-
 .../sql/catalyst/expressions/CollectionExpressionsSuite.scala    | 9 ++++++++-
 .../scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala     | 9 ++++++++-
 3 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala
index 53d8ff160c0..28c4a9eba68 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala
@@ -4840,7 +4840,7 @@ case class ArrayInsert(srcArrayExpr: Expression, posExpr: Expression, itemExpr:
   override def third: Expression = itemExpr
 
   override def prettyName: String = "array_insert"
-  override def dataType: DataType = if (third.nullable) first.dataType.asNullable else first.dataType
+  override def dataType: DataType = first.dataType.asNullable // out of range pos will add nulls
   override def nullable: Boolean = first.nullable | second.nullable
 
   @transient private lazy val elementType: DataType =
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollectionExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollectionExpressionsSuite.scala
index 64b9c18605d..60300ba62f2 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollectionExpressionsSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollectionExpressionsSuite.scala
@@ -2753,13 +2753,20 @@ class CollectionExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper
 
   }
 
-  test("SPARK-42401: Array insert of null value") {
+  test("SPARK-42401: Array insert of null value (explicit)") {
     val a = Literal.create(Seq("b", "a", "c"), ArrayType(StringType, false))
     checkEvaluation(ArrayInsert(
       a, Literal(2), Literal.create(null, StringType)), Seq("b", null, "a", "c")
     )
   }
 
+  test("SPARK-42401: Array insert of null value (implicit)") {
+    val a = Literal.create(Seq("b", "a", "c"), ArrayType(StringType, false))
+    checkEvaluation(ArrayInsert(
+      a, Literal(5), Literal.create("q", StringType)), Seq("b", "a", "c", null, "q")
+    )
+  }
+
   test("SPARK-42401: Array append of null value") {
     val a = Literal.create(Seq("b", "a", "c"), ArrayType(StringType, false))
     checkEvaluation(ArrayAppend(
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
index 94f813a2c6b..bd03d292820 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
@@ -5432,13 +5432,20 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSparkSession {
     )
   }
 
-  test("SPARK-42401: array_insert - insert null") {
+  test("SPARK-42401: array_insert - explicitly insert null") {
     checkAnswer(
       sql("select array_insert(array('b', 'a', 'c'), 2, cast(null as string))"),
       Seq(Row(Seq("b", null, "a", "c")))
     )
   }
 
+  test("SPARK-42401: array_insert - implicitly insert null") {
+    checkAnswer(
+      sql("select array_insert(array('b', 'a', 'c'), 5, 'q')"),
+      Seq(Row(Seq("b", "a", "c", null, "q")))
+    )
+  }
+
   test("SPARK-42401: array_append - append null") {
     checkAnswer(
       sql("select array_append(array('b', 'a', 'c'), cast(null as string))"),


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org