You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by do...@apache.org on 2020/03/17 18:50:46 UTC

[spark] branch branch-3.0 updated: [SPARK-31171][SQL] size(null) should return null under ansi mode

This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.0 by this push:
     new b9371e3  [SPARK-31171][SQL] size(null) should return null under ansi mode
b9371e3 is described below

commit b9371e38abbc351106453b14172d6919be0eca82
Author: Wenchen Fan <we...@databricks.com>
AuthorDate: Tue Mar 17 11:48:54 2020 -0700

    [SPARK-31171][SQL] size(null) should return null under ansi mode
    
    Make `size(null)` return null under ANSI mode, regardless of the `spark.sql.legacy.sizeOfNull` config.
    
    In https://github.com/apache/spark/pull/27834, we change the result of `size(null)` to be -1 to match the 2.4 behavior and avoid breaking changes.
    
    However, it's true that the "return -1" behavior is error-prone when being used with aggregate functions. The current ANSI mode controls a bunch of "better behaviors" like failing on overflow. We don't enable these "better behaviors" by default because they are too breaking. The "return null" behavior of `size(null)` is a good fit of the ANSI mode.
    
    No as ANSI mode is off by default.
    
    new tests
    
    Closes #27936 from cloud-fan/null.
    
    Authored-by: Wenchen Fan <we...@databricks.com>
    Signed-off-by: Dongjoon Hyun <do...@apache.org>
    (cherry picked from commit dc5ebc2d5b8122121d89a9175737bea95ae10126)
    Signed-off-by: Dongjoon Hyun <do...@apache.org>
---
 .../main/scala/org/apache/spark/sql/internal/SQLConf.scala   |  9 ++++++---
 .../catalyst/expressions/CollectionExpressionsSuite.scala    |  6 ++++++
 .../scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala | 12 ++++++++++++
 3 files changed, 24 insertions(+), 3 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
index e49593e..1331350 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
@@ -1940,8 +1940,8 @@ object SQLConf {
 
   val LEGACY_SIZE_OF_NULL = buildConf("spark.sql.legacy.sizeOfNull")
     .internal()
-    .doc("If it is set to true, size of null returns -1. This behavior was inherited from Hive. " +
-      "The size function returns null for null input if the flag is disabled.")
+    .doc(s"If it is set to false, or ${ANSI_ENABLED.key} is true, then size of null returns " +
+      "null. Otherwise, it returns -1, which was inherited from Hive.")
     .booleanConf
     .createWithDefault(true)
 
@@ -2759,7 +2759,10 @@ class SQLConf extends Serializable with Logging {
 
   def csvColumnPruning: Boolean = getConf(SQLConf.CSV_PARSER_COLUMN_PRUNING)
 
-  def legacySizeOfNull: Boolean = getConf(SQLConf.LEGACY_SIZE_OF_NULL)
+  def legacySizeOfNull: Boolean = {
+    // size(null) should return null under ansi mode.
+    getConf(SQLConf.LEGACY_SIZE_OF_NULL) && !getConf(ANSI_ENABLED)
+  }
 
   def isReplEagerEvalEnabled: Boolean = getConf(SQLConf.REPL_EAGER_EVAL_ENABLED)
 
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollectionExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollectionExpressionsSuite.scala
index 3cfc66f..173f248 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollectionExpressionsSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollectionExpressionsSuite.scala
@@ -74,6 +74,12 @@ class CollectionExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper
     withSQLConf(SQLConf.LEGACY_SIZE_OF_NULL.key -> "false") {
       testSize(sizeOfNull = null)
     }
+    // size(null) should return null under ansi mode.
+    withSQLConf(
+      SQLConf.LEGACY_SIZE_OF_NULL.key -> "true",
+      SQLConf.ANSI_ENABLED.key -> "true") {
+      testSize(sizeOfNull = null)
+    }
   }
 
   test("MapKeys/MapValues") {
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
index a613c33..c41eb98 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
@@ -490,6 +490,12 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSparkSession {
     withSQLConf(SQLConf.LEGACY_SIZE_OF_NULL.key -> "false") {
       testSizeOfArray(sizeOfNull = null)
     }
+    // size(null) should return null under ansi mode.
+    withSQLConf(
+      SQLConf.LEGACY_SIZE_OF_NULL.key -> "true",
+      SQLConf.ANSI_ENABLED.key -> "true") {
+      testSizeOfArray(sizeOfNull = null)
+    }
   }
 
   test("dataframe arrays_zip function") {
@@ -569,6 +575,12 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSparkSession {
     withSQLConf(SQLConf.LEGACY_SIZE_OF_NULL.key -> "false") {
       testSizeOfMap(sizeOfNull = null)
     }
+    // size(null) should return null under ansi mode.
+    withSQLConf(
+      SQLConf.LEGACY_SIZE_OF_NULL.key -> "true",
+      SQLConf.ANSI_ENABLED.key -> "true") {
+      testSizeOfMap(sizeOfNull = null)
+    }
   }
 
   test("map_keys/map_values function") {


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org