Posted to commits@spark.apache.org by we...@apache.org on 2021/02/01 14:19:38 UTC

[spark] branch branch-3.1 updated: [SPARK-34233][SQL][3.1] FIX NPE for char padding in binary comparison

This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a commit to branch branch-3.1
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.1 by this push:
     new c1999fb  [SPARK-34233][SQL][3.1] FIX NPE for char padding in binary comparison
c1999fb is described below

commit c1999fba9dd206394666ee9cb15749139c0c1f67
Author: Kent Yao <ya...@apache.org>
AuthorDate: Mon Feb 1 14:19:07 2021 +0000

    [SPARK-34233][SQL][3.1] FIX NPE for char padding in binary comparison
    
    As mentioned at https://github.com/apache/spark/pull/31336#issuecomment-768786706, the PR https://github.com/apache/spark/pull/31336 was reverted from branch-3.1 due to test failures. This PR re-lands it with a fix.
    
    The test failure was caused by another patch missing from branch-3.1, so this PR waits for https://github.com/apache/spark/commit/fc3f22645e5c542e80a086d96da384feb6afe121 to be backported first.
    
    ### What changes were proposed in this pull request?
    
    We need to check whether the `lit` is null before calling `numChars`.
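    
    A minimal standalone sketch of that guard, using Spark's `UTF8String`
    directly (the `safeNumChars` helper is illustrative, not part of the
    patch):
    
        import org.apache.spark.unsafe.types.UTF8String
    
        // A foldable literal can evaluate to null, so numChars() must be
        // guarded; calling it unconditionally is what threw the NPE.
        def safeNumChars(str: UTF8String): Option[Int] =
          if (str == null) None else Some(str.numChars())
    
        safeNumChars(null)                         // None (previously: NPE)
        safeNumChars(UTF8String.fromString("ab"))  // Some(2)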
    
    ### Why are the changes needed?
    
    Fix an obvious NPE bug that occurs when a char column is compared with a null literal.
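    
    For illustration, these are the comparisons the new tests below exercise
    (a sketch; `parquet` here stands in for the suite's parameterized
    `$format`):
    
        // Queries of this shape hit the NPE during analysis before this fix:
        spark.sql("CREATE TABLE t(c CHAR(2)) USING parquet")
        spark.sql("INSERT INTO t VALUES ('a')")
        spark.sql("SELECT c = null FROM t").show()          // expected: null
        spark.sql("SELECT c IN ('e', null) FROM t").show()  // expected: null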
    
    ### Does this PR introduce _any_ user-facing change?
    
    no
    
    ### How was this patch tested?
    
    new tests
    
    Closes #31407 from yaooqinn/npe.
    
    Lead-authored-by: Kent Yao <ya...@apache.org>
    Co-authored-by: Max Gekk <ma...@gmail.com>
    Signed-off-by: Wenchen Fan <we...@databricks.com>
---
 .../spark/sql/catalyst/analysis/Analyzer.scala     | 22 +++++++----
 .../apache/spark/sql/CharVarcharTestSuite.scala    | 43 +++++++++++++++++++++-
 2 files changed, 55 insertions(+), 10 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
index fb95323a..6fd6901 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
@@ -3888,13 +3888,15 @@ object ApplyCharTypePadding extends Rule[LogicalPlan] {
           if attr.dataType == StringType && list.forall(_.foldable) =>
           CharVarcharUtils.getRawType(attr.metadata).flatMap {
             case CharType(length) =>
-              val literalCharLengths = list.map(_.eval().asInstanceOf[UTF8String].numChars())
+              val (nulls, literalChars) =
+                list.map(_.eval().asInstanceOf[UTF8String]).partition(_ == null)
+              val literalCharLengths = literalChars.map(_.numChars())
               val targetLen = (length +: literalCharLengths).max
               Some(i.copy(
                 value = addPadding(attr, length, targetLen),
                 list = list.zip(literalCharLengths).map {
                   case (lit, charLength) => addPadding(lit, charLength, targetLen)
-                }))
+                } ++ nulls.map(Literal.create(_, StringType))))
             case _ => None
           }.getOrElse(i)
 
@@ -3915,13 +3917,17 @@ object ApplyCharTypePadding extends Rule[LogicalPlan] {
       CharVarcharUtils.getRawType(attr.metadata).flatMap {
         case CharType(length) =>
           val str = lit.eval().asInstanceOf[UTF8String]
-          val stringLitLen = str.numChars()
-          if (length < stringLitLen) {
-            Some(Seq(StringRPad(attr, Literal(stringLitLen)), lit))
-          } else if (length > stringLitLen) {
-            Some(Seq(attr, StringRPad(lit, Literal(length))))
-          } else {
+          if (str == null) {
             None
+          } else {
+            val stringLitLen = str.numChars()
+            if (length < stringLitLen) {
+              Some(Seq(StringRPad(attr, Literal(stringLitLen)), lit))
+            } else if (length > stringLitLen) {
+              Some(Seq(attr, StringRPad(lit, Literal(length))))
+            } else {
+              None
+            }
           }
         case _ => None
       }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CharVarcharTestSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CharVarcharTestSuite.scala
index ff8820a..744757b 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/CharVarcharTestSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/CharVarcharTestSuite.scala
@@ -152,6 +152,22 @@ trait CharVarcharTestSuite extends QueryTest with SQLTestUtils {
     }
   }
 
+  test("SPARK-34233: char/varchar with null value for partitioned columns") {
+    Seq("CHAR(5)", "VARCHAR(5)").foreach { typ =>
+      withTable("t") {
+        sql(s"CREATE TABLE t(i STRING, c $typ) USING $format PARTITIONED BY (c)")
+        sql("INSERT INTO t VALUES ('1', null)")
+        checkPlainResult(spark.table("t"), typ, null)
+        sql("INSERT OVERWRITE t VALUES ('1', null)")
+        checkPlainResult(spark.table("t"), typ, null)
+        sql("INSERT OVERWRITE t PARTITION (c=null) VALUES ('1')")
+        checkPlainResult(spark.table("t"), typ, null)
+        sql("ALTER TABLE t DROP PARTITION(c=null)")
+        checkAnswer(spark.table("t"), Nil)
+      }
+    }
+  }
+
   test("char/varchar type values length check: partitioned columns of other types") {
     // DSV2 doesn't support DROP PARTITION yet.
     assume(format != "foo")
@@ -435,7 +451,8 @@ trait CharVarcharTestSuite extends QueryTest with SQLTestUtils {
         ("c1 IN ('a', 'b')", true),
         ("c1 = c2", true),
         ("c1 < c2", false),
-        ("c1 IN (c2)", true)))
+        ("c1 IN (c2)", true),
+        ("c1 <=> null", false)))
     }
   }
 
@@ -451,7 +468,29 @@ trait CharVarcharTestSuite extends QueryTest with SQLTestUtils {
         ("c1 IN ('a', 'b')", true),
         ("c1 = c2", true),
         ("c1 < c2", false),
-        ("c1 IN (c2)", true)))
+        ("c1 IN (c2)", true),
+        ("c1 <=> null", false)))
+    }
+  }
+
+  private def testNullConditions(df: DataFrame, conditions: Seq[String]): Unit = {
+    conditions.foreach { cond =>
+      checkAnswer(df.selectExpr(cond), Row(null))
+    }
+  }
+
+  test("SPARK-34233: char type comparison with null values") {
+    val conditions = Seq("c = null", "c IN ('e', null)", "c IN (null)")
+    withTable("t") {
+      sql(s"CREATE TABLE t(c CHAR(2)) USING $format")
+      sql("INSERT INTO t VALUES ('a')")
+      testNullConditions(spark.table("t"), conditions)
+    }
+
+    withTable("t") {
+      sql(s"CREATE TABLE t(i INT, c CHAR(2)) USING $format PARTITIONED BY (c)")
+      sql("INSERT INTO t VALUES (1, 'a')")
+      testNullConditions(spark.table("t"), conditions)
     }
   }
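
A note on the first Analyzer.scala hunk above: the IN-list fix partitions the
evaluated literals into nulls and non-nulls, pads only the non-nulls, and
re-appends the nulls unpadded. A standalone sketch of that technique (plain
Seq[String] stands in for the evaluated UTF8String values; the sample strings
and the CHAR length of 2 are illustrative):

    // Split out nulls so the length computation never dereferences them.
    val evaluated: Seq[String] = Seq("ab", null, "abcd")
    val (nulls, nonNulls) = evaluated.partition(_ == null)
    val lengths = nonNulls.map(_.length)    // safe: no nulls left here
    val targetLen = (2 +: lengths).max      // 2 = declared CHAR length
    val padded = nonNulls.zip(lengths).map {
      case (s, len) => s + " " * (targetLen - len)
    }
    val result = padded ++ nulls            // nulls rejoin the list unpadded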
 


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org