You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by we...@apache.org on 2021/02/24 13:34:40 UTC

[spark] branch branch-3.1 updated: [SPARK-34515][SQL] Fix NPE if InSet contains null value during getPartitionsByFilter

This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a commit to branch branch-3.1
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.1 by this push:
     new 8daa317  [SPARK-34515][SQL] Fix NPE if InSet contains null value during getPartitionsByFilter
8daa317 is described below

commit 8daa317b77892af6ab6ff19f2147eb722fcc1d78
Author: ulysses-you <ul...@gmail.com>
AuthorDate: Wed Feb 24 21:32:19 2021 +0800

    [SPARK-34515][SQL] Fix NPE if InSet contains null value during getPartitionsByFilter
    
    ### What changes were proposed in this pull request?
    
    Skip null values when rewriting `InSet` to `>= and <=` in getPartitionsByFilter.
    
    ### Why are the changes needed?
    
    During partition pruning, Spark converts `InSet` to `>= and <=` if the number of its values exceeds `spark.sql.hive.metastorePartitionPruningInSetThreshold`. In this case, if the values contain a null, we get the following exception:
     
    ```
    java.lang.NullPointerException
     at org.apache.spark.unsafe.types.UTF8String.compareTo(UTF8String.java:1389)
     at org.apache.spark.unsafe.types.UTF8String.compareTo(UTF8String.java:50)
     at scala.math.LowPriorityOrderingImplicits$$anon$3.compare(Ordering.scala:153)
     at java.util.TimSort.countRunAndMakeAscending(TimSort.java:355)
     at java.util.TimSort.sort(TimSort.java:220)
     at java.util.Arrays.sort(Arrays.java:1438)
     at scala.collection.SeqLike.sorted(SeqLike.scala:659)
     at scala.collection.SeqLike.sorted$(SeqLike.scala:647)
     at scala.collection.AbstractSeq.sorted(Seq.scala:45)
     at org.apache.spark.sql.hive.client.Shim_v0_13.convert$1(HiveShim.scala:772)
     at org.apache.spark.sql.hive.client.Shim_v0_13.$anonfun$convertFilters$4(HiveShim.scala:826)
     at scala.collection.immutable.Stream.flatMap(Stream.scala:489)
     at org.apache.spark.sql.hive.client.Shim_v0_13.convertFilters(HiveShim.scala:826)
     at org.apache.spark.sql.hive.client.Shim_v0_13.getPartitionsByFilter(HiveShim.scala:848)
     at org.apache.spark.sql.hive.client.HiveClientImpl.$anonfun$getPartitionsByFilter$1(HiveClientImpl.scala:750)
    ```
    
    ### Does this PR introduce _any_ user-facing change?
    
    Yes, bug fix.
    
    ### How was this patch tested?
    
    Add test.
    
    Closes #31632 from ulysses-you/SPARK-34515.
    
    Authored-by: ulysses-you <ul...@gmail.com>
    Signed-off-by: Wenchen Fan <we...@databricks.com>
    (cherry picked from commit 999d3b89b6df14a5ccb94ffc2ffadb82964e9f7d)
    Signed-off-by: Wenchen Fan <we...@databricks.com>
---
 .../main/scala/org/apache/spark/sql/hive/client/HiveShim.scala    | 4 +++-
 .../scala/org/apache/spark/sql/hive/client/FiltersSuite.scala     | 8 ++++++++
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala
index ed08864..8ccb17c 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala
@@ -769,7 +769,9 @@ private[client] class Shim_v0_13 extends Shim_v0_12 {
 
       case InSet(child, values) if useAdvanced && values.size > inSetThreshold =>
         val dataType = child.dataType
-        val sortedValues = values.toSeq.sorted(TypeUtils.getInterpretedOrdering(dataType))
+        // Skipping null here is safe; see ExtractableLiterals for more details.
+        val sortedValues = values.filter(_ != null).toSeq
+          .sorted(TypeUtils.getInterpretedOrdering(dataType))
         convert(And(GreaterThanOrEqual(child, Literal(sortedValues.head, dataType)),
           LessThanOrEqual(child, Literal(sortedValues.last, dataType))))
 
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/FiltersSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/FiltersSuite.scala
index 12ed0e5..6962f9d 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/FiltersSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/FiltersSuite.scala
@@ -179,5 +179,13 @@ class FiltersSuite extends SparkFunSuite with Logging with PlanTest {
     }
   }
 
+  test("SPARK-34515: Fix NPE if InSet contains null value during getPartitionsByFilter") {
+    withSQLConf(SQLConf.HIVE_METASTORE_PARTITION_PRUNING_INSET_THRESHOLD.key -> "2") {
+      val filter = InSet(a("p", IntegerType), Set(null, 1, 2))
+      val converted = shim.convertFilters(testTable, Seq(filter), conf.sessionLocalTimeZone)
+      assert(converted == "(p >= 1 and p <= 2)")
+    }
+  }
+
   private def a(name: String, dataType: DataType) = AttributeReference(name, dataType)()
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org