You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by we...@apache.org on 2022/08/29 07:26:00 UTC
[spark] branch master updated: [SPARK-40247][SQL] Fix BitSet equality check

This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new 527ddece8fd [SPARK-40247][SQL] Fix BitSet equality check
527ddece8fd is described below

commit 527ddece8fdbe703dcd239401c97ddb2c6122182
Author: Peter Toth <pt...@cloudera.com>
AuthorDate: Mon Aug 29 15:25:39 2022 +0800

    [SPARK-40247][SQL] Fix BitSet equality check
    
    ### What changes were proposed in this pull request?
    Spark's `BitSet` doesn't implement `equals()` and `hashCode()` but it is used in `FileSourceScanExec` for bucket pruning.
    
    ### Why are the changes needed?
    Without proper equality check reuse issues can occur.
    
    ### Does this PR introduce _any_ user-facing change?
    No.
    
    ### How was this patch tested?
    Added new UT.
    
    Closes #37696 from peter-toth/SPARK-40247-fix-bitset-equals.
    
    Authored-by: Peter Toth <pt...@cloudera.com>
    Signed-off-by: Wenchen Fan <we...@databricks.com>
---
 .../org/apache/spark/util/collection/BitSet.scala  |  9 ++++++++
 .../scala/org/apache/spark/sql/SQLQuerySuite.scala | 25 ++++++++++++++++++++++
 2 files changed, 34 insertions(+)

diff --git a/core/src/main/scala/org/apache/spark/util/collection/BitSet.scala b/core/src/main/scala/org/apache/spark/util/collection/BitSet.scala
index 61386114997..6bb5058f5ed 100644
--- a/core/src/main/scala/org/apache/spark/util/collection/BitSet.scala
+++ b/core/src/main/scala/org/apache/spark/util/collection/BitSet.scala
@@ -250,4 +250,13 @@ class BitSet(numBits: Int) extends Serializable {
 
   /** Return the number of longs it would take to hold numBits. */
   private def bit2words(numBits: Int) = ((numBits - 1) >> 6) + 1
+
+  override def equals(other: Any): Boolean = other match {
+    case otherSet: BitSet => Arrays.equals(words, otherSet.words)
+    case _ => false
+  }
+
+  override def hashCode(): Int = {
+    Arrays.hashCode(words)
+  }
 }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
index cbd65ede054..0fa2c7195db 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
@@ -4003,6 +4003,31 @@ class SQLQuerySuite extends QueryTest with SharedSparkSession with AdaptiveSpark
     }
   }
 
+  test("SPARK-40247: Fix BitSet equals") {
+    withTable("td") {
+      testData
+        .withColumn("bucket", $"key" % 3)
+        .write
+        .mode(SaveMode.Overwrite)
+        .bucketBy(2, "bucket")
+        .format("parquet")
+        .saveAsTable("td")
+      val df = sql(
+        """
+          |SELECT t1.key, t2.key, t3.key
+          |FROM td AS t1
+          |JOIN td AS t2 ON t2.key = t1.key
+          |JOIN td AS t3 ON t3.key = t2.key
+          |WHERE t1.bucket = 1 AND t2.bucket = 1 AND t3.bucket = 1
+          |""".stripMargin)
+      df.collect()
+      val reusedExchanges = collect(df.queryExecution.executedPlan) {
+        case r: ReusedExchangeExec => r
+      }
+      assert(reusedExchanges.size == 1)
+    }
+  }
+
   test("SPARK-35331: Fix resolving original expression in RepartitionByExpression after aliased") {
     Seq("CLUSTER", "DISTRIBUTE").foreach { keyword =>
       Seq("a", "substr(a, 0, 3)").foreach { expr =>


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org