You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by we...@apache.org on 2022/03/28 05:44:57 UTC
[spark] branch branch-3.3 updated: [SPARK-38623][SQL] Add more comments and tests for HashShuffleSpec

This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a commit to branch branch-3.3
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.3 by this push:
     new cc85b1e  [SPARK-38623][SQL] Add more comments and tests for HashShuffleSpec
cc85b1e is described below

commit cc85b1ee138eeb2ea9aca9c545d48ab2c5b49c1c
Author: Wenchen Fan <we...@databricks.com>
AuthorDate: Mon Mar 28 13:43:31 2022 +0800

    [SPARK-38623][SQL] Add more comments and tests for HashShuffleSpec
    
    ### What changes were proposed in this pull request?
    
    Add more comments and tests to explain the special handling of duplicated cluster keys.
    
    ### Why are the changes needed?
    
    improve code readability
    
    ### Does this PR introduce _any_ user-facing change?
    
    no
    
    ### How was this patch tested?
    
    existing tests.
    
    Closes #35937 from cloud-fan/join.
    
    Authored-by: Wenchen Fan <we...@databricks.com>
    Signed-off-by: Wenchen Fan <we...@databricks.com>
    (cherry picked from commit c0cb5bce6623e98fa5161c1e3e866e730de87fa5)
    Signed-off-by: Wenchen Fan <we...@databricks.com>
---
 .../apache/spark/sql/catalyst/plans/physical/partitioning.scala   | 7 ++++++-
 .../scala/org/apache/spark/sql/catalyst/ShuffleSpecSuite.scala    | 8 ++++++++
 2 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/physical/partitioning.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/physical/partitioning.scala
index 78d153c..e4ff14b 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/physical/partitioning.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/physical/partitioning.scala
@@ -516,6 +516,11 @@ case class HashShuffleSpec(
    * A sequence where each element is a set of positions of the hash partition key to the cluster
    * keys. For instance, if cluster keys are [a, b, b] and hash partition keys are [a, b], the
    * result will be [(0), (1, 2)].
+   *
+   * This is useful to check compatibility between two `HashShuffleSpec`s. If the cluster keys are
+   * [a, b, b] and [x, y, z] for the two join children, and the hash partition keys are
+   * [a, b] and [x, z], they are compatible. With the positions, we can do the compatibility check
+   * by looking at if the positions of hash partition keys from two sides have overlapping.
    */
   lazy val hashKeyPositions: Seq[mutable.BitSet] = {
     val distKeyToPos = mutable.Map.empty[Expression, mutable.BitSet]
@@ -533,7 +538,7 @@ case class HashShuffleSpec(
       //  1. both distributions have the same number of clustering expressions
       //  2. both partitioning have the same number of partitions
       //  3. both partitioning have the same number of expressions
-      //  4. each pair of expression from both has overlapping positions in their
+      //  4. each pair of partitioning expression from both sides has overlapping positions in their
       //     corresponding distributions.
       distribution.clustering.length == otherDistribution.clustering.length &&
       partitioning.numPartitions == otherPartitioning.numPartitions &&
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/ShuffleSpecSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/ShuffleSpecSuite.scala
index 74ec949..7e11d4f 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/ShuffleSpecSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/ShuffleSpecSuite.scala
@@ -92,6 +92,14 @@ class ShuffleSpecSuite extends SparkFunSuite with SQLHelper {
     )
 
     checkCompatible(
+      HashShuffleSpec(HashPartitioning(Seq($"a", $"b"), 10),
+        ClusteredDistribution(Seq($"a", $"b", $"b"))),
+      HashShuffleSpec(HashPartitioning(Seq($"a", $"d"), 10),
+        ClusteredDistribution(Seq($"a", $"c", $"d"))),
+      expected = true
+    )
+
+    checkCompatible(
       HashShuffleSpec(HashPartitioning(Seq($"a", $"b", $"a"), 10),
         ClusteredDistribution(Seq($"a", $"b", $"b"))),
       HashShuffleSpec(HashPartitioning(Seq($"a", $"c", $"a"), 10),

---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org