You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by al...@apache.org on 2023/11/15 11:28:22 UTC

(arrow-datafusion) branch main updated: Minor: Update JoinHashMap example to make it clearer (#8154)

This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git


The following commit(s) were added to refs/heads/main by this push:
     new 77a63260d1 Minor: Update JoinHashMap example to make it clearer (#8154)
77a63260d1 is described below

commit 77a63260d19306c4e9235694d1f38eae2bab9ef6
Author: Andrew Lamb <an...@nerdnetworks.org>
AuthorDate: Wed Nov 15 06:28:17 2023 -0500

    Minor: Update JoinHashMap example to make it clearer (#8154)
---
 datafusion/physical-plan/src/joins/hash_join.rs    |  6 ++-
 .../physical-plan/src/joins/hash_join_utils.rs     | 55 +++++++++++-----------
 2 files changed, 32 insertions(+), 29 deletions(-)

diff --git a/datafusion/physical-plan/src/joins/hash_join.rs b/datafusion/physical-plan/src/joins/hash_join.rs
index da57fa07cc..7a08b56a6e 100644
--- a/datafusion/physical-plan/src/joins/hash_join.rs
+++ b/datafusion/physical-plan/src/joins/hash_join.rs
@@ -649,6 +649,8 @@ impl ExecutionPlan for HashJoinExec {
     }
 }
 
+/// Reads the left (build) side of the input, buffering it in memory, to build a
+/// hash table (`LeftJoinData`)
 async fn collect_left_input(
     partition: Option<usize>,
     random_state: RandomState,
@@ -842,7 +844,7 @@ impl RecordBatchStream for HashJoinStream {
 /// # Example
 ///
 /// For `LEFT.b1 = RIGHT.b2`:
-/// LEFT Table:
+/// LEFT (build) Table:
 /// ```text
 ///  a1  b1  c1
 ///  1   1   10
@@ -854,7 +856,7 @@ impl RecordBatchStream for HashJoinStream {
 ///  13   10  130
 /// ```
 ///
-/// RIGHT Table:
+/// RIGHT (probe) Table:
 /// ```text
 ///  a2   b2  c2
 ///  2    2   20
diff --git a/datafusion/physical-plan/src/joins/hash_join_utils.rs b/datafusion/physical-plan/src/joins/hash_join_utils.rs
index fecbf96f08..db65c8bf08 100644
--- a/datafusion/physical-plan/src/joins/hash_join_utils.rs
+++ b/datafusion/physical-plan/src/joins/hash_join_utils.rs
@@ -61,44 +61,45 @@ use hashbrown::HashSet;
 ///
 /// ``` text
 /// See the example below:
-/// Insert (1,1)
+///
+/// Insert (10,1)            <-- insert hash value 10 with row index 1
 /// map:
-/// ---------
-/// | 1 | 2 |
-/// ---------
+/// ----------
+/// | 10 | 2 |
+/// ----------
 /// next:
 /// ---------------------
 /// | 0 | 0 | 0 | 0 | 0 |
 /// ---------------------
-/// Insert (2,2)
+/// Insert (20,2)
 /// map:
-/// ---------
-/// | 1 | 2 |
-/// | 2 | 3 |
-/// ---------
+/// ----------
+/// | 10 | 2 |
+/// | 20 | 3 |
+/// ----------
 /// next:
 /// ---------------------
 /// | 0 | 0 | 0 | 0 | 0 |
 /// ---------------------
-/// Insert (1,3)
+/// Insert (10,3)           <-- collision! row index 3 has a hash value of 10 as well
 /// map:
-/// ---------
-/// | 1 | 4 |
-/// | 2 | 3 |
-/// ---------
+/// ----------
+/// | 10 | 4 |
+/// | 20 | 3 |
+/// ----------
 /// next:
 /// ---------------------
-/// | 0 | 0 | 0 | 2 | 0 |  <--- hash value 1 maps to 4,2 (which means indices values 3,1)
+/// | 0 | 0 | 0 | 2 | 0 |  <--- hash value 10 maps to 4,2 (which means indices values 3,1)
 /// ---------------------
-/// Insert (1,4)
+/// Insert (10,4)          <-- another collision! row index 4 ALSO has a hash value of 10
 /// map:
 /// ---------
-/// | 1 | 5 |
-/// | 2 | 3 |
+/// | 10 | 5 |
+/// | 20 | 3 |
 /// ---------
 /// next:
 /// ---------------------
-/// | 0 | 0 | 0 | 2 | 4 | <--- hash value 1 maps to 5,4,2 (which means indices values 4,3,1)
+/// | 0 | 0 | 0 | 2 | 4 | <--- hash value 10 maps to 5,4,2 (which means indices values 4,3,1)
 /// ---------------------
 /// ```
 pub struct JoinHashMap {
@@ -124,7 +125,7 @@ impl JoinHashMap {
 
 /// Trait defining methods that must be implemented by a hash map type to be used for joins.
 pub trait JoinHashMapType {
-    /// The type of list used to store the hash values.
+    /// The type of list used to store the next list
     type NextType: IndexMut<usize, Output = u64>;
     /// Extend with zero
     fn extend_zero(&mut self, len: usize);
@@ -201,15 +202,15 @@ impl fmt::Debug for JoinHashMap {
 /// Let's continue the example of `JoinHashMap` and then show how `PruningJoinHashMap` would
 /// handle the pruning scenario.
 ///
-/// Insert the pair (1,4) into the `PruningJoinHashMap`:
+/// Insert the pair (10,4) into the `PruningJoinHashMap`:
 /// map:
-/// ---------
-/// | 1 | 5 |
-/// | 2 | 3 |
-/// ---------
+/// ----------
+/// | 10 | 5 |
+/// | 20 | 3 |
+/// ----------
 /// list:
 /// ---------------------
-/// | 0 | 0 | 0 | 2 | 4 | <--- hash value 1 maps to 5,4,2 (which means indices values 4,3,1)
+/// | 0 | 0 | 0 | 2 | 4 | <--- hash value 10 maps to 5,4,2 (which means indices values 4,3,1)
 /// ---------------------
 ///
 /// Now, let's prune 3 rows from `PruningJoinHashMap`:
@@ -219,7 +220,7 @@ impl fmt::Debug for JoinHashMap {
 /// ---------
 /// list:
 /// ---------
-/// | 2 | 4 | <--- hash value 1 maps to 2 (5 - 3), 1 (4 - 3), NA (2 - 3) (which means indices values 1,0)
+/// | 2 | 4 | <--- hash value 10 maps to 2 (5 - 3), 1 (4 - 3), NA (2 - 3) (which means indices values 1,0)
 /// ---------
 ///
 /// After pruning, the | 2 | 3 | entry is deleted from `PruningJoinHashMap` since