You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by al...@apache.org on 2023/11/15 11:28:22 UTC
(arrow-datafusion) branch main updated: Minor: Update JoinHashMap example to make it clearer (#8154)
This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new 77a63260d1 Minor: Update JoinHashMap example to make it clearer (#8154)
77a63260d1 is described below
commit 77a63260d19306c4e9235694d1f38eae2bab9ef6
Author: Andrew Lamb <an...@nerdnetworks.org>
AuthorDate: Wed Nov 15 06:28:17 2023 -0500
Minor: Update JoinHashMap example to make it clearer (#8154)
---
datafusion/physical-plan/src/joins/hash_join.rs | 6 ++-
.../physical-plan/src/joins/hash_join_utils.rs | 55 +++++++++++-----------
2 files changed, 32 insertions(+), 29 deletions(-)
diff --git a/datafusion/physical-plan/src/joins/hash_join.rs b/datafusion/physical-plan/src/joins/hash_join.rs
index da57fa07cc..7a08b56a6e 100644
--- a/datafusion/physical-plan/src/joins/hash_join.rs
+++ b/datafusion/physical-plan/src/joins/hash_join.rs
@@ -649,6 +649,8 @@ impl ExecutionPlan for HashJoinExec {
}
}
+/// Reads the left (build) side of the input, buffering it in memory, to build a
+/// hash table (`LeftJoinData`)
async fn collect_left_input(
partition: Option<usize>,
random_state: RandomState,
@@ -842,7 +844,7 @@ impl RecordBatchStream for HashJoinStream {
/// # Example
///
/// For `LEFT.b1 = RIGHT.b2`:
-/// LEFT Table:
+/// LEFT (build) Table:
/// ```text
/// a1 b1 c1
/// 1 1 10
@@ -854,7 +856,7 @@ impl RecordBatchStream for HashJoinStream {
/// 13 10 130
/// ```
///
-/// RIGHT Table:
+/// RIGHT (probe) Table:
/// ```text
/// a2 b2 c2
/// 2 2 20
diff --git a/datafusion/physical-plan/src/joins/hash_join_utils.rs b/datafusion/physical-plan/src/joins/hash_join_utils.rs
index fecbf96f08..db65c8bf08 100644
--- a/datafusion/physical-plan/src/joins/hash_join_utils.rs
+++ b/datafusion/physical-plan/src/joins/hash_join_utils.rs
@@ -61,44 +61,45 @@ use hashbrown::HashSet;
///
/// ``` text
/// See the example below:
-/// Insert (1,1)
+///
+/// Insert (10,1) <-- insert hash value 10 with row index 1
/// map:
-/// ---------
-/// | 1 | 2 |
-/// ---------
+/// ----------
+/// | 10 | 2 |
+/// ----------
/// next:
/// ---------------------
/// | 0 | 0 | 0 | 0 | 0 |
/// ---------------------
-/// Insert (2,2)
+/// Insert (20,2)
/// map:
-/// ---------
-/// | 1 | 2 |
-/// | 2 | 3 |
-/// ---------
+/// ----------
+/// | 10 | 2 |
+/// | 20 | 3 |
+/// ----------
/// next:
/// ---------------------
/// | 0 | 0 | 0 | 0 | 0 |
/// ---------------------
-/// Insert (1,3)
+/// Insert (10,3) <-- collision! row index 3 has a hash value of 10 as well
/// map:
-/// ---------
-/// | 1 | 4 |
-/// | 2 | 3 |
-/// ---------
+/// ----------
+/// | 10 | 4 |
+/// | 20 | 3 |
+/// ----------
/// next:
/// ---------------------
-/// | 0 | 0 | 0 | 2 | 0 | <--- hash value 1 maps to 4,2 (which means indices values 3,1)
+/// | 0 | 0 | 0 | 2 | 0 | <--- hash value 10 maps to 4,2 (which means indices values 3,1)
/// ---------------------
-/// Insert (1,4)
+/// Insert (10,4) <-- another collision! row index 4 ALSO has a hash value of 10
/// map:
/// ---------
-/// | 1 | 5 |
-/// | 2 | 3 |
+/// | 10 | 5 |
+/// | 20 | 3 |
/// ---------
/// next:
/// ---------------------
-/// | 0 | 0 | 0 | 2 | 4 | <--- hash value 1 maps to 5,4,2 (which means indices values 4,3,1)
+/// | 0 | 0 | 0 | 2 | 4 | <--- hash value 10 maps to 5,4,2 (which means indices values 4,3,1)
/// ---------------------
/// ```
pub struct JoinHashMap {
@@ -124,7 +125,7 @@ impl JoinHashMap {
/// Trait defining methods that must be implemented by a hash map type to be used for joins.
pub trait JoinHashMapType {
- /// The type of list used to store the hash values.
+ /// The type of list used to store the next list
type NextType: IndexMut<usize, Output = u64>;
/// Extend with zero
fn extend_zero(&mut self, len: usize);
@@ -201,15 +202,15 @@ impl fmt::Debug for JoinHashMap {
/// Let's continue the example of `JoinHashMap` and then show how `PruningJoinHashMap` would
/// handle the pruning scenario.
///
-/// Insert the pair (1,4) into the `PruningJoinHashMap`:
+/// Insert the pair (10,4) into the `PruningJoinHashMap`:
/// map:
-/// ---------
-/// | 1 | 5 |
-/// | 2 | 3 |
-/// ---------
+/// ----------
+/// | 10 | 5 |
+/// | 20 | 3 |
+/// ----------
/// list:
/// ---------------------
-/// | 0 | 0 | 0 | 2 | 4 | <--- hash value 1 maps to 5,4,2 (which means indices values 4,3,1)
+/// | 0 | 0 | 0 | 2 | 4 | <--- hash value 10 maps to 5,4,2 (which means indices values 4,3,1)
/// ---------------------
///
/// Now, let's prune 3 rows from `PruningJoinHashMap`:
@@ -219,7 +220,7 @@ impl fmt::Debug for JoinHashMap {
/// ---------
/// list:
/// ---------
-/// | 2 | 4 | <--- hash value 1 maps to 2 (5 - 3), 1 (4 - 3), NA (2 - 3) (which means indices values 1,0)
+/// | 2 | 4 | <--- hash value 10 maps to 2 (5 - 3), 1 (4 - 3), NA (2 - 3) (which means indices values 1,0)
/// ---------
///
/// After pruning, the | 2 | 3 | entry is deleted from `PruningJoinHashMap` since