You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by dh...@apache.org on 2021/08/10 06:50:26 UTC

[arrow-datafusion] branch master updated: Fix right, full join handling when having multiple non-matching rows at the left side (#845)

This is an automated email from the ASF dual-hosted git repository.

dheres pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git


The following commit(s) were added to refs/heads/master by this push:
     new 5cadc6a  Fix right, full join handling when having multiple non-matching rows at the left side (#845)
5cadc6a is described below

commit 5cadc6a45a7a58db8c1c9b2c4834953233d365e3
Author: Daniƫl Heres <da...@gmail.com>
AuthorDate: Tue Aug 10 08:50:19 2021 +0200

    Fix right, full join handling when having multiple non-matching rows at the left side (#845)
    
    * Fix right, full join handling
    
    * Remove comment
    
    * Clippy
    
    * Update datafusion/src/physical_plan/hash_join.rs
    
    Co-authored-by: Andrew Lamb <an...@nerdnetworks.org>
    
    * Fmt
    
    Co-authored-by: Andrew Lamb <an...@nerdnetworks.org>
---
 datafusion/src/physical_plan/hash_join.rs | 20 +++++++++-----------
 datafusion/tests/sql.rs                   |  6 ------
 2 files changed, 9 insertions(+), 17 deletions(-)

diff --git a/datafusion/src/physical_plan/hash_join.rs b/datafusion/src/physical_plan/hash_join.rs
index fa75437..9970824 100644
--- a/datafusion/src/physical_plan/hash_join.rs
+++ b/datafusion/src/physical_plan/hash_join.rs
@@ -737,6 +737,7 @@ fn build_join_indexes(
             for (row, hash_value) in hash_values.iter().enumerate() {
                 match left.0.get(*hash_value, |(hash, _)| *hash_value == *hash) {
                     Some((_, indices)) => {
+                        let mut no_match = true;
                         for &i in indices {
                             if equal_rows(
                                 i as usize,
@@ -745,9 +746,14 @@ fn build_join_indexes(
                                 &keys_values,
                             )? {
                                 left_indices.append_value(i)?;
-                            } else {
-                                left_indices.append_null()?;
+                                right_indices.append_value(row as u32)?;
+                                no_match = false;
                             }
+                        }
+                        // If no rows matched left, still must keep the right
+                        // with all nulls for left
+                        if no_match {
+                            left_indices.append_null()?;
                             right_indices.append_value(row as u32)?;
                         }
                     }
@@ -768,7 +774,7 @@ macro_rules! equal_rows_elem {
         let left_array = $l.as_any().downcast_ref::<$array_type>().unwrap();
         let right_array = $r.as_any().downcast_ref::<$array_type>().unwrap();
 
-        match (left_array.is_null($left), left_array.is_null($right)) {
+        match (left_array.is_null($left), right_array.is_null($right)) {
             (false, false) => left_array.value($left) == right_array.value($right),
             _ => false,
         }
@@ -1372,8 +1378,6 @@ mod tests {
     }
 
     #[tokio::test]
-    // Disable until https://github.com/apache/arrow-datafusion/issues/843 fixed
-    #[cfg(not(feature = "force_hash_collisions"))]
     async fn join_full_multi_batch() {
         let left = build_table(
             ("a1", &vec![1, 2, 3]),
@@ -1639,8 +1643,6 @@ mod tests {
     }
 
     #[tokio::test]
-    // Disable until https://github.com/apache/arrow-datafusion/issues/843 fixed
-    #[cfg(not(feature = "force_hash_collisions"))]
     async fn join_right_one() -> Result<()> {
         let left = build_table(
             ("a1", &vec![1, 2, 3]),
@@ -1677,8 +1679,6 @@ mod tests {
     }
 
     #[tokio::test]
-    // Disable until https://github.com/apache/arrow-datafusion/issues/843 fixed
-    #[cfg(not(feature = "force_hash_collisions"))]
     async fn partitioned_join_right_one() -> Result<()> {
         let left = build_table(
             ("a1", &vec![1, 2, 3]),
@@ -1716,8 +1716,6 @@ mod tests {
     }
 
     #[tokio::test]
-    // Disable until https://github.com/apache/arrow-datafusion/issues/843 fixed
-    #[cfg(not(feature = "force_hash_collisions"))]
     async fn join_full_one() -> Result<()> {
         let left = build_table(
             ("a1", &vec![1, 2, 3]),
diff --git a/datafusion/tests/sql.rs b/datafusion/tests/sql.rs
index 046e4f2..0c33bd4 100644
--- a/datafusion/tests/sql.rs
+++ b/datafusion/tests/sql.rs
@@ -1797,8 +1797,6 @@ async fn equijoin_left_and_condition_from_right() -> Result<()> {
 }
 
 #[tokio::test]
-// Disable until https://github.com/apache/arrow-datafusion/issues/843 fixed
-#[cfg(not(feature = "force_hash_collisions"))]
 async fn equijoin_right_and_condition_from_left() -> Result<()> {
     let mut ctx = create_join_context("t1_id", "t2_id")?;
     let sql =
@@ -1852,8 +1850,6 @@ async fn left_join() -> Result<()> {
 }
 
 #[tokio::test]
-// Disable until https://github.com/apache/arrow-datafusion/issues/843 fixed
-#[cfg(not(feature = "force_hash_collisions"))]
 async fn right_join() -> Result<()> {
     let mut ctx = create_join_context("t1_id", "t2_id")?;
     let equivalent_sql = [
@@ -1874,8 +1870,6 @@ async fn right_join() -> Result<()> {
 }
 
 #[tokio::test]
-// Disable until https://github.com/apache/arrow-datafusion/issues/843 fixed
-#[cfg(not(feature = "force_hash_collisions"))]
 async fn full_join() -> Result<()> {
     let mut ctx = create_join_context("t1_id", "t2_id")?;
     let equivalent_sql = [