You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2020/12/18 23:17:39 UTC

[GitHub] [arrow] Dandandan commented on a change in pull request #8965: ARROW-10968: [Rust][DataFusion] Don't build hash table for right side of join

Dandandan commented on a change in pull request #8965:
URL: https://github.com/apache/arrow/pull/8965#discussion_r546142043



##########
File path: rust/datafusion/src/physical_plan/hash_join.rs
##########
@@ -423,71 +419,87 @@ fn build_batch(
 // (1, 0)     (1, 2)
 fn build_join_indexes(
     left: &JoinHashMap,
-    right: &JoinHashMap,
+    right: &RecordBatch,
     join_type: &JoinType,
+    on: &HashSet<String>,
 ) -> Result<Vec<(JoinIndex, JoinIndex)>> {
+    let keys_values = on
+        .iter()
+        .map(|name| Ok(col(name).evaluate(right)?.into_array(right.num_rows())))
+        .collect::<Result<Vec<_>>>()?;
+
+    let mut key = Vec::with_capacity(keys_values.len());
+
     match join_type {
         JoinType::Inner => {
-            // inner => key intersection
-            // unfortunately rust does not support intersection of map keys :(
-            let left_set: HashSet<Vec<u8>> = left.keys().cloned().collect();
-            let left_right: HashSet<Vec<u8>> = right.keys().cloned().collect();
-            let inner = left_set.intersection(&left_right);
-
             let mut indexes = Vec::new(); // unknown a prior size
-            for key in inner {
-                // the unwrap never happens by construction of the key
-                let left_indexes = left.get(key).unwrap();
-                let right_indexes = right.get(key).unwrap();
+
+            // Visit all of the right rows
+            for row in 0..right.num_rows() {
+                // Get the key and find it in the build index
+                create_key(&keys_values, row, &mut key)?;
+                let left_indexes = left.get(&key);
 
                 // for every item on the left and right with this key, add the respective pair
-                left_indexes.iter().for_each(|x| {
-                    right_indexes.iter().for_each(|y| {
-                        // on an inner join, left and right indices are present
-                        indexes.push((Some(*x), Some(*y)));
-                    })
+                left_indexes.unwrap_or(&vec![]).iter().for_each(|x| {
+                    // on an inner join, left and right indices are present
+                    indexes.push((Some(*x), Some((0, row))));
                 })
             }
             Ok(indexes)
         }
         JoinType::Left => {
-            // left => left keys
             let mut indexes = Vec::new(); // unknown a prior size
-            for (key, left_indexes) in left {
-                // for every item on the left and right with this key, add the respective pair
-                if let Some(right_indexes) = right.get(key) {
-                    left_indexes.iter().for_each(|x| {
-                        right_indexes.iter().for_each(|y| {
-                            // on an inner join, left and right indices are present
-                            indexes.push((Some(*x), Some(*y)));
+
+            // Keep track of which item is visited in the build input
+            // TODO: this can be stored more efficiently with a marker
+            let mut is_visited = HashSet::new();
+
+            // First visit all of the rows
+            for row in 0..right.num_rows() {
+                create_key(&keys_values, row, &mut key)?;
+                // the unwrap never happens by construction of the key
+                let left_indexes = left.get(&key);
+
+                match left_indexes {
+                    Some(indices) => {
+                        is_visited.insert(key.clone());
+
+                        indices.iter().for_each(|x| {
+                            indexes.push((Some(*x), Some((0, row))));
                         })
-                    })
-                } else {
-                    // key not on the right => push Nones
-                    left_indexes.iter().for_each(|x| {

Review comment:
       Isn't this wrong already? Shouldn't it visit all batches before adding nulls for the left side?




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org