You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2022/06/13 12:06:06 UTC

[GitHub] [arrow-datafusion] alamb commented on a diff in pull request #2699: Sort preserving `SortMergeJoin`

alamb commented on code in PR #2699:
URL: https://github.com/apache/arrow-datafusion/pull/2699#discussion_r895637134


##########
datafusion/core/src/physical_plan/sort_merge_join.rs:
##########
@@ -1414,7 +1407,7 @@ mod tests {
             "| 3  | 5  | 9  | 20 | 5  | 80 |",
             "+----+----+----+----+----+----+",
         ];
-        assert_batches_sorted_eq!(expected, &batches);
+        assert_batches_eq!(expected, &batches);

Review Comment:
   I think it might be worth a note here explaining that the sort order of the output is important in case someone might try and change it in the future:
   
   ```suggestion
           // The output order is important as SMJ preserves sortedness
           assert_batches_eq!(expected, &batches);
   ```



##########
datafusion/core/src/physical_plan/sort_merge_join.rs:
##########
@@ -1489,7 +1482,7 @@ mod tests {
             "| 1  | 1  | 8  | 1  | 1  | 80 |",
             "+----+----+----+----+----+----+",
         ];
-        assert_batches_sorted_eq!(expected, &batches);
+        assert_batches_eq!(expected, &batches);

Review Comment:
   ```suggestion
           // The output order is important as SMJ preserves sortedness
           assert_batches_eq!(expected, &batches);
   ```



##########
datafusion/core/src/physical_plan/sort_merge_join.rs:
##########
@@ -1451,7 +1444,7 @@ mod tests {
             "| 2  | 2  | 9  | 2  | 2  | 80 |",
             "+----+----+----+----+----+----+",
         ];
-        assert_batches_sorted_eq!(expected, &batches);
+        assert_batches_eq!(expected, &batches);

Review Comment:
   ```suggestion
           // The output order is important as SMJ preserves sortedness
           assert_batches_eq!(expected, &batches);
   ```



##########
datafusion/core/src/physical_plan/sort_merge_join.rs:
##########
@@ -1619,7 +1611,7 @@ mod tests {
         assert_eq!(batches.len(), 2);
         assert_eq!(batches[0].num_rows(), 2);
         assert_eq!(batches[1].num_rows(), 1);
-        assert_batches_sorted_eq!(expected, &batches);
+        assert_batches_eq!(expected, &batches);

Review Comment:
   ```suggestion
           // The output order is important as SMJ preserves sortedness
           assert_batches_eq!(expected, &batches);
   ```
   



##########
datafusion/core/src/physical_plan/sort_merge_join.rs:
##########
@@ -1650,7 +1642,7 @@ mod tests {
             "| 3  | 7  | 9  |    |    |    |",
             "+----+----+----+----+----+----+",
         ];
-        assert_batches_sorted_eq!(expected, &batches);
+        assert_batches_eq!(expected, &batches);

Review Comment:
   ```suggestion
           // The output order is important as SMJ preserves sortedness
           assert_batches_eq!(expected, &batches);
   ```



##########
datafusion/core/src/physical_plan/sort_merge_join.rs:
##########
@@ -1774,7 +1766,7 @@ mod tests {
             "| 2  | 5  | 8  |",
             "+----+----+----+",
         ];
-        assert_batches_sorted_eq!(expected, &batches);
+        assert_batches_eq!(expected, &batches);

Review Comment:
   ```suggestion
           // The output order is important as SMJ preserves sortedness
           assert_batches_eq!(expected, &batches);
   ```



##########
datafusion/core/src/physical_plan/sort_merge_join.rs:
##########
@@ -1838,7 +1830,7 @@ mod tests {
             "| 1970-01-04 | 2022-04-26 | 1970-01-10 | 1970-01-21 | 2022-04-26 | 1970-03-22 |",
             "+------------+------------+------------+------------+------------+------------+",
         ];
-        assert_batches_sorted_eq!(expected, &batches);
+        assert_batches_eq!(expected, &batches);

Review Comment:
   ```suggestion
           // The output order is important as SMJ preserves sortedness
           assert_batches_eq!(expected, &batches);
   ```



##########
datafusion/core/src/physical_plan/sort_merge_join.rs:
##########
@@ -1743,7 +1735,7 @@ mod tests {
             "| 5  | 7  | 11 |",
             "+----+----+----+",
         ];
-        assert_batches_sorted_eq!(expected, &batches);
+        assert_batches_eq!(expected, &batches);

Review Comment:
   ```suggestion
           // The output order is important as SMJ preserves sortedness
           assert_batches_eq!(expected, &batches);
   ```



##########
datafusion/core/src/physical_plan/sort_merge_join.rs:
##########
@@ -1571,14 +1564,13 @@ mod tests {
             "+----+----+----+----+----+----+",
             "| a1 | b2 | c1 | a1 | b2 | c2 |",
             "+----+----+----+----+----+----+",
-            "| 1  |    | 1  | 1  |    | 10 |",
-            "| 1  | 1  |    | 1  | 1  | 70 |",
-            "| 2  | 2  | 8  | 2  | 2  | 80 |",
             "| 2  | 2  | 9  | 2  | 2  | 80 |",
+            "| 2  | 2  | 8  | 2  | 2  | 80 |",
+            "| 1  | 1  |    | 1  | 1  | 70 |",
+            "| 1  |    | 1  | 1  |    | 10 |",
             "+----+----+----+----+----+----+",
         ];
-        //assert_eq!(batches.len(), 1);
-        assert_batches_sorted_eq!(expected, &batches);
+        assert_batches_eq!(expected, &batches);

Review Comment:
   ```suggestion
           // The output order is important as SMJ preserves sortedness
           assert_batches_eq!(expected, &batches);
   ```



##########
datafusion/core/src/physical_plan/sort_merge_join.rs:
##########
@@ -1526,7 +1519,7 @@ mod tests {
             "| 2  | 2  | 9  | 2  | 2  | 80 |",
             "+----+----+----+----+----+----+",
         ];
-        assert_batches_sorted_eq!(expected, &batches);
+        assert_batches_eq!(expected, &batches);

Review Comment:
   ```suggestion
           // The output order is important as SMJ preserves sortedness
           assert_batches_eq!(expected, &batches);
   ```



##########
datafusion/core/src/physical_plan/sort_merge_join.rs:
##########
@@ -1805,7 +1797,7 @@ mod tests {
             "| 2 | 5 | 8 | 20 | 2 | 80 |",
             "+---+---+---+----+---+----+",
         ];
-        assert_batches_sorted_eq!(expected, &batches);
+        assert_batches_eq!(expected, &batches);

Review Comment:
   ```suggestion
           // The output order is important as SMJ preserves sortedness
           assert_batches_eq!(expected, &batches);
   ```



##########
datafusion/core/src/physical_plan/sort_merge_join.rs:
##########
@@ -1676,12 +1668,12 @@ mod tests {
             "+----+----+----+----+----+----+",
             "| a1 | b1 | c1 | a2 | b1 | c2 |",
             "+----+----+----+----+----+----+",
-            "|    |    |    | 30 | 6  | 90 |",
             "| 1  | 4  | 7  | 10 | 4  | 70 |",
             "| 2  | 5  | 8  | 20 | 5  | 80 |",
+            "|    |    |    | 30 | 6  | 90 |",
             "+----+----+----+----+----+----+",
         ];
-        assert_batches_sorted_eq!(expected, &batches);
+        assert_batches_eq!(expected, &batches);

Review Comment:
   ```suggestion
           // The output order is important as SMJ preserves sortedness
           assert_batches_eq!(expected, &batches);
   ```



##########
datafusion/core/src/physical_plan/sort_merge_join.rs:
##########
@@ -1871,6 +1863,223 @@ mod tests {
             "| 1970-01-01 | 2022-04-25 | 1970-01-01 | 1970-01-01 | 2022-04-25 | 1970-01-01 |",
             "+------------+------------+------------+------------+------------+------------+",
         ];
+        assert_batches_eq!(expected, &batches);

Review Comment:
   ```suggestion
           // The output order is important as SMJ preserves sortedness
           assert_batches_eq!(expected, &batches);
   ```



##########
datafusion/core/src/physical_plan/sort_merge_join.rs:
##########
@@ -1871,6 +1863,223 @@ mod tests {
             "| 1970-01-01 | 2022-04-25 | 1970-01-01 | 1970-01-01 | 2022-04-25 | 1970-01-01 |",
             "+------------+------------+------------+------------+------------+------------+",
         ];
+        assert_batches_eq!(expected, &batches);
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn join_left_sort_order() -> Result<()> {
+        let left = build_table(
+            ("a1", &vec![0, 1, 2, 3, 4, 5]),
+            ("b1", &vec![3, 4, 5, 6, 6, 7]),
+            ("c1", &vec![4, 5, 6, 7, 8, 9]),
+        );
+        let right = build_table(
+            ("a2", &vec![0, 10, 20, 30, 40]),
+            ("b2", &vec![2, 4, 6, 6, 8]),
+            ("c2", &vec![50, 60, 70, 80, 90]),
+        );
+        let on = vec![(
+            Column::new_with_schema("b1", &left.schema())?,
+            Column::new_with_schema("b2", &right.schema())?,
+        )];
+
+        let (_, batches) = join_collect(left, right, on, JoinType::Left).await?;
+        let expected = vec![
+            "+----+----+----+----+----+----+",
+            "| a1 | b1 | c1 | a2 | b2 | c2 |",
+            "+----+----+----+----+----+----+",
+            "| 0  | 3  | 4  |    |    |    |",
+            "| 1  | 4  | 5  | 10 | 4  | 60 |",
+            "| 2  | 5  | 6  |    |    |    |",
+            "| 3  | 6  | 7  | 20 | 6  | 70 |",
+            "| 3  | 6  | 7  | 30 | 6  | 80 |",
+            "| 4  | 6  | 8  | 20 | 6  | 70 |",
+            "| 4  | 6  | 8  | 30 | 6  | 80 |",
+            "| 5  | 7  | 9  |    |    |    |",
+            "+----+----+----+----+----+----+",
+        ];
+        assert_batches_eq!(expected, &batches);
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn join_right_sort_order() -> Result<()> {
+        let left = build_table(
+            ("a1", &vec![0, 1, 2, 3]),
+            ("b1", &vec![3, 4, 5, 7]),
+            ("c1", &vec![6, 7, 8, 9]),
+        );
+        let right = build_table(
+            ("a2", &vec![0, 10, 20, 30]),
+            ("b2", &vec![2, 4, 5, 6]),
+            ("c2", &vec![60, 70, 80, 90]),
+        );
+        let on = vec![(
+            Column::new_with_schema("b1", &left.schema())?,
+            Column::new_with_schema("b2", &right.schema())?,
+        )];
+
+        let (_, batches) = join_collect(left, right, on, JoinType::Right).await?;
+        let expected = vec![
+            "+----+----+----+----+----+----+",
+            "| a1 | b1 | c1 | a2 | b2 | c2 |",
+            "+----+----+----+----+----+----+",
+            "|    |    |    | 0  | 2  | 60 |",
+            "| 1  | 4  | 7  | 10 | 4  | 70 |",
+            "| 2  | 5  | 8  | 20 | 5  | 80 |",
+            "|    |    |    | 30 | 6  | 90 |",
+            "+----+----+----+----+----+----+",
+        ];
+        assert_batches_eq!(expected, &batches);
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn join_left_multiple_batches() -> Result<()> {

Review Comment:
   👍 



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org