You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ap...@apache.org on 2022/06/02 10:58:16 UTC

[arrow] branch master updated: ARROW-16685: [Python] Preserve order of columns in joins (#13281)

This is an automated email from the ASF dual-hosted git repository.

apitrou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 9618451b1a ARROW-16685: [Python] Preserve order of columns in joins (#13281)
9618451b1a is described below

commit 9618451b1ab2e2c79bcfd320dd47487bf62daac7
Author: Alessandro Molina <am...@turbogears.org>
AuthorDate: Thu Jun 2 12:58:10 2022 +0200

    ARROW-16685: [Python] Preserve order of columns in joins (#13281)
    
    Authored-by: Alessandro Molina <am...@turbogears.org>
    Signed-off-by: Antoine Pitrou <an...@python.org>
---
 python/pyarrow/_exec_plan.pyx      | 12 +++++++++---
 python/pyarrow/tests/test_table.py | 39 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 48 insertions(+), 3 deletions(-)

diff --git a/python/pyarrow/_exec_plan.pyx b/python/pyarrow/_exec_plan.pyx
index 753abe27cf..89e474f439 100644
--- a/python/pyarrow/_exec_plan.pyx
+++ b/python/pyarrow/_exec_plan.pyx
@@ -259,13 +259,19 @@ def _perform_join(join_type, left_operand not None, left_keys,
         left_columns = []
     elif join_type == "inner":
         c_join_type = CJoinType_INNER
-        right_columns = set(right_columns) - set(right_keys)
+        right_columns = [
+            col for col in right_columns if col not in right_keys_order
+        ]
     elif join_type == "left outer":
         c_join_type = CJoinType_LEFT_OUTER
-        right_columns = set(right_columns) - set(right_keys)
+        right_columns = [
+            col for col in right_columns if col not in right_keys_order
+        ]
     elif join_type == "right outer":
         c_join_type = CJoinType_RIGHT_OUTER
-        left_columns = set(left_columns) - set(left_keys)
+        left_columns = [
+            col for col in left_columns if col not in left_keys_order
+        ]
     elif join_type == "full outer":
         c_join_type = CJoinType_FULL_OUTER
     else:
diff --git a/python/pyarrow/tests/test_table.py b/python/pyarrow/tests/test_table.py
index 81a8b27f4d..6474974b4f 100644
--- a/python/pyarrow/tests/test_table.py
+++ b/python/pyarrow/tests/test_table.py
@@ -2145,3 +2145,42 @@ def test_table_filter_expression():
         "colB": [10, 20, 60, 20, 10],
         "colVals": ["a", "b", "f", "B", "A"]
     })
+
+
+@pytest.mark.dataset
+def test_table_join_many_columns():
+    t1 = pa.table({
+        "colA": [1, 2, 6],
+        "col2": ["a", "b", "f"]
+    })
+
+    t2 = pa.table({
+        "colB": [99, 2, 1],
+        "col3": ["Z", "B", "A"],
+        "col4": ["Z", "B", "A"],
+        "col5": ["Z", "B", "A"],
+        "col6": ["Z", "B", "A"],
+        "col7": ["Z", "B", "A"]
+    })
+
+    result = t1.join(t2, "colA", "colB")
+    assert result.combine_chunks() == pa.table({
+        "colA": [1, 2, 6],
+        "col2": ["a", "b", "f"],
+        "col3": ["A", "B", None],
+        "col4": ["A", "B", None],
+        "col5": ["A", "B", None],
+        "col6": ["A", "B", None],
+        "col7": ["A", "B", None]
+    })
+
+    result = t1.join(t2, "colA", "colB", join_type="full outer")
+    assert result.combine_chunks().sort_by("colA") == pa.table({
+        "colA": [1, 2, 6, 99],
+        "col2": ["a", "b", "f", None],
+        "col3": ["A", "B", None, "Z"],
+        "col4": ["A", "B", None, "Z"],
+        "col5": ["A", "B", None, "Z"],
+        "col6": ["A", "B", None, "Z"],
+        "col7": ["A", "B", None, "Z"],
+    })