You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ap...@apache.org on 2022/06/02 10:58:16 UTC
[arrow] branch master updated: ARROW-16685: [Python] Preserve order of columns in joins (#13281)
This is an automated email from the ASF dual-hosted git repository.
apitrou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 9618451b1a ARROW-16685: [Python] Preserve order of columns in joins (#13281)
9618451b1a is described below
commit 9618451b1ab2e2c79bcfd320dd47487bf62daac7
Author: Alessandro Molina <am...@turbogears.org>
AuthorDate: Thu Jun 2 12:58:10 2022 +0200
ARROW-16685: [Python] Preserve order of columns in joins (#13281)
Authored-by: Alessandro Molina <am...@turbogears.org>
Signed-off-by: Antoine Pitrou <an...@python.org>
---
python/pyarrow/_exec_plan.pyx | 12 +++++++++---
python/pyarrow/tests/test_table.py | 39 ++++++++++++++++++++++++++++++++++++++
2 files changed, 48 insertions(+), 3 deletions(-)
diff --git a/python/pyarrow/_exec_plan.pyx b/python/pyarrow/_exec_plan.pyx
index 753abe27cf..89e474f439 100644
--- a/python/pyarrow/_exec_plan.pyx
+++ b/python/pyarrow/_exec_plan.pyx
@@ -259,13 +259,19 @@ def _perform_join(join_type, left_operand not None, left_keys,
left_columns = []
elif join_type == "inner":
c_join_type = CJoinType_INNER
- right_columns = set(right_columns) - set(right_keys)
+ right_columns = [
+ col for col in right_columns if col not in right_keys_order
+ ]
elif join_type == "left outer":
c_join_type = CJoinType_LEFT_OUTER
- right_columns = set(right_columns) - set(right_keys)
+ right_columns = [
+ col for col in right_columns if col not in right_keys_order
+ ]
elif join_type == "right outer":
c_join_type = CJoinType_RIGHT_OUTER
- left_columns = set(left_columns) - set(left_keys)
+ left_columns = [
+ col for col in left_columns if col not in left_keys_order
+ ]
elif join_type == "full outer":
c_join_type = CJoinType_FULL_OUTER
else:
diff --git a/python/pyarrow/tests/test_table.py b/python/pyarrow/tests/test_table.py
index 81a8b27f4d..6474974b4f 100644
--- a/python/pyarrow/tests/test_table.py
+++ b/python/pyarrow/tests/test_table.py
@@ -2145,3 +2145,42 @@ def test_table_filter_expression():
"colB": [10, 20, 60, 20, 10],
"colVals": ["a", "b", "f", "B", "A"]
})
+
+
+@pytest.mark.dataset
+def test_table_join_many_columns():
+ t1 = pa.table({
+ "colA": [1, 2, 6],
+ "col2": ["a", "b", "f"]
+ })
+
+ t2 = pa.table({
+ "colB": [99, 2, 1],
+ "col3": ["Z", "B", "A"],
+ "col4": ["Z", "B", "A"],
+ "col5": ["Z", "B", "A"],
+ "col6": ["Z", "B", "A"],
+ "col7": ["Z", "B", "A"]
+ })
+
+ result = t1.join(t2, "colA", "colB")
+ assert result.combine_chunks() == pa.table({
+ "colA": [1, 2, 6],
+ "col2": ["a", "b", "f"],
+ "col3": ["A", "B", None],
+ "col4": ["A", "B", None],
+ "col5": ["A", "B", None],
+ "col6": ["A", "B", None],
+ "col7": ["A", "B", None]
+ })
+
+ result = t1.join(t2, "colA", "colB", join_type="full outer")
+ assert result.combine_chunks().sort_by("colA") == pa.table({
+ "colA": [1, 2, 6, 99],
+ "col2": ["a", "b", "f", None],
+ "col3": ["A", "B", None, "Z"],
+ "col4": ["A", "B", None, "Z"],
+ "col5": ["A", "B", None, "Z"],
+ "col6": ["A", "B", None, "Z"],
+ "col7": ["A", "B", None, "Z"],
+ })