You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by am...@apache.org on 2022/04/20 15:24:36 UTC
[arrow] branch master updated: ARROW-16074: [Docs] Document joins
This is an automated email from the ASF dual-hosted git repository.
amolina pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 1f43abc933 ARROW-16074: [Docs] Document joins
1f43abc933 is described below
commit 1f43abc93318ce0bd7b3650ebbd29372809970be
Author: Alessandro Molina <am...@turbogears.org>
AuthorDate: Wed Apr 20 17:24:27 2022 +0200
ARROW-16074: [Docs] Document joins
Closes #12936 from amol-/ARROW-16074
Authored-by: Alessandro Molina <am...@turbogears.org>
Signed-off-by: Alessandro Molina <am...@turbogears.org>
---
docs/source/python/api/dataset.rst | 1 +
docs/source/python/compute.rst | 129 ++++++++++++++++++++++++++++++++++++-
python/pyarrow/_dataset.pyx | 96 +++++++++++++--------------
3 files changed, 177 insertions(+), 49 deletions(-)
diff --git a/docs/source/python/api/dataset.rst b/docs/source/python/api/dataset.rst
index ab00adf55c..fdf41b15f9 100644
--- a/docs/source/python/api/dataset.rst
+++ b/docs/source/python/api/dataset.rst
@@ -69,3 +69,4 @@ Classes
TaggedRecordBatch
Scanner
Expression
+ InMemoryDataset
diff --git a/docs/source/python/compute.rst b/docs/source/python/compute.rst
index f6fc37a379..6099d50219 100644
--- a/docs/source/python/compute.rst
+++ b/docs/source/python/compute.rst
@@ -165,4 +165,131 @@ Following is a list of all supported grouped aggregation functions.
You can use them with or without the ``"hash_"`` prefix.
.. arrow-computefuncs::
- :kind: hash_aggregate
\ No newline at end of file
+ :kind: hash_aggregate
+
+.. _py-joins:
+
+Table and Dataset Joins
+=======================
+
+Both :class:`.Table` and :class:`.Dataset` support
+join operations through :meth:`.Table.join`
+and :meth:`.Dataset.join` methods.
+
+The methods accept a right table or dataset that will
+be joined to the initial one and one or more keys that
+should be used from the two entities to perform the join.
+
+By default a ``left outer join`` is performed, but it's possible
+to ask for any of the supported join types:
+
+* left semi
+* right semi
+* left anti
+* right anti
+* inner
+* left outer
+* right outer
+* full outer
+
+A basic join can be performed just by providing a table and a key
+on which the join should be performed:
+
+.. code-block:: python
+
+ import pyarrow as pa
+
+ table1 = pa.table({'id': [1, 2, 3],
+ 'year': [2020, 2022, 2019]})
+
+ table2 = pa.table({'id': [3, 4],
+ 'n_legs': [5, 100],
+ 'animal': ["Brittle stars", "Centipede"]})
+
+ joined_table = table1.join(table2, keys="id")
+
+The result will be a new table created by joining ``table1`` with
+``table2`` on the ``id`` key with a ``left outer join``::
+
+ pyarrow.Table
+ id: int64
+ year: int64
+ n_legs: int64
+ animal: string
+ ----
+ id: [[3,1,2]]
+ year: [[2019,2020,2022]]
+ n_legs: [[5,null,null]]
+ animal: [["Brittle stars",null,null]]
+
+We can perform additional types of joins, like ``full outer join``, by
+passing them to the ``join_type`` argument:
+
+.. code-block:: python
+
+ table1.join(table2, keys='id', join_type="full outer")
+
+In that case the result would be::
+
+ pyarrow.Table
+ id: int64
+ year: int64
+ n_legs: int64
+ animal: string
+ ----
+ id: [[3,1,2],[4]]
+ year: [[2019,2020,2022],[null]]
+ n_legs: [[5,null,null],[100]]
+ animal: [["Brittle stars",null,null],["Centipede"]]
+
+It's also possible to provide additional join keys, so that the
+join happens on two keys instead of one. For example we can add
+a ``year`` column to ``table2`` so that we can join on ``('id', 'year')``:
+
+.. code-block::
+
+ table2_withyear = table2.append_column("year", pa.array([2019, 2022]))
+ table1.join(table2_withyear, keys=["id", "year"])
+
+The result will be a table where only entries with ``id=3`` and ``year=2019``
+have data, the rest will be ``null``::
+
+ pyarrow.Table
+ id: int64
+ year: int64
+ animal: string
+ n_legs: int64
+ ----
+ id: [[3,1,2]]
+ year: [[2019,2020,2022]]
+ animal: [["Brittle stars",null,null]]
+ n_legs: [[5,null,null]]
+
+The same capabilities are available for :meth:`.Dataset.join` too, so you can
+take two datasets and join them:
+
+.. code-block::
+
+ import pyarrow.dataset as ds
+
+ ds1 = ds.dataset(table1)
+ ds2 = ds.dataset(table2)
+
+ joined_ds = ds1.join(ds2, keys="id")
+
+The resulting dataset will be an :class:`.InMemoryDataset` containing the joined data::
+
+ >>> joined_ds.head(5)
+
+ pyarrow.Table
+ id: int64
+ year: int64
+ animal: string
+ n_legs: int64
+ ----
+ id: [[3,1,2]]
+ year: [[2019,2020,2022]]
+ animal: [["Brittle stars",null,null]]
+ n_legs: [[5,null,null]]
+
+
diff --git a/python/pyarrow/_dataset.pyx b/python/pyarrow/_dataset.pyx
index 1280e5aa8c..3a4d3ad4c3 100644
--- a/python/pyarrow/_dataset.pyx
+++ b/python/pyarrow/_dataset.pyx
@@ -356,6 +356,54 @@ cdef class Dataset(_Weakrefable):
"""The common schema of the full Dataset"""
return pyarrow_wrap_schema(self.dataset.schema())
+ def join(self, right_dataset, keys, right_keys=None, join_type="left outer",
+ left_suffix=None, right_suffix=None, coalesce_keys=True,
+ use_threads=True):
+ """
+ Perform a join between this dataset and another one.
+
+ Result of the join will be a new dataset, where further
+ operations can be applied.
+
+ Parameters
+ ----------
+ right_dataset : dataset
+ The dataset to join to the current one, acting as the right dataset
+ in the join operation.
+ keys : str or list[str]
+ The columns from current dataset that should be used as keys
+ of the join operation left side.
+ right_keys : str or list[str], default None
+ The columns from the right_dataset that should be used as keys
+ on the join operation right side.
+ When ``None`` use the same key names as the left dataset.
+ join_type : str, default "left outer"
+ The kind of join that should be performed, one of
+ ("left semi", "right semi", "left anti", "right anti",
+ "inner", "left outer", "right outer", "full outer")
+ left_suffix : str, default None
+ Which suffix to add to the left column names. This prevents confusion
+ when the columns in left and right datasets have colliding names.
+ right_suffix : str, default None
+ Which suffix to add to the right column names. This prevents confusion
+ when the columns in left and right datasets have colliding names.
+ coalesce_keys : bool, default True
+ If the duplicated keys should be omitted from one of the sides
+ in the join result.
+ use_threads : bool, default True
+ Whether to use multithreading or not.
+
+ Returns
+ -------
+ InMemoryDataset
+ """
+ if right_keys is None:
+ right_keys = keys
+ return _pc()._exec_plan._perform_join(join_type, self, keys, right_dataset, right_keys,
+ left_suffix=left_suffix, right_suffix=right_suffix,
+ use_threads=use_threads, coalesce_keys=coalesce_keys,
+ output_type=InMemoryDataset)
+
cdef class InMemoryDataset(Dataset):
"""
@@ -623,54 +671,6 @@ cdef class FileSystemDataset(Dataset):
"""The FileFormat of this source."""
return FileFormat.wrap(self.filesystem_dataset.format())
- def join(self, right_dataset, keys, right_keys=None, join_type="left outer",
- left_suffix=None, right_suffix=None, coalesce_keys=True,
- use_threads=True):
- """
- Perform a join between this dataset and another one.
-
- Result of the join will be a new dataset, where further
- operations can be applied.
-
- Parameters
- ----------
- right_dataset : dataset
- The dataset to join to the current one, acting as the right dataset
- in the join operation.
- keys : str or list[str]
- The columns from current dataset that should be used as keys
- of the join operation left side.
- right_keys : str or list[str], default None
- The columns from the right_dataset that should be used as keys
- on the join operation right side.
- When ``None`` use the same key names as the left dataset.
- join_type : str, default "left outer"
- The kind of join that should be performed, one of
- ("left semi", "right semi", "left anti", "right anti",
- "inner", "left outer", "right outer", "full outer")
- left_suffix : str, default None
- Which suffix to add to right column names. This prevents confusion
- when the columns in left and right datasets have colliding names.
- right_suffix : str, default None
- Which suffic to add to the left column names. This prevents confusion
- when the columns in left and right datasets have colliding names.
- coalesce_keys : bool, default True
- If the duplicated keys should be omitted from one of the sides
- in the join result.
- use_threads : bool, default True
- Whenever to use multithreading or not.
-
- Returns
- -------
- InMemoryDataset
- """
- if right_keys is None:
- right_keys = keys
- return _pc()._exec_plan._perform_join(join_type, self, keys, right_dataset, right_keys,
- left_suffix=left_suffix, right_suffix=right_suffix,
- use_threads=use_threads, coalesce_keys=coalesce_keys,
- output_type=InMemoryDataset)
-
cdef class FileWriteOptions(_Weakrefable):