You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by jo...@apache.org on 2023/06/07 12:02:19 UTC

[arrow] branch main updated: GH-33980: [Docs][Python] Document DataFrame Interchange Protocol implementation and usage (#35835)

This is an automated email from the ASF dual-hosted git repository.

jorisvandenbossche pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new 1d75816204 GH-33980: [Docs][Python] Document DataFrame Interchange Protocol implementation and usage (#35835)
1d75816204 is described below

commit 1d758162044f806cd10d2e7a0953ea31f48a8594
Author: Alenka Frim <Al...@users.noreply.github.com>
AuthorDate: Wed Jun 7 14:02:08 2023 +0200

    GH-33980: [Docs][Python] Document DataFrame Interchange Protocol implementation and usage (#35835)
    
    _edit: just added something_
    * Closes: #33980
    
    Lead-authored-by: AlenkaF <fr...@gmail.com>
    Co-authored-by: Alenka Frim <Al...@users.noreply.github.com>
    Co-authored-by: Joris Van den Bossche <jo...@gmail.com>
    Signed-off-by: Joris Van den Bossche <jo...@gmail.com>
---
 docs/source/conf.py                          |   1 +
 docs/source/python/api/tables.rst            |   8 ++
 docs/source/python/index.rst                 |   1 +
 docs/source/python/interchange_protocol.rst  | 119 +++++++++++++++++++++++++++
 python/pyarrow/interchange/from_dataframe.py |  25 ++++++
 5 files changed, 154 insertions(+)

diff --git a/docs/source/conf.py b/docs/source/conf.py
index 19b0c353bd..8a05641525 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -79,6 +79,7 @@ extensions = [
 # Show members for classes in .. autosummary
 autodoc_default_options = {
     'members': None,
+    'special-members': '__dataframe__',
     'undoc-members': None,
     'show-inheritance': None,
     'inherited-members': None
diff --git a/docs/source/python/api/tables.rst b/docs/source/python/api/tables.rst
index eadf40cb75..ae9f5de127 100644
--- a/docs/source/python/api/tables.rst
+++ b/docs/source/python/api/tables.rst
@@ -46,6 +46,14 @@ Classes
    TableGroupBy
    RecordBatchReader
 
+Dataframe Interchange Protocol
+------------------------------
+
+.. autosummary::
+   :toctree: ../generated/
+
+   interchange.from_dataframe
+
 .. _api.tensor:
 
 Tensors
diff --git a/docs/source/python/index.rst b/docs/source/python/index.rst
index 77cfaef4a4..b80cbc7de5 100644
--- a/docs/source/python/index.rst
+++ b/docs/source/python/index.rst
@@ -47,6 +47,7 @@ files into Arrow structures.
    filesystems_deprecated
    numpy
    pandas
+   interchange_protocol
    timestamps
    orc
    csv
diff --git a/docs/source/python/interchange_protocol.rst b/docs/source/python/interchange_protocol.rst
new file mode 100644
index 0000000000..7784d78619
--- /dev/null
+++ b/docs/source/python/interchange_protocol.rst
@@ -0,0 +1,119 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements.  See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership.  The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License.  You may obtain a copy of the License at
+
+..   http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied.  See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+Dataframe Interchange Protocol
+==============================
+
+The interchange protocol is implemented for ``pa.Table`` and
+``pa.RecordBatch`` and is used to interchange data between
+PyArrow and other dataframe libraries that also have the
+protocol implemented. The data structures that are supported
+in the protocol are primitive data types plus the dictionary
+data type. The protocol also has missing data support and
+it supports chunking, meaning accessing the
+data in “batches” of rows.
+
+
+The Python dataframe interchange protocol is designed by the
+`Consortium for Python Data API Standards <https://data-apis.org/>`_
+in order to enable data interchange between dataframe
+libraries in the Python ecosystem. See more about the
+standard in the
+`protocol documentation <https://data-apis.org/dataframe-protocol/latest/index.html>`_.
+
+From pyarrow to other libraries: ``__dataframe__()`` method
+-----------------------------------------------------------
+
+The ``__dataframe__()`` method creates a new exchange object that
+the consumer library can take and construct an object of its own.
+
+.. code-block::
+
+    >>> import pyarrow as pa
+    >>> table = pa.table({"n_atendees": [100, 10, 1]})
+    >>> table.__dataframe__()
+    <pyarrow.interchange.dataframe._PyArrowDataFrame object at ...>
+
+This is meant to be used by the consumer library when calling
+the ``from_dataframe()`` function and is not meant to be used manually
+by the user.
+
+From other libraries to pyarrow: ``from_dataframe()``
+-----------------------------------------------------
+
+With the ``from_dataframe()`` function, we can construct a :class:`pyarrow.Table`
+from any dataframe object that implements the
+``__dataframe__()`` method via the dataframe interchange
+protocol.
+
+We can, for example, take a pandas dataframe and construct a
+pyarrow table with the use of the interchange protocol:
+
+.. code-block::
+
+    >>> import pyarrow
+    >>> from pyarrow.interchange import from_dataframe
+
+    >>> import pandas as pd
+    >>> df = pd.DataFrame({
+    ...         "n_atendees": [100, 10, 1],
+    ...         "country": ["Italy", "Spain", "Slovenia"],
+    ...     })
+    >>> df
+       n_atendees   country
+    0         100     Italy
+    1          10     Spain
+    2           1  Slovenia
+    >>> from_dataframe(df)
+    pyarrow.Table
+    n_atendees: int64
+    country: large_string
+    ----
+    n_atendees: [[100,10,1]]
+    country: [["Italy","Spain","Slovenia"]]
+
+We can do the same with a polars dataframe:
+
+.. code-block::
+
+    >>> import polars as pl
+    >>> from datetime import datetime
+    >>> arr = [datetime(2023, 5, 20, 10, 0),
+    ...        datetime(2023, 5, 20, 11, 0),
+    ...        datetime(2023, 5, 20, 13, 30)]
+    >>> df = pl.DataFrame({
+    ...          'Talk': ['About Polars','Intro into PyArrow','Coding in Rust'],
+    ...          'Time': arr,
+    ...      })
+    >>> df
+    shape: (3, 2)
+    ┌────────────────────┬─────────────────────┐
+    │ Talk               ┆ Time                │
+    │ ---                ┆ ---                 │
+    │ str                ┆ datetime[μs]        │
+    ╞════════════════════╪═════════════════════╡
+    │ About Polars       ┆ 2023-05-20 10:00:00 │
+    │ Intro into PyArrow ┆ 2023-05-20 11:00:00 │
+    │ Coding in Rust     ┆ 2023-05-20 13:30:00 │
+    └────────────────────┴─────────────────────┘
+    >>> from_dataframe(df)
+    pyarrow.Table
+    Talk: large_string
+    Time: timestamp[us]
+    ----
+    Talk: [["About Polars","Intro into PyArrow","Coding in Rust"]]
+    Time: [[2023-05-20 10:00:00.000000,2023-05-20 11:00:00.000000,2023-05-20 13:30:00.000000]]
diff --git a/python/pyarrow/interchange/from_dataframe.py b/python/pyarrow/interchange/from_dataframe.py
index 801d0dd452..1d41aa8d7e 100644
--- a/python/pyarrow/interchange/from_dataframe.py
+++ b/python/pyarrow/interchange/from_dataframe.py
@@ -74,6 +74,31 @@ def from_dataframe(df: DataFrameObject, allow_copy=True) -> pa.Table:
     Returns
     -------
     pa.Table
+
+    Examples
+    --------
+    >>> import pyarrow
+    >>> from pyarrow.interchange import from_dataframe
+
+    Convert a pandas dataframe to a pyarrow table:
+
+    >>> import pandas as pd
+    >>> df = pd.DataFrame({
+    ...         "n_atendees": [100, 10, 1],
+    ...         "country": ["Italy", "Spain", "Slovenia"],
+    ...     })
+    >>> df
+       n_atendees   country
+    0         100     Italy
+    1          10     Spain
+    2           1  Slovenia
+    >>> from_dataframe(df)
+    pyarrow.Table
+    n_atendees: int64
+    country: large_string
+    ----
+    n_atendees: [[100,10,1]]
+    country: [["Italy","Spain","Slovenia"]]
     """
     if isinstance(df, pa.Table):
         return df