Posted to commits@arrow.apache.org by jo...@apache.org on 2023/01/13 10:10:53 UTC
[arrow] branch master updated: GH-33346: [Python] DataFrame Interchange Protocol for pyarrow Table (#14804)
This is an automated email from the ASF dual-hosted git repository.
jorisvandenbossche pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new a83cc852ad GH-33346: [Python] DataFrame Interchange Protocol for pyarrow Table (#14804)
a83cc852ad is described below
commit a83cc852addd4d6946ead700b0f8c95b6b14f406
Author: Alenka Frim <Al...@users.noreply.github.com>
AuthorDate: Fri Jan 13 11:10:42 2023 +0100
GH-33346: [Python] DataFrame Interchange Protocol for pyarrow Table (#14804)
This PR implements the Dataframe Interchange Protocol for `pyarrow.Table`.
See: https://data-apis.org/dataframe-protocol/latest/index.html
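For illustration, a minimal usage sketch of the two entry points this patch adds (hypothetical data):

    import pyarrow as pa
    import pyarrow.interchange as pi

    table = pa.table({"a": [1, 2, 3], "b": ["x", "y", None]})

    # Producer side: expose the table through the protocol.
    interchange_object = table.__dataframe__()
    assert interchange_object.num_rows() == 3

    # Consumer side: build a pyarrow.Table from any object implementing
    # __dataframe__ (a pa.Table is passed through unchanged).
    result = pi.from_dataframe(table)
    assert result.equals(table)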
Lead-authored-by: Alenka Frim <fr...@gmail.com>
Co-authored-by: Alenka Frim <Al...@users.noreply.github.com>
Co-authored-by: Joris Van den Bossche <jo...@gmail.com>
Signed-off-by: Joris Van den Bossche <jo...@gmail.com>
---
python/pyarrow/interchange/__init__.py | 20 +
python/pyarrow/interchange/buffer.py | 107 ++++
python/pyarrow/interchange/column.py | 527 +++++++++++++++++++
python/pyarrow/interchange/dataframe.py | 202 ++++++++
python/pyarrow/interchange/from_dataframe.py | 567 +++++++++++++++++++++
python/pyarrow/table.pxi | 30 ++
python/pyarrow/tests/interchange/__init__.py | 16 +
.../pyarrow/tests/interchange/test_conversion.py | 524 +++++++++++++++++++
.../tests/interchange/test_interchange_spec.py | 243 +++++++++
9 files changed, 2236 insertions(+)
diff --git a/python/pyarrow/interchange/__init__.py b/python/pyarrow/interchange/__init__.py
new file mode 100644
index 0000000000..7ebe59b499
--- /dev/null
+++ b/python/pyarrow/interchange/__init__.py
@@ -0,0 +1,20 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# flake8: noqa
+
+from .from_dataframe import from_dataframe
diff --git a/python/pyarrow/interchange/buffer.py b/python/pyarrow/interchange/buffer.py
new file mode 100644
index 0000000000..1f53779813
--- /dev/null
+++ b/python/pyarrow/interchange/buffer.py
@@ -0,0 +1,107 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from __future__ import annotations
+import enum
+
+import pyarrow as pa
+
+
+class DlpackDeviceType(enum.IntEnum):
+ """Integer enum for device type codes matching DLPack."""
+
+ CPU = 1
+ CUDA = 2
+ CPU_PINNED = 3
+ OPENCL = 4
+ VULKAN = 7
+ METAL = 8
+ VPI = 9
+ ROCM = 10
+
+
+class _PyArrowBuffer:
+ """
+ Data in the buffer is guaranteed to be contiguous in memory.
+
+ Note that there is no dtype attribute present; a buffer can be thought of
+ as simply a block of memory. However, if the column that the buffer is
+ attached to has a dtype that's supported by DLPack and ``__dlpack__`` is
+ implemented, then that dtype information will be contained in the return
+ value from ``__dlpack__``.
+
+ This distinction is useful to support both (a) data exchange via DLPack
+ on a buffer and (b) dtypes like variable-length strings, which do not have a
+ fixed number of bytes per element.
+ """
+
+ def __init__(self, x: pa.Buffer, allow_copy: bool = True) -> None:
+ """
+ Handle PyArrow Buffers.
+ """
+ self._x = x
+
+ @property
+ def bufsize(self) -> int:
+ """
+ Buffer size in bytes.
+ """
+ return self._x.size
+
+ @property
+ def ptr(self) -> int:
+ """
+ Pointer to start of the buffer as an integer.
+ """
+ return self._x.address
+
+ def __dlpack__(self):
+ """
+ Produce DLPack capsule (see array API standard).
+
+ Raises:
+ - TypeError : if the buffer contains unsupported dtypes.
+ - NotImplementedError : if DLPack support is not implemented
+
+ Useful for connecting to array libraries. Support is optional because
+ it's not completely trivial to implement for a Python-only library.
+ """
+ raise NotImplementedError("__dlpack__")
+
+ def __dlpack_device__(self) -> tuple[DlpackDeviceType, int | None]:
+ """
+ Device type and device ID for where the data in the buffer resides.
+ Uses device type codes matching DLPack.
+ Note: must be implemented even if ``__dlpack__`` is not.
+ """
+ if self._x.is_cpu:
+ return (DlpackDeviceType.CPU, None)
+ else:
+ raise NotImplementedError("__dlpack_device__")
+
+ def __repr__(self) -> str:
+ return (
+ "PyArrowBuffer(" +
+ str(
+ {
+ "bufsize": self.bufsize,
+ "ptr": self.ptr,
+ "device": self.__dlpack_device__()[0].name,
+ }
+ ) +
+ ")"
+ )
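A rough sketch of how the (private) _PyArrowBuffer wrapper above behaves on a plain CPU buffer; the bytes passed to pa.py_buffer are hypothetical:

    import pyarrow as pa
    from pyarrow.interchange.buffer import _PyArrowBuffer, DlpackDeviceType

    buf = _PyArrowBuffer(pa.py_buffer(b"\x01\x02\x03\x04"))
    assert buf.bufsize == 4              # size in bytes
    assert isinstance(buf.ptr, int)      # raw memory address
    # CPU buffers report the DLPack CPU device; __dlpack__ itself
    # raises NotImplementedError in this version.
    assert buf.__dlpack_device__() == (DlpackDeviceType.CPU, None)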
diff --git a/python/pyarrow/interchange/column.py b/python/pyarrow/interchange/column.py
new file mode 100644
index 0000000000..a9b8958616
--- /dev/null
+++ b/python/pyarrow/interchange/column.py
@@ -0,0 +1,527 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from __future__ import annotations
+
+import enum
+from typing import (
+ Any,
+ Dict,
+ Iterable,
+ Optional,
+ Tuple,
+)
+
+import sys
+if sys.version_info >= (3, 8):
+ from typing import TypedDict
+else:
+ from typing_extensions import TypedDict
+
+import pyarrow as pa
+import pyarrow.compute as pc
+from pyarrow.interchange.buffer import _PyArrowBuffer
+
+
+class DtypeKind(enum.IntEnum):
+ """
+ Integer enum for data types.
+
+ Attributes
+ ----------
+ INT : int
+ Matches to signed integer data type.
+ UINT : int
+ Matches to unsigned integer data type.
+ FLOAT : int
+ Matches to floating point data type.
+ BOOL : int
+ Matches to boolean data type.
+ STRING : int
+ Matches to string data type (UTF-8 encoded).
+ DATETIME : int
+ Matches to datetime data type.
+ CATEGORICAL : int
+ Matches to categorical data type.
+ """
+
+ INT = 0
+ UINT = 1
+ FLOAT = 2
+ BOOL = 20
+ STRING = 21 # UTF-8
+ DATETIME = 22
+ CATEGORICAL = 23
+
+
+Dtype = Tuple[DtypeKind, int, str, str] # see Column.dtype
+
+
+_PYARROW_KINDS = {
+ pa.int8(): (DtypeKind.INT, "c"),
+ pa.int16(): (DtypeKind.INT, "s"),
+ pa.int32(): (DtypeKind.INT, "i"),
+ pa.int64(): (DtypeKind.INT, "l"),
+ pa.uint8(): (DtypeKind.UINT, "C"),
+ pa.uint16(): (DtypeKind.UINT, "S"),
+ pa.uint32(): (DtypeKind.UINT, "I"),
+ pa.uint64(): (DtypeKind.UINT, "L"),
+ pa.float16(): (DtypeKind.FLOAT, "e"),
+ pa.float32(): (DtypeKind.FLOAT, "f"),
+ pa.float64(): (DtypeKind.FLOAT, "g"),
+ pa.bool_(): (DtypeKind.BOOL, "b"),
+ pa.string(): (DtypeKind.STRING, "u"),
+ pa.large_string(): (DtypeKind.STRING, "U"),
+}
+
+
+class ColumnNullType(enum.IntEnum):
+ """
+ Integer enum for null type representation.
+
+ Attributes
+ ----------
+ NON_NULLABLE : int
+ Non-nullable column.
+ USE_NAN : int
+ Use explicit float NaN value.
+ USE_SENTINEL : int
+ Sentinel value besides NaN.
+ USE_BITMASK : int
+ The bit is set/unset representing a null on a certain position.
+ USE_BYTEMASK : int
+ The byte is set/unset representing a null on a certain position.
+ """
+
+ NON_NULLABLE = 0
+ USE_NAN = 1
+ USE_SENTINEL = 2
+ USE_BITMASK = 3
+ USE_BYTEMASK = 4
+
+
+class ColumnBuffers(TypedDict):
+ # first element is a buffer containing the column data;
+ # second element is the data buffer's associated dtype
+ data: Tuple[_PyArrowBuffer, Dtype]
+
+ # first element is a buffer containing mask values indicating missing data;
+ # second element is the mask value buffer's associated dtype.
+ # None if the null representation is not a bit or byte mask
+ validity: Optional[Tuple[_PyArrowBuffer, Dtype]]
+
+ # first element is a buffer containing the offset values for
+ # variable-size binary data (e.g., variable-length strings);
+ # second element is the offsets buffer's associated dtype.
+ # None if the data buffer does not have an associated offsets buffer
+ offsets: Optional[Tuple[_PyArrowBuffer, Dtype]]
+
+
+class CategoricalDescription(TypedDict):
+ # whether the ordering of dictionary indices is semantically meaningful
+ is_ordered: bool
+ # whether a dictionary-style mapping of categorical values to other objects
+ # exists
+ is_dictionary: bool
+ # Python-level only (e.g. ``{int: str}``).
+ # None if not a dictionary-style categorical.
+ categories: Optional[_PyArrowColumn]
+
+
+class Endianness:
+ """Enum indicating the byte-order of a data-type."""
+
+ LITTLE = "<"
+ BIG = ">"
+ NATIVE = "="
+ NA = "|"
+
+
+class NoBufferPresent(Exception):
+ """Exception to signal that there is no requested buffer."""
+
+
+class _PyArrowColumn:
+ """
+ A column object, with only the methods and properties required by the
+ interchange protocol defined.
+
+ A column can contain one or more chunks. Each chunk can contain up to three
+ buffers - a data buffer, a mask buffer (depending on null representation),
+ and an offsets buffer (if variable-size binary; e.g., variable-length
+ strings).
+
+ TBD: Arrow has a separate "null" dtype, and has no separate mask concept.
+ Instead, it seems to use "children" for both columns with a bit mask,
+ and for nested dtypes. Unclear whether this is elegant or confusing.
+ This design requires checking the null representation explicitly.
+
+ The Arrow design requires checking:
+ 1. the ARROW_FLAG_NULLABLE (for sentinel values)
+ 2. if a column has two children, combined with one of those children
+ having a null dtype.
+
+ Making the mask concept explicit seems useful. One null dtype would
+ not be enough to cover both bit and byte masks, so that would mean
+ even more checking if we did it the Arrow way.
+
+ TBD: there's also the "chunk" concept here, which is implicit in Arrow as
+ multiple buffers per array (= column here). Semantically it may make
+ sense to have both: chunks were meant for example for lazy evaluation
+ of data which doesn't fit in memory, while multiple buffers per column
+ could also come from doing a selection operation on a single
+ contiguous buffer.
+
+ Given these concepts, one would expect chunks to be all of the same
+ size (say a 10,000 row dataframe could have 10 chunks of 1,000 rows),
+ while multiple buffers could have data-dependent lengths. Not an issue
+ in pandas if one column is backed by a single NumPy array, but in
+ Arrow it seems possible.
+ Are multiple chunks *and* multiple buffers per column necessary for
+ the purposes of this interchange protocol, or must producers either
+ reuse the chunk concept for this or copy the data?
+
+ Note: this Column object can only be produced by ``__dataframe__``, so
+ doesn't need its own version or ``__column__`` protocol.
+ """
+
+ def __init__(
+ self, column: pa.Array | pa.ChunkedArray, allow_copy: bool = True
+ ) -> None:
+ """
+ Handles PyArrow Arrays and ChunkedArrays.
+ """
+ # Store the column as a private attribute
+ if isinstance(column, pa.ChunkedArray):
+ if column.num_chunks == 1:
+ column = column.chunk(0)
+ else:
+ if not allow_copy:
+ raise RuntimeError(
+ "Chunks will be combined and a copy is required which "
+ "is forbidden by allow_copy=False"
+ )
+ column = column.combine_chunks()
+
+ self._allow_copy = allow_copy
+
+ if pa.types.is_boolean(column.type):
+ if not allow_copy:
+ raise RuntimeError(
+ "Boolean column will be casted to uint8 and a copy "
+ "is required which is forbidden by allow_copy=False"
+ )
+ self._dtype = self._dtype_from_arrowdtype(column.type, 8)
+ self._col = pc.cast(column, pa.uint8())
+ else:
+ self._col = column
+ dtype = self._col.type
+ try:
+ bit_width = dtype.bit_width
+ except ValueError:
+ # in case of variable-length strings, considered as an array
+ # of bytes (8 bits)
+ bit_width = 8
+ self._dtype = self._dtype_from_arrowdtype(dtype, bit_width)
+
+ def size(self) -> int:
+ """
+ Size of the column, in elements.
+
+ Corresponds to DataFrame.num_rows() if column is a single chunk;
+ equal to size of this current chunk otherwise.
+
+ Is a method rather than a property because it may cause a (potentially
+ expensive) computation for some dataframe implementations.
+ """
+ return len(self._col)
+
+ @property
+ def offset(self) -> int:
+ """
+ Offset of first element.
+
+ May be > 0 if using chunks; for example for a column with N chunks of
+ equal size M (only the last chunk may be shorter),
+ ``offset = n * M``, ``n = 0 .. N-1``.
+ """
+ return self._col.offset
+
+ @property
+ def dtype(self) -> Tuple[DtypeKind, int, str, str]:
+ """
+ Dtype description as a tuple ``(kind, bit-width, format string,
+ endianness)``.
+
+ Bit-width : the number of bits as an integer
+ Format string : data type description format string in Apache Arrow C
+ Data Interface format.
+ Endianness : currently only native endianness (``=``) is supported
+
+ Notes:
+ - Kind specifiers are aligned with DLPack where possible (hence the
+ jump to 20, leaving enough room for future extensions)
+ - Masks must be specified as boolean with either bit width 1 (for
+ bit masks) or 8 (for byte masks).
+ - Dtype width in bits was preferred over bytes
+ - Endianness isn't too useful, but included now in case in the
+ future we need to support non-native endianness
+ - Went with Apache Arrow format strings over NumPy format strings
+ because they're more complete from a dataframe perspective
+ - Format strings are mostly useful for datetime specification, and
+ for categoricals.
+ - For categoricals, the format string describes the type of the
+ categorical in the data buffer. In case of a separate encoding of
+ the categorical (e.g. an integer to string mapping), this can
+ be derived from ``self.describe_categorical``.
+ - Data types not included: complex, Arrow-style null, binary,
+ decimal, and nested (list, struct, map, union) dtypes.
+ """
+ return self._dtype
+
+ def _dtype_from_arrowdtype(
+ self, dtype: pa.DataType, bit_width: int
+ ) -> Tuple[DtypeKind, int, str, str]:
+ """
+ See `self.dtype` for details.
+ """
+ # Note: 'c' (complex) not handled yet (not in array spec v1).
+ # 'b', 'B' (bytes), 'S', 'a' (old-style string), 'V' (void)
+ # not handled; datetime and timedelta both map to datetime
+ # (is timedelta handled?)
+
+ if pa.types.is_timestamp(dtype):
+ kind = DtypeKind.DATETIME
+ ts = dtype.unit[0]
+ tz = dtype.tz if dtype.tz else ""
+ f_string = "ts{ts}:{tz}".format(ts=ts, tz=tz)
+ return kind, bit_width, f_string, Endianness.NATIVE
+ elif pa.types.is_dictionary(dtype):
+ kind = DtypeKind.CATEGORICAL
+ f_string = "L"
+ return kind, bit_width, f_string, Endianness.NATIVE
+ else:
+ kind, f_string = _PYARROW_KINDS.get(dtype, (None, None))
+ if kind is None:
+ raise ValueError(
+ f"Data type {dtype} not supported by interchange protocol")
+
+ return kind, bit_width, f_string, Endianness.NATIVE
+
+ @property
+ def describe_categorical(self) -> CategoricalDescription:
+ """
+ If the dtype is categorical, there are two options:
+ - There are only values in the data buffer.
+ - There is a separate non-categorical Column encoding categorical
+ values.
+
+ Raises TypeError if the dtype is not categorical
+
+ Returns a dictionary describing how to interpret the
+ data buffer:
+ - "is_ordered" : bool, whether the ordering of dictionary indices
+ is semantically meaningful.
+ - "is_dictionary" : bool, whether a mapping of
+ categorical values to other objects exists
+ - "categories" : Column representing the (implicit) mapping of
+ indices to category values (e.g. an array of
+ cat1, cat2, ...). None if not a dictionary-style
+ categorical.
+
+ TBD: are there any other in-memory representations that are needed?
+ """
+ arr = self._col
+ if not pa.types.is_dictionary(arr.type):
+ raise TypeError(
+ "describe_categorical only works on a column with "
+ "categorical dtype!"
+ )
+
+ return {
+ "is_ordered": self._col.type.ordered,
+ "is_dictionary": True,
+ "categories": _PyArrowColumn(arr.dictionary),
+ }
+
+ @property
+ def describe_null(self) -> Tuple[ColumnNullType, Any]:
+ """
+ Return the missing value (or "null") representation the column dtype
+ uses, as a tuple ``(kind, value)``.
+
+ Value : if kind is "sentinel value", the actual value. If kind is a bit
+ mask or a byte mask, the value (0 or 1) indicating a missing value.
+ None otherwise.
+ """
+ # In case of no missing values, we need to set ColumnNullType to
+ # NON_NULLABLE, as in the current __dataframe__ protocol bit/byte
+ # masks cannot be None
+ if self.null_count == 0:
+ return ColumnNullType.NON_NULLABLE, None
+ else:
+ return ColumnNullType.USE_BITMASK, 0
+
+ @property
+ def null_count(self) -> int:
+ """
+ Number of null elements, if known.
+
+ Note: Arrow uses -1 to indicate "unknown", but None seems cleaner.
+ """
+ arrow_null_count = self._col.null_count
+ n = arrow_null_count if arrow_null_count != -1 else None
+ return n
+
+ @property
+ def metadata(self) -> Dict[str, Any]:
+ """
+ The metadata for the column. See `DataFrame.metadata` for more details.
+ """
+ pass
+
+ def num_chunks(self) -> int:
+ """
+ Return the number of chunks the column consists of.
+ """
+ return 1
+
+ def get_chunks(
+ self, n_chunks: Optional[int] = None
+ ) -> Iterable[_PyArrowColumn]:
+ """
+ Return an iterator yielding the chunks.
+
+ See `DataFrame.get_chunks` for details on ``n_chunks``.
+ """
+ if n_chunks and n_chunks > 1:
+ chunk_size = self.size() // n_chunks
+ if self.size() % n_chunks != 0:
+ chunk_size += 1
+
+ array = self._col
+ for start in range(0, chunk_size * n_chunks, chunk_size):
+ yield _PyArrowColumn(
+ array.slice(start, chunk_size), self._allow_copy
+ )
+ else:
+ yield self
+
+ def get_buffers(self) -> ColumnBuffers:
+ """
+ Return a dictionary containing the underlying buffers.
+
+ The returned dictionary has the following contents:
+
+ - "data": a two-element tuple whose first element is a buffer
+ containing the data and whose second element is the data
+ buffer's associated dtype.
+ - "validity": a two-element tuple whose first element is a buffer
+ containing mask values indicating missing data and
+ whose second element is the mask value buffer's
+ associated dtype. None if the null representation is
+ not a bit or byte mask.
+ - "offsets": a two-element tuple whose first element is a buffer
+ containing the offset values for variable-size binary
+ data (e.g., variable-length strings) and whose second
+ element is the offsets buffer's associated dtype. None
+ if the data buffer does not have an associated offsets
+ buffer.
+ """
+ buffers: ColumnBuffers = {
+ "data": self._get_data_buffer(),
+ "validity": None,
+ "offsets": None,
+ }
+
+ try:
+ buffers["validity"] = self._get_validity_buffer()
+ except NoBufferPresent:
+ pass
+
+ try:
+ buffers["offsets"] = self._get_offsets_buffer()
+ except NoBufferPresent:
+ pass
+
+ return buffers
+
+ def _get_data_buffer(
+ self,
+ ) -> Tuple[_PyArrowBuffer, Any]: # Any is for self.dtype tuple
+ """
+ Return the buffer containing the data and the buffer's
+ associated dtype.
+ """
+ array = self._col
+ dtype = self.dtype
+
+ # In case of dictionary arrays, use the indices
+ # to define the data buffer; the categories are
+ # transferred through describe_categorical
+ if pa.types.is_dictionary(array.type):
+ array = array.indices
+ dtype = _PyArrowColumn(array).dtype
+
+ n = len(array.buffers())
+ if n == 2:
+ return _PyArrowBuffer(array.buffers()[1]), dtype
+ elif n == 3:
+ return _PyArrowBuffer(array.buffers()[2]), dtype
+
+ def _get_validity_buffer(self) -> Tuple[_PyArrowBuffer, Any]:
+ """
+ Return the buffer containing the mask values indicating missing data
+ and the buffer's associated dtype.
+ Raises NoBufferPresent if null representation is not a bit or byte
+ mask.
+ """
+ # Define the dtype of the returned buffer
+ dtype = (DtypeKind.BOOL, 1, "b", Endianness.NATIVE)
+ array = self._col
+ buff = array.buffers()[0]
+ if buff:
+ return _PyArrowBuffer(buff), dtype
+ else:
+ raise NoBufferPresent(
+ "There are no missing values so "
+ "does not have a separate mask")
+
+ def _get_offsets_buffer(self) -> Tuple[_PyArrowBuffer, Any]:
+ """
+ Return the buffer containing the offset values for variable-size binary
+ data (e.g., variable-length strings) and the buffer's associated dtype.
+ Raises NoBufferPresent if the data buffer does not have an associated
+ offsets buffer.
+ """
+ array = self._col
+ n = len(array.buffers())
+ if n == 2:
+ raise NoBufferPresent(
+ "This column has a fixed-length dtype so "
+ "it does not have an offsets buffer"
+ )
+ elif n == 3:
+ # Define the dtype of the returned buffer
+ dtype = self._col.type
+ if pa.types.is_large_string(dtype):
+ dtype = (DtypeKind.INT, 64, "l", Endianness.NATIVE)
+ else:
+ dtype = (DtypeKind.INT, 32, "i", Endianness.NATIVE)
+ return _PyArrowBuffer(array.buffers()[1]), dtype
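A short sketch of _PyArrowColumn on a float array with one null (hypothetical data; in normal use the column is obtained via __dataframe__):

    import pyarrow as pa
    from pyarrow.interchange.column import _PyArrowColumn, DtypeKind

    col = _PyArrowColumn(pa.array([1.5, None, 3.0]))

    kind, bit_width, fmt, endianness = col.dtype
    assert kind == DtypeKind.FLOAT and bit_width == 64 and fmt == "g"
    assert col.null_count == 1

    buffers = col.get_buffers()
    data_buf, data_dtype = buffers["data"]  # the values buffer
    assert buffers["validity"] is not None  # bit mask, since a null is present
    assert buffers["offsets"] is None       # fixed-width dtype, no offsets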
diff --git a/python/pyarrow/interchange/dataframe.py b/python/pyarrow/interchange/dataframe.py
new file mode 100644
index 0000000000..d0717e02e8
--- /dev/null
+++ b/python/pyarrow/interchange/dataframe.py
@@ -0,0 +1,202 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from __future__ import annotations
+from typing import (
+ Any,
+ Iterable,
+ Optional,
+ Sequence,
+)
+
+import pyarrow as pa
+
+from pyarrow.interchange.column import _PyArrowColumn
+
+
+class _PyArrowDataFrame:
+ """
+ A data frame class, with only the methods required by the interchange
+ protocol defined.
+
+ A "data frame" represents an ordered collection of named columns.
+ A column's "name" must be a unique string.
+ Columns may be accessed by name or by position.
+
+ This could be a public data frame class, or an object with the methods and
+ attributes defined on this DataFrame class could be returned from the
+ ``__dataframe__`` method of a public data frame class in a library adhering
+ to the dataframe interchange protocol specification.
+ """
+
+ def __init__(
+ self, df: pa.Table, nan_as_null: bool = False, allow_copy: bool = True
+ ) -> None:
+ """
+ Constructor - an instance of this (private) class is returned from
+ `pa.Table.__dataframe__`.
+ """
+ self._df = df
+ # ``nan_as_null`` is a keyword intended for the consumer to tell the
+ # producer to overwrite null values in the data with ``NaN`` (or
+ # ``NaT``).
+ if nan_as_null is True:
+ raise RuntimeError(
+ "nan_as_null=True currently has no effect, "
+ "use the default nan_as_null=False"
+ )
+ self._nan_as_null = nan_as_null
+ self._allow_copy = allow_copy
+
+ def __dataframe__(
+ self, nan_as_null: bool = False, allow_copy: bool = True
+ ) -> _PyArrowDataFrame:
+ """
+ Construct a new exchange object, potentially changing the parameters.
+ ``nan_as_null`` is a keyword intended for the consumer to tell the
+ producer to overwrite null values in the data with ``NaN``.
+ It is intended for cases where the consumer does not support the bit
+ mask or byte mask that is the producer's native representation.
+ ``allow_copy`` is a keyword that defines whether or not the library is
+ allowed to make a copy of the data. For example, copying data would be
+ necessary if a library supports strided buffers, given that this
+ protocol specifies contiguous buffers.
+ """
+ return _PyArrowDataFrame(self._df, nan_as_null, allow_copy)
+
+ @property
+ def metadata(self) -> dict[str, Any]:
+ """
+ The metadata for the data frame, as a dictionary with string keys. The
+ contents of `metadata` may be anything, they are meant for a library
+ to store information that it needs to, e.g., roundtrip losslessly or
+ for two implementations to share data that is not (yet) part of the
+ interchange protocol specification. To avoid collisions with other
+ entries, please name the keys with the name of the library
+ followed by a period and the desired name, e.g., ``pandas.indexcol``.
+ """
+ # The metadata for the data frame, as a dictionary with string keys.
+ # Add schema metadata here (pandas metadata or custom metadata)
+ if self._df.schema.metadata:
+ schema_metadata = {"pyarrow." + k.decode('utf8'): v.decode('utf8')
+ for k, v in self._df.schema.metadata.items()}
+ return schema_metadata
+ else:
+ return {}
+
+ def num_columns(self) -> int:
+ """
+ Return the number of columns in the DataFrame.
+ """
+ return self._df.num_columns
+
+ def num_rows(self) -> int:
+ """
+ Return the number of rows in the DataFrame, if available.
+ """
+ return self._df.num_rows
+
+ def num_chunks(self) -> int:
+ """
+ Return the number of chunks the DataFrame consists of.
+ """
+ # A pyarrow.Table can have columns with different numbers
+ # of chunks, so we take the number of batches that
+ # .to_batches() returns, as it uses the minimum chunk size
+ # across all the columns (to_batches is a zero-copy method)
+ batches = self._df.to_batches()
+ return len(batches)
+
+ def column_names(self) -> Iterable[str]:
+ """
+ Return an iterator yielding the column names.
+ """
+ return self._df.column_names
+
+ def get_column(self, i: int) -> _PyArrowColumn:
+ """
+ Return the column at the indicated position.
+ """
+ return _PyArrowColumn(self._df.column(i),
+ allow_copy=self._allow_copy)
+
+ def get_column_by_name(self, name: str) -> _PyArrowColumn:
+ """
+ Return the column whose name is the indicated name.
+ """
+ return _PyArrowColumn(self._df.column(name),
+ allow_copy=self._allow_copy)
+
+ def get_columns(self) -> Iterable[_PyArrowColumn]:
+ """
+ Return an iterator yielding the columns.
+ """
+ return [
+ _PyArrowColumn(col, allow_copy=self._allow_copy)
+ for col in self._df.columns
+ ]
+
+ def select_columns(self, indices: Sequence[int]) -> _PyArrowDataFrame:
+ """
+ Create a new DataFrame by selecting a subset of columns by index.
+ """
+ return _PyArrowDataFrame(
+ self._df.select(list(indices)), self._nan_as_null, self._allow_copy
+ )
+
+ def select_columns_by_name(
+ self, names: Sequence[str]
+ ) -> _PyArrowDataFrame:
+ """
+ Create a new DataFrame by selecting a subset of columns by name.
+ """
+ return _PyArrowDataFrame(
+ self._df.select(list(names)), self._nan_as_null, self._allow_copy
+ )
+
+ def get_chunks(
+ self, n_chunks: Optional[int] = None
+ ) -> Iterable[_PyArrowDataFrame]:
+ """
+ Return an iterator yielding the chunks.
+
+ By default (None), yields the chunks that the data is stored as by the
+ producer. If given, ``n_chunks`` must be a multiple of
+ ``self.num_chunks()``, meaning the producer must subdivide each chunk
+ before yielding it.
+
+ Note that the producer must ensure that all columns are chunked the
+ same way.
+ """
+ if n_chunks and n_chunks > 1:
+ chunk_size = self.num_rows() // n_chunks
+ if self.num_rows() % n_chunks != 0:
+ chunk_size += 1
+ batches = self._df.to_batches(max_chunksize=chunk_size)
+ # In case the chunk size is such that the resulting list
+ # is one chunk short of n_chunks, append an empty chunk
+ if len(batches) == n_chunks - 1:
+ batches.append(pa.record_batch([[]], schema=self._df.schema))
+ else:
+ batches = self._df.to_batches()
+
+ iterator_tables = [_PyArrowDataFrame(
+ pa.Table.from_batches([batch]), self._nan_as_null, self._allow_copy
+ )
+ for batch in batches
+ ]
+ return iterator_tables
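A sketch of the chunk subdivision implemented above, constructing _PyArrowDataFrame directly (hypothetical data; normally obtained via Table.__dataframe__):

    import pyarrow as pa
    from pyarrow.interchange.dataframe import _PyArrowDataFrame

    table = pa.table({"a": [1, 2, 3, 4], "b": [0.5, 1.5, 2.5, 3.5]})
    df = _PyArrowDataFrame(table)

    assert df.num_columns() == 2 and df.num_rows() == 4
    assert list(df.column_names()) == ["a", "b"]

    # Ask the producer to subdivide the data into two chunks of two rows.
    chunks = list(df.get_chunks(n_chunks=2))
    assert len(chunks) == 2
    assert all(chunk.num_rows() == 2 for chunk in chunks)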
diff --git a/python/pyarrow/interchange/from_dataframe.py b/python/pyarrow/interchange/from_dataframe.py
new file mode 100644
index 0000000000..204530a335
--- /dev/null
+++ b/python/pyarrow/interchange/from_dataframe.py
@@ -0,0 +1,567 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from __future__ import annotations
+
+from typing import (
+ Any,
+)
+
+from pyarrow.interchange.column import (
+ DtypeKind,
+ ColumnBuffers,
+ ColumnNullType,
+)
+
+import pyarrow as pa
+import re
+
+import pyarrow.compute as pc
+from pyarrow.interchange.column import Dtype
+
+
+# A typing protocol could be added later to let Mypy validate code using
+# `from_dataframe` better.
+DataFrameObject = Any
+ColumnObject = Any
+BufferObject = Any
+
+
+_PYARROW_DTYPES: dict[DtypeKind, dict[int, Any]] = {
+ DtypeKind.INT: {8: pa.int8(),
+ 16: pa.int16(),
+ 32: pa.int32(),
+ 64: pa.int64()},
+ DtypeKind.UINT: {8: pa.uint8(),
+ 16: pa.uint16(),
+ 32: pa.uint32(),
+ 64: pa.uint64()},
+ DtypeKind.FLOAT: {16: pa.float16(),
+ 32: pa.float32(),
+ 64: pa.float64()},
+ DtypeKind.BOOL: {8: pa.uint8()},
+ DtypeKind.STRING: {8: pa.string()},
+}
+
+
+def from_dataframe(df: DataFrameObject, allow_copy=True) -> pa.Table:
+ """
+ Build a ``pa.Table`` from any DataFrame supporting the interchange
+ protocol.
+
+ Parameters
+ ----------
+ df : DataFrameObject
+ Object supporting the interchange protocol, i.e. `__dataframe__`
+ method.
+ allow_copy : bool, default: True
+ Whether to allow copying the memory to perform the conversion
+ (if false then zero-copy approach is requested).
+
+ Returns
+ -------
+ pa.Table
+ """
+ if isinstance(df, pa.Table):
+ return df
+
+ if not hasattr(df, "__dataframe__"):
+ raise ValueError("`df` does not support __dataframe__")
+
+ return _from_dataframe(df.__dataframe__(allow_copy=allow_copy),
+ allow_copy=allow_copy)
+
+
+def _from_dataframe(df: DataFrameObject, allow_copy=True):
+ """
+ Build a ``pa.Table`` from the DataFrame interchange object.
+
+ Parameters
+ ----------
+ df : DataFrameObject
+ Object supporting the interchange protocol, i.e. `__dataframe__`
+ method.
+ allow_copy : bool, default: True
+ Whether to allow copying the memory to perform the conversion
+ (if false then zero-copy approach is requested).
+
+ Returns
+ -------
+ pa.Table
+ """
+ batches = []
+ for chunk in df.get_chunks():
+ batch = protocol_df_chunk_to_pyarrow(chunk, allow_copy)
+ batches.append(batch)
+
+ table = pa.Table.from_batches(batches)
+ return table
+
+
+def protocol_df_chunk_to_pyarrow(
+ df: DataFrameObject,
+ allow_copy: bool = True
+) -> pa.RecordBatch:
+ """
+ Convert interchange protocol chunk to ``pa.RecordBatch``.
+
+ Parameters
+ ----------
+ df : DataFrameObject
+ Object supporting the interchange protocol, i.e. `__dataframe__`
+ method.
+ allow_copy : bool, default: True
+ Whether to allow copying the memory to perform the conversion
+ (if false then zero-copy approach is requested).
+
+ Returns
+ -------
+ pa.RecordBatch
+ """
+ # We need a dict of columns here, with each column being a pa.Array
+ columns: dict[str, pa.Array] = {}
+ for name in df.column_names():
+ if not isinstance(name, str):
+ raise ValueError(f"Column {name} is not a string")
+ if name in columns:
+ raise ValueError(f"Column {name} is not unique")
+ col = df.get_column_by_name(name)
+ dtype = col.dtype[0]
+ if dtype in (
+ DtypeKind.INT,
+ DtypeKind.UINT,
+ DtypeKind.FLOAT,
+ DtypeKind.STRING,
+ DtypeKind.DATETIME,
+ ):
+ columns[name] = column_to_array(col, allow_copy)
+ elif dtype == DtypeKind.BOOL:
+ columns[name] = bool_column_to_array(col, allow_copy)
+ elif dtype == DtypeKind.CATEGORICAL:
+ columns[name] = categorical_column_to_dictionary(col, allow_copy)
+ else:
+ raise NotImplementedError(f"Data type {dtype} not handled yet")
+
+ return pa.RecordBatch.from_pydict(columns)
+
+
+def column_to_array(
+ col: ColumnObject,
+ allow_copy: bool = True,
+) -> pa.Array:
+ """
+ Convert a column holding one of the primitive dtypes to a PyArrow array.
+ A primitive type is one of: int, uint, float, bool (1 bit).
+
+ Parameters
+ ----------
+ col : ColumnObject
+ allow_copy : bool, default: True
+ Whether to allow copying the memory to perform the conversion
+ (if false then zero-copy approach is requested).
+
+ Returns
+ -------
+ pa.Array
+ """
+ buffers = col.get_buffers()
+ data = buffers_to_array(buffers, col.size(),
+ col.describe_null,
+ col.offset,
+ allow_copy)
+ return data
+
+
+def bool_column_to_array(
+ col: ColumnObject,
+ allow_copy: bool = True,
+) -> pa.Array:
+ """
+ Convert a column holding boolean dtype to a PyArrow array.
+
+ Parameters
+ ----------
+ col : ColumnObject
+ allow_copy : bool, default: True
+ Whether to allow copying the memory to perform the conversion
+ (if false then zero-copy approach is requested).
+
+ Returns
+ -------
+ pa.Array
+ """
+ if not allow_copy:
+ raise RuntimeError(
+ "Boolean column will be casted from uint8 and a copy "
+ "is required which is forbidden by allow_copy=False"
+ )
+
+ buffers = col.get_buffers()
+ data = buffers_to_array(buffers, col.size(),
+ col.describe_null,
+ col.offset)
+ data = pc.cast(data, pa.bool_())
+
+ return data
+
+
+def categorical_column_to_dictionary(
+ col: ColumnObject,
+ allow_copy: bool = True,
+) -> pa.DictionaryArray:
+ """
+ Convert a column holding categorical data to a pa.DictionaryArray.
+
+ Parameters
+ ----------
+ col : ColumnObject
+ allow_copy : bool, default: True
+ Whether to allow copying the memory to perform the conversion
+ (if false then zero-copy approach is requested).
+
+ Returns
+ -------
+ pa.DictionaryArray
+ """
+ if not allow_copy:
+ raise RuntimeError(
+ "Categorical column will be casted from uint8 and a copy "
+ "is required which is forbidden by allow_copy=False"
+ )
+
+ categorical = col.describe_categorical
+
+ if not categorical["is_dictionary"]:
+ raise NotImplementedError(
+ "Non-dictionary categoricals not supported yet")
+
+ cat_column = categorical["categories"]
+ dictionary = column_to_array(cat_column)
+
+ buffers = col.get_buffers()
+ indices = buffers_to_array(buffers, col.size(),
+ col.describe_null,
+ col.offset)
+
+ # Constructing a pa.DictionaryArray
+ dict_array = pa.DictionaryArray.from_arrays(indices, dictionary)
+
+ return dict_array
+
+
+def parse_datetime_format_str(format_str):
+ """Parse datetime `format_str` to interpret the `data`."""
+
+ # timestamp 'ts{unit}:tz'
+ timestamp_meta = re.match(r"ts([smun]):(.*)", format_str)
+ if timestamp_meta:
+ unit, tz = timestamp_meta.group(1), timestamp_meta.group(2)
+ if unit != "s":
+ # the format string describes only the first letter of the unit, so
+ # add one extra letter to convert the unit to numpy-style:
+ # 'm' -> 'ms', 'u' -> 'us', 'n' -> 'ns'
+ unit += "s"
+
+ return unit, tz
+
+ raise NotImplementedError(f"DateTime kind is not supported: {format_str}")
+
+
+def map_date_type(data_type):
+ """Map column date type to pyarrow date type. """
+ kind, bit_width, f_string, _ = data_type
+
+ if kind == DtypeKind.DATETIME:
+ unit, tz = parse_datetime_format_str(f_string)
+ return pa.timestamp(unit, tz=tz)
+ else:
+ pa_dtype = _PYARROW_DTYPES.get(kind, {}).get(bit_width, None)
+
+ # Error if dtype is not supported
+ if pa_dtype:
+ return pa_dtype
+ else:
+ raise NotImplementedError(
+ f"Conversion for {data_type} is not yet supported.")
+
+
+def buffers_to_array(
+ buffers: ColumnBuffers,
+ length: int,
+ describe_null: ColumnNullType,
+ offset: int = 0,
+ allow_copy: bool = True,
+) -> pa.Array:
+ """
+ Build a PyArrow array from the passed buffers.
+
+ Parameters
+ ----------
+ buffers : ColumnBuffers
+ Dictionary containing tuples of underlying buffers and
+ their associated dtypes.
+ length : int
+ The number of values in the array.
+ describe_null : ColumnNullType
+ Null representation the column dtype uses,
+ as a tuple ``(kind, value)``.
+ offset : int, default: 0
+ Number of elements to offset from the start of the buffer.
+ allow_copy : bool, default: True
+ Whether to allow copying the memory to perform the conversion
+ (if false then zero-copy approach is requested).
+
+ Returns
+ -------
+ pa.Array
+
+ Notes
+ -----
+ The returned array doesn't own the memory. The caller of this function
+ is responsible for keeping the memory owner object alive as long as
+ the returned PyArrow array is being used.
+ """
+ data_buff, data_type = buffers["data"]
+ try:
+ validity_buff, validity_dtype = buffers["validity"]
+ except TypeError:
+ validity_buff = None
+ try:
+ offset_buff, offset_dtype = buffers["offsets"]
+ except TypeError:
+ offset_buff = None
+
+ # Construct a pyarrow Buffer
+ data_pa_buffer = pa.foreign_buffer(data_buff.ptr, data_buff.bufsize,
+ base=data_buff)
+
+ # Construct a validity pyarrow Buffer, if applicable
+ if validity_buff:
+ validity_pa_buff = validity_buffer_from_mask(validity_buff,
+ validity_dtype,
+ describe_null,
+ length,
+ offset,
+ allow_copy)
+ else:
+ validity_pa_buff = validity_buffer_nan_sentinel(data_pa_buffer,
+ data_type,
+ describe_null,
+ length,
+ offset,
+ allow_copy)
+
+ # Construct a pyarrow Array from buffers
+ data_dtype = map_date_type(data_type)
+
+ if offset_buff:
+ _, offset_bit_width, _, _ = offset_dtype
+ # If an offset buffer exists, construct an offset pyarrow Buffer
+ # and add it to the construction of an array
+ offset_pa_buffer = pa.foreign_buffer(offset_buff.ptr,
+ offset_buff.bufsize,
+ base=offset_buff)
+
+ if data_type[2] == 'U':
+ string_type = pa.large_string()
+ else:
+ if offset_bit_width == 64:
+ string_type = pa.large_string()
+ else:
+ string_type = pa.string()
+ array = pa.Array.from_buffers(
+ string_type,
+ length,
+ [validity_pa_buff, offset_pa_buffer, data_pa_buffer],
+ offset=offset,
+ )
+ else:
+ array = pa.Array.from_buffers(
+ data_dtype,
+ length,
+ [validity_pa_buff, data_pa_buffer],
+ offset=offset,
+ )
+
+ return array
+
+
+def validity_buffer_from_mask(
+ validity_buff: BufferObject,
+ validity_dtype: Dtype,
+ describe_null: ColumnNullType,
+ length: int,
+ offset: int = 0,
+ allow_copy: bool = True,
+) -> pa.Buffer:
+ """
+ Build a PyArrow buffer from the passed mask buffer.
+
+ Parameters
+ ----------
+ validity_buff : BufferObject
+ The underlying validity (mask) buffer.
+ validity_dtype : Dtype
+ Dtype description as a tuple ``(kind, bit-width, format string,
+ endianness)``.
+ describe_null : ColumnNullType
+ Null representation the column dtype uses,
+ as a tuple ``(kind, value)``
+ length : int
+ The number of values in the array.
+ offset : int, default: 0
+ Number of elements to offset from the start of the buffer.
+ allow_copy : bool, default: True
+ Whether to allow copying the memory to perform the conversion
+ (if false then zero-copy approach is requested).
+
+ Returns
+ -------
+ pa.Buffer
+ """
+ null_kind, sentinel_val = describe_null
+ validity_kind, _, _, _ = validity_dtype
+ assert validity_kind == DtypeKind.BOOL
+
+ if null_kind == ColumnNullType.NON_NULLABLE:
+ # A sliced array can have a NON_NULLABLE ColumnNullType because
+ # there are no missing values in that slice even though the bitmask
+ # exists; the validity buffer must be set to None in this case
+ return None
+
+ elif null_kind == ColumnNullType.USE_BYTEMASK or (
+ null_kind == ColumnNullType.USE_BITMASK and sentinel_val == 1
+ ):
+ buff = pa.foreign_buffer(validity_buff.ptr,
+ validity_buff.bufsize,
+ base=validity_buff)
+
+ if null_kind == ColumnNullType.USE_BYTEMASK:
+ if not allow_copy:
+ raise RuntimeError(
+ "To create a bitmask a copy of the data is "
+ "required which is forbidden by allow_copy=False"
+ )
+ mask = pa.Array.from_buffers(pa.int8(), length,
+ [None, buff],
+ offset=offset)
+ mask_bool = pc.cast(mask, pa.bool_())
+ else:
+ mask_bool = pa.Array.from_buffers(pa.bool_(), length,
+ [None, buff],
+ offset=offset)
+
+ if sentinel_val == 1:
+ mask_bool = pc.invert(mask_bool)
+
+ return mask_bool.buffers()[1]
+
+ elif null_kind == ColumnNullType.USE_BITMASK and sentinel_val == 0:
+ return pa.foreign_buffer(validity_buff.ptr,
+ validity_buff.bufsize,
+ base=validity_buff)
+ else:
+ raise NotImplementedError(
+ f"{describe_null} null representation is not yet supported.")
+
+
+def validity_buffer_nan_sentinel(
+ data_pa_buffer: BufferObject,
+ data_type: Dtype,
+ describe_null: ColumnNullType,
+ length: int,
+ offset: int = 0,
+ allow_copy: bool = True,
+) -> pa.Buffer:
+ """
+ Build a PyArrow buffer from NaN or sentinel values.
+
+ Parameters
+ ----------
+ data_pa_buffer : pa.Buffer
+ PyArrow buffer for the column data.
+ data_type : Dtype
+ Dtype description as a tuple ``(kind, bit-width, format string,
+ endianness)``.
+ describe_null : ColumnNullType
+ Null representation the column dtype uses,
+ as a tuple ``(kind, value)``
+ length : int
+ The number of values in the array.
+ offset : int, default: 0
+ Number of elements to offset from the start of the buffer.
+ allow_copy : bool, default: True
+ Whether to allow copying the memory to perform the conversion
+ (if false then zero-copy approach is requested).
+
+ Returns
+ -------
+ pa.Buffer
+ """
+ kind, bit_width, _, _ = data_type
+ data_dtype = map_date_type(data_type)
+ null_kind, sentinel_val = describe_null
+
+ # Check for float NaN values
+ if null_kind == ColumnNullType.USE_NAN:
+ if not allow_copy:
+ raise RuntimeError(
+ "To create a bitmask a copy of the data is "
+ "required which is forbidden by allow_copy=False"
+ )
+
+ if kind == DtypeKind.FLOAT and bit_width == 16:
+ # 'pyarrow.compute.is_nan' kernel not yet implemented
+ # for float16
+ raise NotImplementedError(
+ f"{data_type} with {null_kind} is not yet supported.")
+ else:
+ pyarrow_data = pa.Array.from_buffers(
+ data_dtype,
+ length,
+ [None, data_pa_buffer],
+ offset=offset,
+ )
+ mask = pc.is_nan(pyarrow_data)
+ mask = pc.invert(mask)
+ return mask.buffers()[1]
+
+ # Check for sentinel values
+ elif null_kind == ColumnNullType.USE_SENTINEL:
+ if not allow_copy:
+ raise RuntimeError(
+ "To create a bitmask a copy of the data is "
+ "required which is forbidden by allow_copy=False"
+ )
+
+ if kind == DtypeKind.DATETIME:
+ sentinel_dtype = pa.int64()
+ else:
+ sentinel_dtype = data_dtype
+ pyarrow_data = pa.Array.from_buffers(sentinel_dtype,
+ length,
+ [None, data_pa_buffer],
+ offset=offset)
+ sentinel_arr = pc.equal(pyarrow_data, sentinel_val)
+ mask_bool = pc.invert(sentinel_arr)
+ return mask_bool.buffers()[1]
+
+ elif null_kind == ColumnNullType.NON_NULLABLE:
+ pass
+ else:
+ raise NotImplementedError(
+ f"{describe_null} null representation is not yet supported.")
diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi
index c97ad027dd..35492d4f64 100644
--- a/python/pyarrow/table.pxi
+++ b/python/pyarrow/table.pxi
@@ -2861,6 +2861,36 @@ cdef class Table(_PandasConvertible):
return self.column(key)
+ # ----------------------------------------------------------------------
+ def __dataframe__(self, nan_as_null: bool = False, allow_copy: bool = True):
+ """
+ Return the dataframe interchange object implementing the interchange protocol.
+ Parameters
+ ----------
+ nan_as_null : bool, default False
+ Whether to tell the DataFrame to overwrite null values in the data
+ with ``NaN`` (or ``NaT``).
+ allow_copy : bool, default True
+ Whether to allow memory copying when exporting. If set to False
+ it would cause non-zero-copy exports to fail.
+ Returns
+ -------
+ DataFrame interchange object
+ The object which consuming library can use to ingress the dataframe.
+ Notes
+ -----
+ Details on the interchange protocol:
+ https://data-apis.org/dataframe-protocol/latest/index.html
+ `nan_as_null` currently has no effect; once support for nullable extension
+ dtypes is added, this value should be propagated to columns.
+ """
+
+ from pyarrow.interchange.dataframe import _PyArrowDataFrame
+
+ return _PyArrowDataFrame(self, nan_as_null, allow_copy)
+
+ # ----------------------------------------------------------------------
+
def slice(self, offset=0, length=None):
"""
Compute zero-copy slice of this Table.
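One behavior worth noting for the new method: with allow_copy=False, exporting a column stored as multiple chunks fails, because combining chunks requires a copy (see _PyArrowColumn.__init__ above). A sketch with hypothetical data:

    import pyarrow as pa

    table = pa.table({"a": pa.chunked_array([[1, 2], [3, 4]])})
    exchange = table.__dataframe__(allow_copy=False)
    try:
        exchange.get_column_by_name("a")  # combining chunks would copy
    except RuntimeError as exc:
        print(exc)  # a copy is required but forbidden by allow_copy=False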
diff --git a/python/pyarrow/tests/interchange/__init__.py b/python/pyarrow/tests/interchange/__init__.py
new file mode 100644
index 0000000000..13a83393a9
--- /dev/null
+++ b/python/pyarrow/tests/interchange/__init__.py
@@ -0,0 +1,16 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
diff --git a/python/pyarrow/tests/interchange/test_conversion.py b/python/pyarrow/tests/interchange/test_conversion.py
new file mode 100644
index 0000000000..0680d9c4ec
--- /dev/null
+++ b/python/pyarrow/tests/interchange/test_conversion.py
@@ -0,0 +1,524 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from datetime import datetime as dt
+import numpy as np
+import pyarrow as pa
+from pyarrow.vendored.version import Version
+import pytest
+
+import pyarrow.interchange as pi
+from pyarrow.interchange.column import (
+ _PyArrowColumn,
+ ColumnNullType,
+ DtypeKind,
+)
+from pyarrow.interchange.from_dataframe import _from_dataframe
+
+try:
+ import pandas as pd
+ # import pandas.testing as tm
+except ImportError:
+ pass
+
+
+@pytest.mark.parametrize("unit", ['s', 'ms', 'us', 'ns'])
+@pytest.mark.parametrize("tz", ['', 'America/New_York', '+07:30', '-04:30'])
+def test_datetime(unit, tz):
+ dt_arr = [dt(2007, 7, 13), dt(2007, 7, 14), None]
+ table = pa.table({"A": pa.array(dt_arr, type=pa.timestamp(unit, tz=tz))})
+ col = table.__dataframe__().get_column_by_name("A")
+
+ assert col.size() == 3
+ assert col.offset == 0
+ assert col.null_count == 1
+ assert col.dtype[0] == DtypeKind.DATETIME
+ assert col.describe_null == (ColumnNullType.USE_BITMASK, 0)
+
+
+@pytest.mark.parametrize(
+ ["test_data", "kind"],
+ [
+ (["foo", "bar"], 21),
+ ([1.5, 2.5, 3.5], 2),
+ ([1, 2, 3, 4], 0),
+ ],
+)
+def test_array_to_pyarrowcolumn(test_data, kind):
+ arr = pa.array(test_data)
+ arr_column = _PyArrowColumn(arr)
+
+ assert arr_column._col == arr
+ assert arr_column.size() == len(test_data)
+ assert arr_column.dtype[0] == kind
+ assert arr_column.num_chunks() == 1
+ assert arr_column.null_count == 0
+ assert arr_column.get_buffers()["validity"] is None
+ assert len(list(arr_column.get_chunks())) == 1
+
+ for chunk in arr_column.get_chunks():
+ assert chunk == arr_column
+
+
+def test_offset_of_sliced_array():
+ arr = pa.array([1, 2, 3, 4])
+ arr_sliced = arr.slice(2, 2)
+
+ table = pa.table([arr], names=["arr"])
+ table_sliced = pa.table([arr_sliced], names=["arr_sliced"])
+
+ col = table_sliced.__dataframe__().get_column(0)
+ assert col.offset == 2
+
+ result = _from_dataframe(table_sliced.__dataframe__())
+ assert table_sliced.equals(result)
+ assert not table.equals(result)
+
+ # pandas hardcodes offset to 0:
+ # https://github.com/pandas-dev/pandas/blob/5c66e65d7b9fef47ccb585ce2fd0b3ea18dc82ea/pandas/core/interchange/from_dataframe.py#L247
+ # so conversion to pandas can't be tested currently
+
+ # df = pandas_from_dataframe(table)
+ # df_sliced = pandas_from_dataframe(table_sliced)
+
+ # tm.assert_series_equal(df["arr"][2:4], df_sliced["arr_sliced"],
+ # check_index=False, check_names=False)
+
+
+# Currently errors during string conversion
+# as col.size is called as a property, not a method, in pandas
+# see L255-L257 in pandas/core/interchange/from_dataframe.py
+@pytest.mark.pandas
+def test_categorical_roundtrip():
+ pytest.skip("Bug in pandas implementation")
+
+ if Version(pd.__version__) < Version("1.5.0"):
+ pytest.skip("__dataframe__ added to pandas in 1.5.0")
+ arr = ["Mon", "Tue", "Mon", "Wed", "Mon", "Thu", "Fri", "Sat", "Sun"]
+ table = pa.table(
+ {"weekday": pa.array(arr).dictionary_encode()}
+ )
+
+ pandas_df = table.to_pandas()
+ result = pi.from_dataframe(pandas_df)
+
+ # Checking equality for the values
+ # As the dtype of the indices is changed from int32 in pa.Table
+ # to int64 in pandas interchange protocol implementation
+ assert result[0].chunk(0).dictionary == table[0].chunk(0).dictionary
+
+ table_protocol = table.__dataframe__()
+ result_protocol = result.__dataframe__()
+
+ assert table_protocol.num_columns() == result_protocol.num_columns()
+ assert table_protocol.num_rows() == result_protocol.num_rows()
+ assert table_protocol.num_chunks() == result_protocol.num_chunks()
+ assert table_protocol.column_names() == result_protocol.column_names()
+
+ col_table = table_protocol.get_column(0)
+ col_result = result_protocol.get_column(0)
+
+ assert col_result.dtype[0] == DtypeKind.CATEGORICAL
+ assert col_result.dtype[0] == col_table.dtype[0]
+ assert col_result.size() == col_table.size()
+ assert col_result.offset == col_table.offset
+
+ desc_cat_table = col_table.describe_categorical
+ desc_cat_result = col_result.describe_categorical
+
+ assert desc_cat_table["is_ordered"] == desc_cat_result["is_ordered"]
+ assert desc_cat_table["is_dictionary"] == desc_cat_result["is_dictionary"]
+ assert isinstance(desc_cat_result["categories"]._col, pa.Array)
+
+
+@pytest.mark.pandas
+@pytest.mark.parametrize(
+ "uint", [pa.uint8(), pa.uint16(), pa.uint32()]
+)
+@pytest.mark.parametrize(
+ "int", [pa.int8(), pa.int16(), pa.int32(), pa.int64()]
+)
+@pytest.mark.parametrize(
+ "float, np_float", [
+ # (pa.float16(), np.float16), #not supported by pandas
+ (pa.float32(), np.float32),
+ (pa.float64(), np.float64)
+ ]
+)
+def test_pandas_roundtrip(uint, int, float, np_float):
+ if Version(pd.__version__) < Version("1.5.0"):
+ pytest.skip("__dataframe__ added to pandas in 1.5.0")
+
+ arr = [1, 2, 3]
+ table = pa.table(
+ {
+ "a": pa.array(arr, type=uint),
+ "b": pa.array(arr, type=int),
+ "c": pa.array(np.array(arr, dtype=np_float), type=float),
+ }
+ )
+ from pandas.api.interchange import (
+ from_dataframe as pandas_from_dataframe
+ )
+ pandas_df = pandas_from_dataframe(table)
+ result = pi.from_dataframe(pandas_df)
+ assert table.equals(result)
+
+ table_protocol = table.__dataframe__()
+ result_protocol = result.__dataframe__()
+
+ assert table_protocol.num_columns() == result_protocol.num_columns()
+ assert table_protocol.num_rows() == result_protocol.num_rows()
+ assert table_protocol.num_chunks() == result_protocol.num_chunks()
+ assert table_protocol.column_names() == result_protocol.column_names()
+
+
+@pytest.mark.pandas
+def test_roundtrip_pandas_string():
+ # See https://github.com/pandas-dev/pandas/issues/50554
+ if Version(pd.__version__) < Version("1.6"):
+ pytest.skip(" Column.size() called as a method in pandas 2.0.0")
+
+ # large string is not supported by pandas implementation
+ table = pa.table({"a": pa.array(["a", "", "c"])})
+
+ from pandas.api.interchange import (
+ from_dataframe as pandas_from_dataframe
+ )
+ pandas_df = pandas_from_dataframe(table)
+ result = pi.from_dataframe(pandas_df)
+
+ assert result[0].to_pylist() == table[0].to_pylist()
+ assert pa.types.is_string(table[0].type)
+ assert pa.types.is_large_string(result[0].type)
+
+ table_protocol = table.__dataframe__()
+ result_protocol = result.__dataframe__()
+
+ assert table_protocol.num_columns() == result_protocol.num_columns()
+ assert table_protocol.num_rows() == result_protocol.num_rows()
+ assert table_protocol.num_chunks() == result_protocol.num_chunks()
+ assert table_protocol.column_names() == result_protocol.column_names()
+
+
+@pytest.mark.pandas
+def test_roundtrip_pandas_boolean():
+ if Version(pd.__version__) < Version("1.5.0"):
+ pytest.skip("__dataframe__ added to pandas in 1.5.0")
+
+ table = pa.table({"a": [True, False, True]})
+
+ from pandas.api.interchange import (
+ from_dataframe as pandas_from_dataframe
+ )
+ pandas_df = pandas_from_dataframe(table)
+ result = pi.from_dataframe(pandas_df)
+
+ assert table.equals(result)
+
+ table_protocol = table.__dataframe__()
+ result_protocol = result.__dataframe__()
+
+ assert table_protocol.num_columns() == result_protocol.num_columns()
+ assert table_protocol.num_rows() == result_protocol.num_rows()
+ assert table_protocol.num_chunks() == result_protocol.num_chunks()
+ assert table_protocol.column_names() == result_protocol.column_names()
+
+
+@pytest.mark.pandas
+@pytest.mark.parametrize("unit", ['s', 'ms', 'us', 'ns'])
+def test_roundtrip_pandas_datetime(unit):
+ if Version(pd.__version__) < Version("1.5.0"):
+ pytest.skip("__dataframe__ added to pandas in 1.5.0")
+ from datetime import datetime as dt
+
+ # timezones not included as they are not yet supported in
+ # the pandas implementation
+ dt_arr = [dt(2007, 7, 13), dt(2007, 7, 14), dt(2007, 7, 15)]
+ table = pa.table({"a": pa.array(dt_arr, type=pa.timestamp(unit))})
+
+ if Version(pd.__version__) < Version("1.6"):
+ # pandas < 2.0 always creates datetime64 in "ns"
+ # resolution
+ expected = pa.table({"a": pa.array(dt_arr, type=pa.timestamp('ns'))})
+ else:
+ expected = table
+
+ from pandas.api.interchange import (
+ from_dataframe as pandas_from_dataframe
+ )
+ pandas_df = pandas_from_dataframe(table)
+ result = pi.from_dataframe(pandas_df)
+
+ assert expected.equals(result)
+
+ expected_protocol = expected.__dataframe__()
+ result_protocol = result.__dataframe__()
+
+ assert expected_protocol.num_columns() == result_protocol.num_columns()
+ assert expected_protocol.num_rows() == result_protocol.num_rows()
+ assert expected_protocol.num_chunks() == result_protocol.num_chunks()
+ assert expected_protocol.column_names() == result_protocol.column_names()
+
+
+@pytest.mark.large_memory
+@pytest.mark.pandas
+def test_pandas_assertion_error_large_string():
+    # Test that an AssertionError is raised, as pandas does not support
+    # "U" type strings
+ if Version(pd.__version__) < Version("1.5.0"):
+ pytest.skip("__dataframe__ added to pandas in 1.5.0")
+
+    data = np.array([b'x'*1024]*(3*1024**2), dtype='object')  # 3 GB of data
+ arr = pa.array(data, type=pa.large_string())
+ table = pa.table([arr], names=["large_string"])
+
+ from pandas.api.interchange import (
+ from_dataframe as pandas_from_dataframe
+ )
+
+ with pytest.raises(AssertionError):
+ pandas_from_dataframe(table)
+
+
+@pytest.mark.pandas
+@pytest.mark.parametrize(
+ "np_float", [np.float32, np.float64]
+)
+def test_pandas_to_pyarrow_with_missing(np_float):
+ if Version(pd.__version__) < Version("1.5.0"):
+ pytest.skip("__dataframe__ added to pandas in 1.5.0")
+
+ np_array = np.array([0, np.nan, 2], dtype=np_float)
+ datetime_array = [None, dt(2007, 7, 14), dt(2007, 7, 15)]
+ df = pd.DataFrame({
+ "a": np_array, # float, ColumnNullType.USE_NAN
+ "dt": datetime_array # ColumnNullType.USE_SENTINEL
+ })
+ expected = pa.table({
+ "a": pa.array(np_array, from_pandas=True),
+ "dt": pa.array(datetime_array, type=pa.timestamp("ns"))
+ })
+ result = pi.from_dataframe(df)
+
+ assert result.equals(expected)
+
+
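+# An illustrative sketch (not part of the test suite): how a column encodes
+# missing values can be queried through ``describe_null``, which is what
+# distinguishes the USE_NAN and USE_SENTINEL cases noted above.
+def _describe_null_sketch():
+    df = pd.DataFrame({"a": np.array([0.0, np.nan, 2.0])})
+    col = df.__dataframe__().get_column_by_name("a")
+    # For a pandas float column this is expected to be
+    # (ColumnNullType.USE_NAN, None)
+    return col.describe_null
+
+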
+@pytest.mark.pandas
+def test_pandas_to_pyarrow_float16_with_missing():
+ if Version(pd.__version__) < Version("1.5.0"):
+ pytest.skip("__dataframe__ added to pandas in 1.5.0")
+
+    # np.float16 errors if pc.is_nan (pyarrow.compute.is_nan) is used:
+    # pyarrow.lib.ArrowNotImplementedError: Function 'is_nan' has no kernel
+    # matching input types (halffloat)
+ np_array = np.array([0, np.nan, 2], dtype=np.float16)
+ df = pd.DataFrame({"a": np_array})
+
+ with pytest.raises(NotImplementedError):
+ pi.from_dataframe(df)
+
+
+@pytest.mark.pandas
+def test_pandas_to_pyarrow_string_with_missing():
+ if Version(pd.__version__) < Version("1.5.0"):
+ pytest.skip("__dataframe__ added to pandas in 1.5.0")
+
+    # pandas uses int64 offsets for its string dtype, so the constructed
+    # pyarrow string column will always be of large_string data type
+    arr = {
+        "Y": ["a", "b", None],  # string, ColumnNullType.USE_BYTEMASK
+    }
+ df = pd.DataFrame(arr)
+ expected = pa.table(arr)
+ result = pi.from_dataframe(df)
+
+ assert result[0].to_pylist() == expected[0].to_pylist()
+ assert pa.types.is_string(expected[0].type)
+ assert pa.types.is_large_string(result[0].type)
+
+
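+# An illustrative sketch (not part of the test suite): a consumer that
+# needs 32-bit string offsets can cast the resulting large_string column
+# back to string, at the cost of an extra copy.
+def _downcast_large_string_sketch(result):
+    return result.set_column(0, "Y", result["Y"].cast(pa.string()))
+
+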
+@pytest.mark.pandas
+def test_pandas_to_pyarrow_categorical_with_missing():
+ if Version(pd.__version__) < Version("1.5.0"):
+ pytest.skip("__dataframe__ added to pandas in 1.5.0")
+
+ arr = ["Mon", "Tue", "Mon", "Wed", "Mon", "Thu", "Fri", "Sat", None]
+ df = pd.DataFrame(
+ {"weekday": arr}
+ )
+ df = df.astype("category")
+ result = pi.from_dataframe(df)
+
+ expected_dictionary = ["Fri", "Mon", "Sat", "Thu", "Tue", "Wed"]
+ expected_indices = pa.array([1, 4, 1, 5, 1, 3, 0, 2, None], type=pa.int8())
+
+ assert result[0].to_pylist() == arr
+ assert result[0].chunk(0).dictionary.to_pylist() == expected_dictionary
+ assert result[0].chunk(0).indices.equals(expected_indices)
+
+
+@pytest.mark.parametrize(
+ "uint", [pa.uint8(), pa.uint16(), pa.uint32()]
+)
+@pytest.mark.parametrize(
+ "int", [pa.int8(), pa.int16(), pa.int32(), pa.int64()]
+)
+@pytest.mark.parametrize(
+ "float, np_float", [
+ (pa.float16(), np.float16),
+ (pa.float32(), np.float32),
+ (pa.float64(), np.float64)
+ ]
+)
+@pytest.mark.parametrize("unit", ['s', 'ms', 'us', 'ns'])
+@pytest.mark.parametrize("tz", ['America/New_York', '+07:30', '-04:30'])
+@pytest.mark.parametrize("offset, length", [(0, 3), (0, 2), (1, 2), (2, 1)])
+def test_pyarrow_roundtrip(uint, int, float, np_float,
+ unit, tz, offset, length):
+
+ from datetime import datetime as dt
+ arr = [1, 2, None]
+ dt_arr = [dt(2007, 7, 13), None, dt(2007, 7, 15)]
+
+ table = pa.table(
+ {
+ "a": pa.array(arr, type=uint),
+ "b": pa.array(arr, type=int),
+ "c": pa.array(np.array(arr, dtype=np_float),
+ type=float, from_pandas=True),
+ "d": [True, False, True],
+ "e": [True, False, None],
+ "f": ["a", None, "c"],
+ "g": pa.array(dt_arr, type=pa.timestamp(unit, tz=tz))
+ }
+ )
+ table = table.slice(offset, length)
+ result = _from_dataframe(table.__dataframe__())
+
+ assert table.equals(result)
+
+ table_protocol = table.__dataframe__()
+ result_protocol = result.__dataframe__()
+
+ assert table_protocol.num_columns() == result_protocol.num_columns()
+ assert table_protocol.num_rows() == result_protocol.num_rows()
+ assert table_protocol.num_chunks() == result_protocol.num_chunks()
+ assert table_protocol.column_names() == result_protocol.column_names()
+
+
+@pytest.mark.parametrize("offset, length", [(0, 10), (0, 2), (7, 3), (2, 1)])
+def test_pyarrow_roundtrip_categorical(offset, length):
+ arr = ["Mon", "Tue", "Mon", "Wed", "Mon", "Thu", "Fri", None, "Sun"]
+ table = pa.table(
+ {"weekday": pa.array(arr).dictionary_encode()}
+ )
+ table = table.slice(offset, length)
+ result = _from_dataframe(table.__dataframe__())
+
+ assert table.equals(result)
+
+ table_protocol = table.__dataframe__()
+ result_protocol = result.__dataframe__()
+
+ assert table_protocol.num_columns() == result_protocol.num_columns()
+ assert table_protocol.num_rows() == result_protocol.num_rows()
+ assert table_protocol.num_chunks() == result_protocol.num_chunks()
+ assert table_protocol.column_names() == result_protocol.column_names()
+
+ col_table = table_protocol.get_column(0)
+ col_result = result_protocol.get_column(0)
+
+ assert col_result.dtype[0] == DtypeKind.CATEGORICAL
+ assert col_result.dtype[0] == col_table.dtype[0]
+ assert col_result.size() == col_table.size()
+ assert col_result.offset == col_table.offset
+
+    desc_cat_table = col_table.describe_categorical
+    desc_cat_result = col_result.describe_categorical
+
+ assert desc_cat_table["is_ordered"] == desc_cat_result["is_ordered"]
+ assert desc_cat_table["is_dictionary"] == desc_cat_result["is_dictionary"]
+ assert isinstance(desc_cat_result["categories"]._col, pa.Array)
+
+
+@pytest.mark.large_memory
+def test_pyarrow_roundtrip_large_string():
+
+    data = np.array([b'x'*1024]*(3*1024**2), dtype='object')  # 3 GB of data
+ arr = pa.array(data, type=pa.large_string())
+ table = pa.table([arr], names=["large_string"])
+
+ result = _from_dataframe(table.__dataframe__())
+ col = result.__dataframe__().get_column(0)
+
+ assert col.size() == 3*1024**2
+ assert pa.types.is_large_string(table[0].type)
+ assert pa.types.is_large_string(result[0].type)
+
+ assert table.equals(result)
+
+
+def test_nan_as_null():
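+    # pyarrow encodes missing values with validity bitmaps and does not
+    # support converting NaNs to nulls on export, so nan_as_null=True
+    # is expected to raise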
+ table = pa.table({"a": [1, 2, 3, 4]})
+ with pytest.raises(RuntimeError):
+ table.__dataframe__(nan_as_null=True)
+
+
+@pytest.mark.pandas
+def test_allow_copy_false():
+ if Version(pd.__version__) < Version("1.5.0"):
+ pytest.skip("__dataframe__ added to pandas in 1.5.0")
+
+ # Test that an error is raised when a copy is needed
+ # to create a bitmask
+
+ df = pd.DataFrame({"a": [0, 1.0, 2.0]})
+ with pytest.raises(RuntimeError):
+ pi.from_dataframe(df, allow_copy=False)
+
+ df = pd.DataFrame({
+ "dt": [None, dt(2007, 7, 14), dt(2007, 7, 15)]
+ })
+ with pytest.raises(RuntimeError):
+ pi.from_dataframe(df, allow_copy=False)
+
+
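+# An illustrative sketch (not part of the test suite): with the default
+# allow_copy=True the same conversions succeed, because a validity bitmap
+# may then be materialized from the NaN / sentinel representation.
+def _allow_copy_sketch():
+    df = pd.DataFrame({"a": [0, 1.0, 2.0]})
+    return pi.from_dataframe(df, allow_copy=True)
+
+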
+@pytest.mark.pandas
+def test_allow_copy_false_bool_categorical():
+ if Version(pd.__version__) < Version("1.5.0"):
+ pytest.skip("__dataframe__ added to pandas in 1.5.0")
+
+    # Test that an error is raised for boolean and categorical
+    # dtypes (a copy is always made)
+
+ df = pd.DataFrame({"a": [None, False, True]})
+ with pytest.raises(RuntimeError):
+ pi.from_dataframe(df, allow_copy=False)
+
+ df = pd.DataFrame({"a": [True, False, True]})
+ with pytest.raises(RuntimeError):
+ pi.from_dataframe(df, allow_copy=False)
+
+ df = pd.DataFrame({"weekday": ["a", "b", None]})
+ df = df.astype("category")
+ with pytest.raises(RuntimeError):
+ pi.from_dataframe(df, allow_copy=False)
+
+ df = pd.DataFrame({"weekday": ["a", "b", "c"]})
+ df = df.astype("category")
+ with pytest.raises(RuntimeError):
+ pi.from_dataframe(df, allow_copy=False)
diff --git a/python/pyarrow/tests/interchange/test_interchange_spec.py b/python/pyarrow/tests/interchange/test_interchange_spec.py
new file mode 100644
index 0000000000..42ec805359
--- /dev/null
+++ b/python/pyarrow/tests/interchange/test_interchange_spec.py
@@ -0,0 +1,243 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import ctypes
+import hypothesis as h
+import hypothesis.strategies as st
+
+import numpy as np
+import pyarrow as pa
+import pyarrow.tests.strategies as past
+import pytest
+
+
+all_types = st.deferred(
+ lambda: (
+ past.signed_integer_types |
+ past.unsigned_integer_types |
+ past.floating_types |
+ past.bool_type |
+ past.string_type |
+ past.large_string_type
+ )
+)
+
+
+# datetime is tested in test_extra.py
+# dictionary is tested in test_categorical()
+@h.given(past.arrays(all_types, size=3))
+def test_dtypes(arr):
+ table = pa.table([arr], names=["a"])
+ df = table.__dataframe__()
+
+ null_count = df.get_column(0).null_count
+ assert null_count == arr.null_count
+ assert isinstance(null_count, int)
+ assert df.get_column(0).size() == 3
+ assert df.get_column(0).offset == 0
+
+
+@pytest.mark.parametrize(
+ "uint, uint_bw",
+ [
+ (pa.uint8(), 8),
+ (pa.uint16(), 16),
+ (pa.uint32(), 32)
+ ]
+)
+@pytest.mark.parametrize(
+ "int, int_bw", [
+ (pa.int8(), 8),
+ (pa.int16(), 16),
+ (pa.int32(), 32),
+ (pa.int64(), 64)
+ ]
+)
+@pytest.mark.parametrize(
+ "float, float_bw, np_float", [
+ (pa.float16(), 16, np.float16),
+ (pa.float32(), 32, np.float32),
+ (pa.float64(), 64, np.float64)
+ ]
+)
+@pytest.mark.parametrize("unit", ['s', 'ms', 'us', 'ns'])
+@pytest.mark.parametrize("tz", ['', 'America/New_York', '+07:30', '-04:30'])
+def test_mixed_dtypes(uint, uint_bw, int, int_bw,
+ float, float_bw, np_float, unit, tz):
+ from datetime import datetime as dt
+ arr = [1, 2, 3]
+ dt_arr = [dt(2007, 7, 13), dt(2007, 7, 14), dt(2007, 7, 15)]
+ table = pa.table(
+ {
+ "a": pa.array(arr, type=uint),
+ "b": pa.array(arr, type=int),
+ "c": pa.array(np.array(arr, dtype=np_float), type=float),
+ "d": [True, False, True],
+ "e": ["a", "", "c"],
+ "f": pa.array(dt_arr, type=pa.timestamp(unit, tz=tz))
+ }
+ )
+ df = table.__dataframe__()
+ # 0 = DtypeKind.INT, 1 = DtypeKind.UINT, 2 = DtypeKind.FLOAT,
+ # 20 = DtypeKind.BOOL, 21 = DtypeKind.STRING, 22 = DtypeKind.DATETIME
+ # see DtypeKind class in column.py
+ columns = {"a": 1, "b": 0, "c": 2, "d": 20, "e": 21, "f": 22}
+
+ for column, kind in columns.items():
+ col = df.get_column_by_name(column)
+
+ assert col.null_count == 0
+ assert col.size() == 3
+ assert col.offset == 0
+ assert col.dtype[0] == kind
+
+ assert df.get_column_by_name("a").dtype[1] == uint_bw
+ assert df.get_column_by_name("b").dtype[1] == int_bw
+ assert df.get_column_by_name("c").dtype[1] == float_bw
+
+
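+# An illustrative sketch (not part of the test suite): the integer codes
+# used above come from the DtypeKind enum in pyarrow/interchange/column.py,
+# so the same checks can be written symbolically instead of with magic
+# numbers.
+def _dtype_kind_sketch():
+    from pyarrow.interchange.column import DtypeKind
+    df = pa.table({"a": pa.array([1, 2, 3], type=pa.uint8())}).__dataframe__()
+    assert df.get_column_by_name("a").dtype[0] == DtypeKind.UINT
+
+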
+def test_na_float():
+ table = pa.table({"a": [1.0, None, 2.0]})
+ df = table.__dataframe__()
+ col = df.get_column_by_name("a")
+ assert col.null_count == 1
+ assert isinstance(col.null_count, int)
+
+
+def test_noncategorical():
+ table = pa.table({"a": [1, 2, 3]})
+ df = table.__dataframe__()
+ col = df.get_column_by_name("a")
+ with pytest.raises(TypeError, match=".*categorical.*"):
+ col.describe_categorical
+
+
+def test_categorical():
+ arr = ["Mon", "Tue", "Mon", "Wed", "Mon", "Thu", "Fri", "Sat", None]
+ table = pa.table(
+ {"weekday": pa.array(arr).dictionary_encode()}
+ )
+
+ col = table.__dataframe__().get_column_by_name("weekday")
+ categorical = col.describe_categorical
+ assert isinstance(categorical["is_ordered"], bool)
+ assert isinstance(categorical["is_dictionary"], bool)
+
+
+def test_dataframe():
+ n = pa.chunked_array([[2, 2, 4], [4, 5, 100]])
+ a = pa.chunked_array([["Flamingo", "Parrot", "Cow"],
+ ["Horse", "Brittle stars", "Centipede"]])
+ table = pa.table([n, a], names=['n_legs', 'animals'])
+ df = table.__dataframe__()
+
+ assert df.num_columns() == 2
+ assert df.num_rows() == 6
+ assert df.num_chunks() == 2
+ assert list(df.column_names()) == ['n_legs', 'animals']
+ assert list(df.select_columns((1,)).column_names()) == list(
+ df.select_columns_by_name(("animals",)).column_names()
+ )
+
+
+@pytest.mark.parametrize(["size", "n_chunks"], [(10, 3), (12, 3), (12, 5)])
+def test_df_get_chunks(size, n_chunks):
+ table = pa.table({"x": list(range(size))})
+ df = table.__dataframe__()
+ chunks = list(df.get_chunks(n_chunks))
+ assert len(chunks) == n_chunks
+ assert sum(chunk.num_rows() for chunk in chunks) == size
+
+
+@pytest.mark.parametrize(["size", "n_chunks"], [(10, 3), (12, 3), (12, 5)])
+def test_column_get_chunks(size, n_chunks):
+ table = pa.table({"x": list(range(size))})
+ df = table.__dataframe__()
+ chunks = list(df.get_column(0).get_chunks(n_chunks))
+ assert len(chunks) == n_chunks
+ assert sum(chunk.size() for chunk in chunks) == size
+
+
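+# An illustrative sketch (not part of the test suite): requesting more
+# chunks than the data naturally has makes the producer subdivide its
+# chunks, so a consumer can always stream a column in its preferred
+# number of pieces.
+def _get_chunks_sketch():
+    df = pa.table({"x": list(range(10))}).__dataframe__()
+    # Three pieces that together cover all ten rows
+    return [chunk.num_rows() for chunk in df.get_chunks(3)]
+
+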
+@pytest.mark.pandas
+@pytest.mark.parametrize(
+ "uint", [pa.uint8(), pa.uint16(), pa.uint32()]
+)
+@pytest.mark.parametrize(
+ "int", [pa.int8(), pa.int16(), pa.int32(), pa.int64()]
+)
+@pytest.mark.parametrize(
+ "float, np_float", [
+ (pa.float16(), np.float16),
+ (pa.float32(), np.float32),
+ (pa.float64(), np.float64)
+ ]
+)
+def test_get_columns(uint, int, float, np_float):
+ arr = [[1, 2, 3], [4, 5]]
+ arr_float = np.array([1, 2, 3, 4, 5], dtype=np_float)
+ table = pa.table(
+ {
+ "a": pa.chunked_array(arr, type=uint),
+ "b": pa.chunked_array(arr, type=int),
+ "c": pa.array(arr_float, type=float)
+ }
+ )
+ df = table.__dataframe__()
+ for col in df.get_columns():
+ assert col.size() == 5
+ assert col.num_chunks() == 1
+
+ # 0 = DtypeKind.INT, 1 = DtypeKind.UINT, 2 = DtypeKind.FLOAT,
+ # see DtypeKind class in column.py
+ assert df.get_column(0).dtype[0] == 1 # UINT
+ assert df.get_column(1).dtype[0] == 0 # INT
+ assert df.get_column(2).dtype[0] == 2 # FLOAT
+
+
+@pytest.mark.parametrize(
+ "int", [pa.int8(), pa.int16(), pa.int32(), pa.int64()]
+)
+def test_buffer(int):
+    arr = [0, 1, -1]
+    table = pa.table({"a": pa.array(arr, type=int)})
+    df = table.__dataframe__()
+    col = df.get_column(0)
+    buf = col.get_buffers()
+
+    data_buf, data_dtype = buf["data"]
+
+    assert data_buf.bufsize > 0
+    assert data_buf.ptr != 0
+    device, _ = data_buf.__dlpack_device__()
+
+    # 0 = DtypeKind.INT
+    # see DtypeKind class in column.py
+    assert data_dtype[0] == 0
+
+    if device == 1:  # CPU-only as we're going to directly read memory here
+        bitwidth = data_dtype[1]
+        ctype = {
+            8: ctypes.c_int8,
+            16: ctypes.c_int16,
+            32: ctypes.c_int32,
+            64: ctypes.c_int64,
+        }[bitwidth]
+
+        for idx, truth in enumerate(arr):
+            val = ctype.from_address(
+                data_buf.ptr + idx * (bitwidth // 8)
+            ).value
+            assert val == truth, f"Buffer at index {idx} mismatch"
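+
+
+# An illustrative sketch (not part of the test suite): the exported pointer
+# can also be rewrapped zero-copy as a pyarrow Buffer, with the ``base``
+# argument keeping the producer's buffer alive.
+def _rewrap_buffer_sketch():
+    table = pa.table({"a": pa.array([0, 1, -1], type=pa.int64())})
+    data_buf, _ = table.__dataframe__().get_column(0).get_buffers()["data"]
+    buf = pa.foreign_buffer(data_buf.ptr, data_buf.bufsize, base=data_buf)
+    # Rebuild an int64 array over the same memory; the validity buffer is
+    # None because there are no nulls
+    return pa.Array.from_buffers(pa.int64(), 3, [None, buf])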