Posted to commits@arrow.apache.org by jo...@apache.org on 2023/01/13 10:10:53 UTC
[arrow] branch master updated: GH-33346: [Python] DataFrame Interchange Protocol for pyarrow Table (#14804)
This is an automated email from the ASF dual-hosted git repository.
jorisvandenbossche pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new a83cc852ad GH-33346: [Python] DataFrame Interchange Protocol for pyarrow Table (#14804)
a83cc852ad is described below
commit a83cc852addd4d6946ead700b0f8c95b6b14f406
Author: Alenka Frim <Al...@users.noreply.github.com>
AuthorDate: Fri Jan 13 11:10:42 2023 +0100
GH-33346: [Python] DataFrame Interchange Protocol for pyarrow Table (#14804)
This PR implements the Dataframe Interchange Protocol for `pyarrow.Table`.
See: https://data-apis.org/dataframe-protocol/latest/index.html
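For illustration, a minimal usage sketch of the two entry points this patch adds (hypothetical data):

    import pyarrow as pa
    import pyarrow.interchange as pi

    table = pa.table({"a": [1, 2, 3], "b": ["x", "y", None]})

    # Producer side: expose the table through the protocol.
    interchange_object = table.__dataframe__()
    assert interchange_object.num_rows() == 3

    # Consumer side: build a pyarrow.Table from any object implementing
    # __dataframe__ (a pa.Table is passed through unchanged).
    result = pi.from_dataframe(table)
    assert result.equals(table)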
Lead-authored-by: Alenka Frim <fr...@gmail.com>
Co-authored-by: Alenka Frim <Al...@users.noreply.github.com>
Co-authored-by: Joris Van den Bossche <jo...@gmail.com>
Signed-off-by: Joris Van den Bossche <jo...@gmail.com>
---
python/pyarrow/interchange/__init__.py | 20 +
python/pyarrow/interchange/buffer.py | 107 ++++
python/pyarrow/interchange/column.py | 527 +++++++++++++++++++
python/pyarrow/interchange/dataframe.py | 202 ++++++++
python/pyarrow/interchange/from_dataframe.py | 567 +++++++++++++++++++++
python/pyarrow/table.pxi | 30 ++
python/pyarrow/tests/interchange/__init__.py | 16 +
.../pyarrow/tests/interchange/test_conversion.py | 524 +++++++++++++++++++
.../tests/interchange/test_interchange_spec.py | 243 +++++++++
9 files changed, 2236 insertions(+)
diff --git a/python/pyarrow/interchange/__init__.py b/python/pyarrow/interchange/__init__.py
new file mode 100644
index 0000000000..7ebe59b499
--- /dev/null
+++ b/python/pyarrow/interchange/__init__.py
@@ -0,0 +1,20 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# flake8: noqa
+
+from .from_dataframe import from_dataframe
diff --git a/python/pyarrow/interchange/buffer.py b/python/pyarrow/interchange/buffer.py
new file mode 100644
index 0000000000..1f53779813
--- /dev/null
+++ b/python/pyarrow/interchange/buffer.py
@@ -0,0 +1,107 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from __future__ import annotations
+import enum
+
+import pyarrow as pa
+
+
+class DlpackDeviceType(enum.IntEnum):
+ """Integer enum for device type codes matching DLPack."""
+
+ CPU = 1
+ CUDA = 2
+ CPU_PINNED = 3
+ OPENCL = 4
+ VULKAN = 7
+ METAL = 8
+ VPI = 9
+ ROCM = 10
+
+
+class _PyArrowBuffer:
+ """
+ Data in the buffer is guaranteed to be contiguous in memory.
+
+ Note that there is no dtype attribute present; a buffer can be thought of
+ as simply a block of memory. However, if the column that the buffer is
+ attached to has a dtype that's supported by DLPack and ``__dlpack__`` is
+ implemented, then that dtype information will be contained in the return
+ value from ``__dlpack__``.
+
+ This distinction is useful to support both (a) data exchange via DLPack
+ on a buffer and (b) dtypes like variable-length strings, which do not have a
+ fixed number of bytes per element.
+ """
+
+ def __init__(self, x: pa.Buffer, allow_copy: bool = True) -> None:
+ """
+ Handle PyArrow Buffers.
+ """
+ self._x = x
+
+ @property
+ def bufsize(self) -> int:
+ """
+ Buffer size in bytes.
+ """
+ return self._x.size
+
+ @property
+ def ptr(self) -> int:
+ """
+ Pointer to start of the buffer as an integer.
+ """
+ return self._x.address
+
+ def __dlpack__(self):
+ """
+ Produce DLPack capsule (see array API standard).
+
+ Raises:
+ - TypeError : if the buffer contains unsupported dtypes.
+ - NotImplementedError : if DLPack support is not implemented
+
+ Useful for connecting to array libraries. Support is optional because
+ it's not completely trivial to implement for a Python-only library.
+ """
+ raise NotImplementedError("__dlpack__")
+
+ def __dlpack_device__(self) -> tuple[DlpackDeviceType, int | None]:
+ """
+ Device type and device ID for where the data in the buffer resides.
+ Uses device type codes matching DLPack.
+ Note: must be implemented even if ``__dlpack__`` is not.
+ """
+ if self._x.is_cpu:
+ return (DlpackDeviceType.CPU, None)
+ else:
+ raise NotImplementedError("__dlpack_device__")
+
+ def __repr__(self) -> str:
+ return (
+ "PyArrowBuffer(" +
+ str(
+ {
+ "bufsize": self.bufsize,
+ "ptr": self.ptr,
+ "device": self.__dlpack_device__()[0].name,
+ }
+ ) +
+ ")"
+ )
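A rough sketch of how the (private) _PyArrowBuffer wrapper above behaves on a plain CPU buffer; the bytes passed to pa.py_buffer are hypothetical:

    import pyarrow as pa
    from pyarrow.interchange.buffer import _PyArrowBuffer, DlpackDeviceType

    buf = _PyArrowBuffer(pa.py_buffer(b"\x01\x02\x03\x04"))
    assert buf.bufsize == 4              # size in bytes
    assert isinstance(buf.ptr, int)      # raw memory address
    # CPU buffers report the DLPack CPU device; __dlpack__ itself
    # raises NotImplementedError in this version.
    assert buf.__dlpack_device__() == (DlpackDeviceType.CPU, None)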
diff --git a/python/pyarrow/interchange/column.py b/python/pyarrow/interchange/column.py
new file mode 100644
index 0000000000..a9b8958616
--- /dev/null
+++ b/python/pyarrow/interchange/column.py
@@ -0,0 +1,527 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from __future__ import annotations
+
+import enum
+from typing import (
+ Any,
+ Dict,
+ Iterable,
+ Optional,
+ Tuple,
+)
+
+import sys
+if sys.version_info >= (3, 8):
+ from typing import TypedDict
+else:
+ from typing_extensions import TypedDict
+
+import pyarrow as pa
+import pyarrow.compute as pc
+from pyarrow.interchange.buffer import _PyArrowBuffer
+
+
+class DtypeKind(enum.IntEnum):
+ """
+ Integer enum for data types.
+
+ Attributes
+ ----------
+ INT : int
+ Matches to signed integer data type.
+ UINT : int
+ Matches to unsigned integer data type.
+ FLOAT : int
+ Matches to floating point data type.
+ BOOL : int
+ Matches to boolean data type.
+ STRING : int
+ Matches to string data type (UTF-8 encoded).
+ DATETIME : int
+ Matches to datetime data type.
+ CATEGORICAL : int
+ Matches to categorical data type.
+ """
+
+ INT = 0
+ UINT = 1
+ FLOAT = 2
+ BOOL = 20
+ STRING = 21 # UTF-8
+ DATETIME = 22
+ CATEGORICAL = 23
+
+
+Dtype = Tuple[DtypeKind, int, str, str] # see Column.dtype
+
+
+_PYARROW_KINDS = {
+ pa.int8(): (DtypeKind.INT, "c"),
+ pa.int16(): (DtypeKind.INT, "s"),
+ pa.int32(): (DtypeKind.INT, "i"),
+ pa.int64(): (DtypeKind.INT, "l"),
+ pa.uint8(): (DtypeKind.UINT, "C"),
+ pa.uint16(): (DtypeKind.UINT, "S"),
+ pa.uint32(): (DtypeKind.UINT, "I"),
+ pa.uint64(): (DtypeKind.UINT, "L"),
+ pa.float16(): (DtypeKind.FLOAT, "e"),
+ pa.float32(): (DtypeKind.FLOAT, "f"),
+ pa.float64(): (DtypeKind.FLOAT, "g"),
+ pa.bool_(): (DtypeKind.BOOL, "b"),
+ pa.string(): (DtypeKind.STRING, "u"),
+ pa.large_string(): (DtypeKind.STRING, "U"),
+}
+
+
+class ColumnNullType(enum.IntEnum):
+ """
+ Integer enum for null type representation.
+
+ Attributes
+ ----------
+ NON_NULLABLE : int
+ Non-nullable column.
+ USE_NAN : int
+ Use explicit float NaN value.
+ USE_SENTINEL : int
+ Sentinel value besides NaN.
+ USE_BITMASK : int
+ The bit is set/unset representing a null on a certain position.
+ USE_BYTEMASK : int
+ The byte is set/unset representing a null on a certain position.
+ """
+
+ NON_NULLABLE = 0
+ USE_NAN = 1
+ USE_SENTINEL = 2
+ USE_BITMASK = 3
+ USE_BYTEMASK = 4
+
+
+class ColumnBuffers(TypedDict):
+ # first element is a buffer containing the column data;
+ # second element is the data buffer's associated dtype
+ data: Tuple[_PyArrowBuffer, Dtype]
+
+ # first element is a buffer containing mask values indicating missing data;
+ # second element is the mask value buffer's associated dtype.
+ # None if the null representation is not a bit or byte mask
+ validity: Optional[Tuple[_PyArrowBuffer, Dtype]]
+
+ # first element is a buffer containing the offset values for
+ # variable-size binary data (e.g., variable-length strings);
+ # second element is the offsets buffer's associated dtype.
+ # None if the data buffer does not have an associated offsets buffer
+ offsets: Optional[Tuple[_PyArrowBuffer, Dtype]]
+
+
+class CategoricalDescription(TypedDict):
+ # whether the ordering of dictionary indices is semantically meaningful
+ is_ordered: bool
+ # whether a dictionary-style mapping of categorical values to other objects
+ # exists
+ is_dictionary: bool
+ # Python-level only (e.g. ``{int: str}``).
+ # None if not a dictionary-style categorical.
+ categories: Optional[_PyArrowColumn]
+
+
+class Endianness:
+ """Enum indicating the byte-order of a data-type."""
+
+ LITTLE = "<"
+ BIG = ">"
+ NATIVE = "="
+ NA = "|"
+
+
+class NoBufferPresent(Exception):
+ """Exception to signal that there is no requested buffer."""
+
+
+class _PyArrowColumn:
+ """
+ A column object, with only the methods and properties required by the
+ interchange protocol defined.
+
+ A column can contain one or more chunks. Each chunk can contain up to three
+ buffers - a data buffer, a mask buffer (depending on null representation),
+ and an offsets buffer (if variable-size binary; e.g., variable-length
+ strings).
+
+ TBD: Arrow has a separate "null" dtype, and has no separate mask concept.
+ Instead, it seems to use "children" for both columns with a bit mask,
+ and for nested dtypes. Unclear whether this is elegant or confusing.
+ This design requires checking the null representation explicitly.
+
+ The Arrow design requires checking:
+ 1. the ARROW_FLAG_NULLABLE (for sentinel values)
+ 2. if a column has two children, combined with one of those children
+ having a null dtype.
+
+ Making the mask concept explicit seems useful. One null dtype would
+ not be enough to cover both bit and byte masks, so that would mean
+ even more checking if we did it the Arrow way.
+
+ TBD: there's also the "chunk" concept here, which is implicit in Arrow as
+ multiple buffers per array (= column here). Semantically it may make
+ sense to have both: chunks were meant for example for lazy evaluation
+ of data which doesn't fit in memory, while multiple buffers per column
+ could also come from doing a selection operation on a single
+ contiguous buffer.
+
+ Given these concepts, one would expect chunks to be all of the same
+ size (say a 10,000 row dataframe could have 10 chunks of 1,000 rows),
+ while multiple buffers could have data-dependent lengths. Not an issue
+ in pandas if one column is backed by a single NumPy array, but in
+ Arrow it seems possible.
+ Are multiple chunks *and* multiple buffers per column necessary for
+ the purposes of this interchange protocol, or must producers either
+ reuse the chunk concept for this or copy the data?
+
+ Note: this Column object can only be produced by ``__dataframe__``, so
+ doesn't need its own version or ``__column__`` protocol.
+ """
+
+ def __init__(
+ self, column: pa.Array | pa.ChunkedArray, allow_copy: bool = True
+ ) -> None:
+ """
+ Handles PyArrow Arrays and ChunkedArrays.
+ """
+ # Store the column as a private attribute
+ if isinstance(column, pa.ChunkedArray):
+ if column.num_chunks == 1:
+ column = column.chunk(0)
+ else:
+ if not allow_copy:
+ raise RuntimeError(
+ "Chunks will be combined and a copy is required which "
+ "is forbidden by allow_copy=False"
+ )
+ column = column.combine_chunks()
+
+ self._allow_copy = allow_copy
+
+ if pa.types.is_boolean(column.type):
+ if not allow_copy:
+ raise RuntimeError(
+ "Boolean column will be casted to uint8 and a copy "
+ "is required which is forbidden by allow_copy=False"
+ )
+ self._dtype = self._dtype_from_arrowdtype(column.type, 8)
+ self._col = pc.cast(column, pa.uint8())
+ else:
+ self._col = column
+ dtype = self._col.type
+ try:
+ bit_width = dtype.bit_width
+ except ValueError:
+ # in case of variable-length strings, considered as an array
+ # of bytes (8 bits)
+ bit_width = 8
+ self._dtype = self._dtype_from_arrowdtype(dtype, bit_width)
+
+ def size(self) -> int:
+ """
+ Size of the column, in elements.
+
+ Corresponds to DataFrame.num_rows() if column is a single chunk;
+ equal to size of this current chunk otherwise.
+
+ Is a method rather than a property because it may cause a (potentially
+ expensive) computation for some dataframe implementations.
+ """
+ return len(self._col)
+
+ @property
+ def offset(self) -> int:
+ """
+ Offset of first element.
+
+ May be > 0 if using chunks; for example for a column with N chunks of
+ equal size M (only the last chunk may be shorter),
+ ``offset = n * M``, ``n = 0 .. N-1``.
+ """
+ return self._col.offset
+
+ @property
+ def dtype(self) -> Tuple[DtypeKind, int, str, str]:
+ """
+ Dtype description as a tuple ``(kind, bit-width, format string,
+ endianness)``.
+
+ Bit-width : the number of bits as an integer
+ Format string : data type description format string in Apache Arrow C
+ Data Interface format.
+ Endianness : currently only native endianness (``=``) is supported
+
+ Notes:
+ - Kind specifiers are aligned with DLPack where possible (hence the
+ jump to 20, leaving enough room for future extensions)
+ - Masks must be specified as boolean with either bit width 1 (for
+ bit masks) or 8 (for byte masks).
+ - Dtype width in bits was preferred over bytes
+ - Endianness isn't too useful, but included now in case in the
+ future we need to support non-native endianness
+ - Went with Apache Arrow format strings over NumPy format strings
+ because they're more complete from a dataframe perspective
+ - Format strings are mostly useful for datetime specification, and
+ for categoricals.
+ - For categoricals, the format string describes the type of the
+ categorical in the data buffer. In case of a separate encoding of
+ the categorical (e.g. an integer to string mapping), this can
+ be derived from ``self.describe_categorical``.
+ - Data types not included: complex, Arrow-style null, binary,
+ decimal, and nested (list, struct, map, union) dtypes.
+ """
+ return self._dtype
+
+ def _dtype_from_arrowdtype(
+ self, dtype: pa.DataType, bit_width: int
+ ) -> Tuple[DtypeKind, int, str, str]:
+ """
+ See `self.dtype` for details.
+ """
+ # Note: 'c' (complex) not handled yet (not in array spec v1).
+ # 'b', 'B' (bytes), 'S', 'a' (old-style string), 'V' (void)
+ # not handled; datetime and timedelta both map to datetime
+ # (is timedelta handled?)
+
+ if pa.types.is_timestamp(dtype):
+ kind = DtypeKind.DATETIME
+ ts = dtype.unit[0]
+ tz = dtype.tz if dtype.tz else ""
+ f_string = "ts{ts}:{tz}".format(ts=ts, tz=tz)
+ return kind, bit_width, f_string, Endianness.NATIVE
+ elif pa.types.is_dictionary(dtype):
+ kind = DtypeKind.CATEGORICAL
+ f_string = "L"
+ return kind, bit_width, f_string, Endianness.NATIVE
+ else:
+ kind, f_string = _PYARROW_KINDS.get(dtype, (None, None))
+ if kind is None:
+ raise ValueError(
+ f"Data type {dtype} not supported by interchange protocol")
+
+ return kind, bit_width, f_string, Endianness.NATIVE
+
+ @property
+ def describe_categorical(self) -> CategoricalDescription:
+ """
+ If the dtype is categorical, there are two options:
+ - There are only values in the data buffer.
+ - There is a separate non-categorical Column encoding categorical
+ values.
+
+ Raises TypeError if the dtype is not categorical
+
+ Returns a dictionary describing how to interpret the
+ data buffer:
+ - "is_ordered" : bool, whether the ordering of dictionary indices
+ is semantically meaningful.
+ - "is_dictionary" : bool, whether a mapping of
+ categorical values to other objects exists
+ - "categories" : Column representing the (implicit) mapping of
+ indices to category values (e.g. an array of
+ cat1, cat2, ...). None if not a dictionary-style
+ categorical.
+
+ TBD: are there any other in-memory representations that are needed?
+ """
+ arr = self._col
+ if not pa.types.is_dictionary(arr.type):
+ raise TypeError(
+ "describe_categorical only works on a column with "
+ "categorical dtype!"
+ )
+
+ return {
+ "is_ordered": self._col.type.ordered,
+ "is_dictionary": True,
+ "categories": _PyArrowColumn(arr.dictionary),
+ }
+
+ @property
+ def describe_null(self) -> Tuple[ColumnNullType, Any]:
+ """
+ Return the missing value (or "null") representation the column dtype
+ uses, as a tuple ``(kind, value)``.
+
+ Value : if kind is "sentinel value", the actual value. If kind is a bit
+ mask or a byte mask, the value (0 or 1) indicating a missing value.
+ None otherwise.
+ """
+ # In case of no missing values, we need to set ColumnNullType to
+ # NON_NULLABLE, as in the current __dataframe__ protocol bit/byte
+ # masks cannot be None
+ if self.null_count == 0:
+ return ColumnNullType.NON_NULLABLE, None
+ else:
+ return ColumnNullType.USE_BITMASK, 0
+
+ @property
+ def null_count(self) -> int:
+ """
+ Number of null elements, if known.
+
+ Note: Arrow uses -1 to indicate "unknown", but None seems cleaner.
+ """
+ arrow_null_count = self._col.null_count
+ n = arrow_null_count if arrow_null_count != -1 else None
+ return n
+
+ @property
+ def metadata(self) -> Dict[str, Any]:
+ """
+ The metadata for the column. See `DataFrame.metadata` for more details.
+ """
+ pass
+
+ def num_chunks(self) -> int:
+ """
+ Return the number of chunks the column consists of.
+ """
+ return 1
+
+ def get_chunks(
+ self, n_chunks: Optional[int] = None
+ ) -> Iterable[_PyArrowColumn]:
+ """
+ Return an iterator yielding the chunks.
+
+ See `DataFrame.get_chunks` for details on ``n_chunks``.
+ """
+ if n_chunks and n_chunks > 1:
+ chunk_size = self.size() // n_chunks
+ if self.size() % n_chunks != 0:
+ chunk_size += 1
+
+ array = self._col
+ for start in range(0, chunk_size * n_chunks, chunk_size):
+ yield _PyArrowColumn(
+ array.slice(start, chunk_size), self._allow_copy
+ )
+ else:
+ yield self
+
+ def get_buffers(self) -> ColumnBuffers:
+ """
+ Return a dictionary containing the underlying buffers.
+
+ The returned dictionary has the following contents:
+
+ - "data": a two-element tuple whose first element is a buffer
+ containing the data and whose second element is the data
+ buffer's associated dtype.
+ - "validity": a two-element tuple whose first element is a buffer
+ containing mask values indicating missing data and
+ whose second element is the mask value buffer's
+ associated dtype. None if the null representation is
+ not a bit or byte mask.
+ - "offsets": a two-element tuple whose first element is a buffer
+ containing the offset values for variable-size binary
+ data (e.g., variable-length strings) and whose second
+ element is the offsets buffer's associated dtype. None
+ if the data buffer does not have an associated offsets
+ buffer.
+ """
+ buffers: ColumnBuffers = {
+ "data": self._get_data_buffer(),
+ "validity": None,
+ "offsets": None,
+ }
+
+ try:
+ buffers["validity"] = self._get_validity_buffer()
+ except NoBufferPresent:
+ pass
+
+ try:
+ buffers["offsets"] = self._get_offsets_buffer()
+ except NoBufferPresent:
+ pass
+
+ return buffers
+
+ def _get_data_buffer(
+ self,
+ ) -> Tuple[_PyArrowBuffer, Any]: # Any is for self.dtype tuple
+ """
+ Return the buffer containing the data and the buffer's
+ associated dtype.
+ """
+ array = self._col
+ dtype = self.dtype
+
+ # In case of dictionary arrays, use the indices
+ # to define the data buffer; the categories are
+ # transferred through describe_categorical
+ if pa.types.is_dictionary(array.type):
+ array = array.indices
+ dtype = _PyArrowColumn(array).dtype
+
+ n = len(array.buffers())
+ if n == 2:
+ return _PyArrowBuffer(array.buffers()[1]), dtype
+ elif n == 3:
+ return _PyArrowBuffer(array.buffers()[2]), dtype
+
+ def _get_validity_buffer(self) -> Tuple[_PyArrowBuffer, Any]:
+ """
+ Return the buffer containing the mask values indicating missing data
+ and the buffer's associated dtype.
+ Raises NoBufferPresent if null representation is not a bit or byte
+ mask.
+ """
+ # Define the dtype of the returned buffer
+ dtype = (DtypeKind.BOOL, 1, "b", Endianness.NATIVE)
+ array = self._col
+ buff = array.buffers()[0]
+ if buff:
+ return _PyArrowBuffer(buff), dtype
+ else:
+ raise NoBufferPresent(
+ "There are no missing values so "
+ "does not have a separate mask")
+
+ def _get_offsets_buffer(self) -> Tuple[_PyArrowBuffer, Any]:
+ """
+ Return the buffer containing the offset values for variable-size binary
+ data (e.g., variable-length strings) and the buffer's associated dtype.
+ Raises NoBufferPresent if the data buffer does not have an associated
+ offsets buffer.
+ """
+ array = self._col
+ n = len(array.buffers())
+ if n == 2:
+ raise NoBufferPresent(
+ "This column has a fixed-length dtype so "
+ "it does not have an offsets buffer"
+ )
+ elif n == 3:
+ # Define the dtype of the returned buffer
+ dtype = self._col.type
+ if pa.types.is_large_string(dtype):
+ dtype = (DtypeKind.INT, 64, "l", Endianness.NATIVE)
+ else:
+ dtype = (DtypeKind.INT, 32, "i", Endianness.NATIVE)
+ return _PyArrowBuffer(array.buffers()[1]), dtype
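A short sketch of _PyArrowColumn on a float array with one null (hypothetical data; in normal use the column is obtained via __dataframe__):

    import pyarrow as pa
    from pyarrow.interchange.column import _PyArrowColumn, DtypeKind

    col = _PyArrowColumn(pa.array([1.5, None, 3.0]))

    kind, bit_width, fmt, endianness = col.dtype
    assert kind == DtypeKind.FLOAT and bit_width == 64 and fmt == "g"
    assert col.null_count == 1

    buffers = col.get_buffers()
    data_buf, data_dtype = buffers["data"]  # the values buffer
    assert buffers["validity"] is not None  # bit mask, since a null is present
    assert buffers["offsets"] is None       # fixed-width dtype, no offsets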
diff --git a/python/pyarrow/interchange/dataframe.py b/python/pyarrow/interchange/dataframe.py
new file mode 100644
index 0000000000..d0717e02e8
--- /dev/null
+++ b/python/pyarrow/interchange/dataframe.py
@@ -0,0 +1,202 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from __future__ import annotations
+from typing import (
+ Any,
+ Iterable,
+ Optional,
+ Sequence,
+)
+
+import pyarrow as pa
+
+from pyarrow.interchange.column import _PyArrowColumn
+
+
+class _PyArrowDataFrame:
+ """
+ A data frame class, with only the methods required by the interchange
+ protocol defined.
+
+ A "data frame" represents an ordered collection of named columns.
+ A column's "name" must be a unique string.
+ Columns may be accessed by name or by position.
+
+ This could be a public data frame class, or an object with the methods and
+ attributes defined on this DataFrame class could be returned from the
+ ``__dataframe__`` method of a public data frame class in a library adhering
+ to the dataframe interchange protocol specification.
+ """
+
+ def __init__(
+ self, df: pa.Table, nan_as_null: bool = False, allow_copy: bool = True
+ ) -> None:
+ """
+ Constructor - an instance of this (private) class is returned from
+ `pa.Table.__dataframe__`.
+ """
+ self._df = df
+ # ``nan_as_null`` is a keyword intended for the consumer to tell the
+ # producer to overwrite null values in the data with ``NaN`` (or
+ # ``NaT``).
+ if nan_as_null is True:
+ raise RuntimeError(
+ "nan_as_null=True currently has no effect, "
+ "use the default nan_as_null=False"
+ )
+ self._nan_as_null = nan_as_null
+ self._allow_copy = allow_copy
+
+ def __dataframe__(
+ self, nan_as_null: bool = False, allow_copy: bool = True
+ ) -> _PyArrowDataFrame:
+ """
+ Construct a new exchange object, potentially changing the parameters.
+ ``nan_as_null`` is a keyword intended for the consumer to tell the
+ producer to overwrite null values in the data with ``NaN``.
+ It is intended for cases where the consumer does not support the bit
+ mask or byte mask that is the producer's native representation.
+ ``allow_copy`` is a keyword that defines whether or not the library is
+ allowed to make a copy of the data. For example, copying data would be
+ necessary if a library supports strided buffers, given that this
+ protocol specifies contiguous buffers.
+ """
+ return _PyArrowDataFrame(self._df, nan_as_null, allow_copy)
+
+ @property
+ def metadata(self) -> dict[str, Any]:
+ """
+ The metadata for the data frame, as a dictionary with string keys. The
+ contents of `metadata` may be anything, they are meant for a library
+ to store information that it needs to, e.g., roundtrip losslessly or
+ for two implementations to share data that is not (yet) part of the
+ interchange protocol specification. To avoid collisions with other
+ entries, please name the keys with the name of the library
+ followed by a period and the desired name, e.g., ``pandas.indexcol``.
+ """
+ # The metadata for the data frame, as a dictionary with string keys.
+ # Add schema metadata here (pandas metadata or custom metadata)
+ if self._df.schema.metadata:
+ schema_metadata = {"pyarrow." + k.decode('utf8'): v.decode('utf8')
+ for k, v in self._df.schema.metadata.items()}
+ return schema_metadata
+ else:
+ return {}
+
+ def num_columns(self) -> int:
+ """
+ Return the number of columns in the DataFrame.
+ """
+ return self._df.num_columns
+
+ def num_rows(self) -> int:
+ """
+ Return the number of rows in the DataFrame, if available.
+ """
+ return self._df.num_rows
+
+ def num_chunks(self) -> int:
+ """
+ Return the number of chunks the DataFrame consists of.
+ """
+ # A pyarrow.Table can have columns with different numbers
+ # of chunks, so we take the number of batches that
+ # .to_batches() returns, as it uses the minimum chunk size
+ # across all the columns (to_batches is a zero-copy method)
+ batches = self._df.to_batches()
+ return len(batches)
+
+ def column_names(self) -> Iterable[str]:
+ """
+ Return an iterator yielding the column names.
+ """
+ return self._df.column_names
+
+ def get_column(self, i: int) -> _PyArrowColumn:
+ """
+ Return the column at the indicated position.
+ """
+ return _PyArrowColumn(self._df.column(i),
+ allow_copy=self._allow_copy)
+
+ def get_column_by_name(self, name: str) -> _PyArrowColumn:
+ """
+ Return the column whose name is the indicated name.
+ """
+ return _PyArrowColumn(self._df.column(name),
+ allow_copy=self._allow_copy)
+
+ def get_columns(self) -> Iterable[_PyArrowColumn]:
+ """
+ Return an iterator yielding the columns.
+ """
+ return [
+ _PyArrowColumn(col, allow_copy=self._allow_copy)
+ for col in self._df.columns
+ ]
+
+ def select_columns(self, indices: Sequence[int]) -> _PyArrowDataFrame:
+ """
+ Create a new DataFrame by selecting a subset of columns by index.
+ """
+ return _PyArrowDataFrame(
+ self._df.select(list(indices)), self._nan_as_null, self._allow_copy
+ )
+
+ def select_columns_by_name(
+ self, names: Sequence[str]
+ ) -> _PyArrowDataFrame:
+ """
+ Create a new DataFrame by selecting a subset of columns by name.
+ """
+ return _PyArrowDataFrame(
+ self._df.select(list(names)), self._nan_as_null, self._allow_copy
+ )
+
+ def get_chunks(
+ self, n_chunks: Optional[int] = None
+ ) -> Iterable[_PyArrowDataFrame]:
+ """
+ Return an iterator yielding the chunks.
+
+ By default (None), yields the chunks that the data is stored as by the
+ producer. If given, ``n_chunks`` must be a multiple of
+ ``self.num_chunks()``, meaning the producer must subdivide each chunk
+ before yielding it.
+
+ Note that the producer must ensure that all columns are chunked the
+ same way.
+ """
+ if n_chunks and n_chunks > 1:
+ chunk_size = self.num_rows() // n_chunks
+ if self.num_rows() % n_chunks != 0:
+ chunk_size += 1
+ batches = self._df.to_batches(max_chunksize=chunk_size)
+ # In case the chunk size is such that the resulting list
+ # is one chunk short of n_chunks, append an empty chunk
+ if len(batches) == n_chunks - 1:
+ batches.append(pa.record_batch([[]], schema=self._df.schema))
+ else:
+ batches = self._df.to_batches()
+
+ iterator_tables = [_PyArrowDataFrame(
+ pa.Table.from_batches([batch]), self._nan_as_null, self._allow_copy
+ )
+ for batch in batches
+ ]
+ return iterator_tables
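A sketch of the chunk subdivision implemented above, constructing _PyArrowDataFrame directly (hypothetical data; normally obtained via Table.__dataframe__):

    import pyarrow as pa
    from pyarrow.interchange.dataframe import _PyArrowDataFrame

    table = pa.table({"a": [1, 2, 3, 4], "b": [0.5, 1.5, 2.5, 3.5]})
    df = _PyArrowDataFrame(table)

    assert df.num_columns() == 2 and df.num_rows() == 4
    assert list(df.column_names()) == ["a", "b"]

    # Ask the producer to subdivide the data into two chunks of two rows.
    chunks = list(df.get_chunks(n_chunks=2))
    assert len(chunks) == 2
    assert all(chunk.num_rows() == 2 for chunk in chunks)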
diff --git a/python/pyarrow/interchange/from_dataframe.py b/python/pyarrow/interchange/from_dataframe.py
new file mode 100644
index 0000000000..204530a335
--- /dev/null
+++ b/python/pyarrow/interchange/from_dataframe.py
@@ -0,0 +1,567 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from __future__ import annotations
+
+from typing import (
+ Any,
+)
+
+from pyarrow.interchange.column import (
+ DtypeKind,
+ ColumnBuffers,
+ ColumnNullType,
+)
+
+import pyarrow as pa
+import re
+
+import pyarrow.compute as pc
+from pyarrow.interchange.column import Dtype
+
+
+# A typing protocol could be added later to let Mypy validate code using
+# `from_dataframe` better.
+DataFrameObject = Any
+ColumnObject = Any
+BufferObject = Any
+
+
+_PYARROW_DTYPES: dict[DtypeKind, dict[int, Any]] = {
+ DtypeKind.INT: {8: pa.int8(),
+ 16: pa.int16(),
+ 32: pa.int32(),
+ 64: pa.int64()},
+ DtypeKind.UINT: {8: pa.uint8(),
+ 16: pa.uint16(),
+ 32: pa.uint32(),
+ 64: pa.uint64()},
+ DtypeKind.FLOAT: {16: pa.float16(),
+ 32: pa.float32(),
+ 64: pa.float64()},
+ DtypeKind.BOOL: {8: pa.uint8()},
+ DtypeKind.STRING: {8: pa.string()},
+}
+
+
+def from_dataframe(df: DataFrameObject, allow_copy=True) -> pa.Table:
+ """
+ Build a ``pa.Table`` from any DataFrame supporting the interchange
+ protocol.
+
+ Parameters
+ ----------
+ df : DataFrameObject
+ Object supporting the interchange protocol, i.e. `__dataframe__`
+ method.
+ allow_copy : bool, default: True
+ Whether to allow copying the memory to perform the conversion
+ (if false then zero-copy approach is requested).
+
+ Returns
+ -------
+ pa.Table
+ """
+ if isinstance(df, pa.Table):
+ return df
+
+ if not hasattr(df, "__dataframe__"):
+ raise ValueError("`df` does not support __dataframe__")
+
+ return _from_dataframe(df.__dataframe__(allow_copy=allow_copy),
+ allow_copy=allow_copy)
+
+
+def _from_dataframe(df: DataFrameObject, allow_copy=True):
+ """
+ Build a ``pa.Table`` from the DataFrame interchange object.
+
+ Parameters
+ ----------
+ df : DataFrameObject
+ Object supporting the interchange protocol, i.e. `__dataframe__`
+ method.
+ allow_copy : bool, default: True
+ Whether to allow copying the memory to perform the conversion
+ (if false then zero-copy approach is requested).
+
+ Returns
+ -------
+ pa.Table
+ """
+ batches = []
+ for chunk in df.get_chunks():
+ batch = protocol_df_chunk_to_pyarrow(chunk, allow_copy)
+ batches.append(batch)
+
+ table = pa.Table.from_batches(batches)
+ return table
+
+
+def protocol_df_chunk_to_pyarrow(
+ df: DataFrameObject,
+ allow_copy: bool = True
+) -> pa.RecordBatch:
+ """
+ Convert interchange protocol chunk to ``pa.RecordBatch``.
+
+ Parameters
+ ----------
+ df : DataFrameObject
+ Object supporting the interchange protocol, i.e. `__dataframe__`
+ method.
+ allow_copy : bool, default: True
+ Whether to allow copying the memory to perform the conversion
+ (if false then zero-copy approach is requested).
+
+ Returns
+ -------
+ pa.RecordBatch
+ """
+ # We need a dict of columns here, with each column being a pa.Array
+ columns: dict[str, pa.Array] = {}
+ for name in df.column_names():
+ if not isinstance(name, str):
+ raise ValueError(f"Column {name} is not a string")
+ if name in columns:
+ raise ValueError(f"Column {name} is not unique")
+ col = df.get_column_by_name(name)
+ dtype = col.dtype[0]
+ if dtype in (
+ DtypeKind.INT,
+ DtypeKind.UINT,
+ DtypeKind.FLOAT,
+ DtypeKind.STRING,
+ DtypeKind.DATETIME,
+ ):
+ columns[name] = column_to_array(col, allow_copy)
+ elif dtype == DtypeKind.BOOL:
+ columns[name] = bool_column_to_array(col, allow_copy)
+ elif dtype == DtypeKind.CATEGORICAL:
+ columns[name] = categorical_column_to_dictionary(col, allow_copy)
+ else:
+ raise NotImplementedError(f"Data type {dtype} not handled yet")
+
+ return pa.RecordBatch.from_pydict(columns)
+
+
+def column_to_array(
+ col: ColumnObject,
+ allow_copy: bool = True,
+) -> pa.Array:
+ """
+ Convert a column holding one of the primitive dtypes to a PyArrow array.
+ A primitive type is one of: int, uint, float, bool (1 bit).
+
+ Parameters
+ ----------
+ col : ColumnObject
+ allow_copy : bool, default: True
+ Whether to allow copying the memory to perform the conversion
+ (if false then zero-copy approach is requested).
+
+ Returns
+ -------
+ pa.Array
+ """
+ buffers = col.get_buffers()
+ data = buffers_to_array(buffers, col.size(),
+ col.describe_null,
+ col.offset,
+ allow_copy)
+ return data
+
+
+def bool_column_to_array(
+ col: ColumnObject,
+ allow_copy: bool = True,
+) -> pa.Array:
+ """
+ Convert a column holding boolean dtype to a PyArrow array.
+
+ Parameters
+ ----------
+ col : ColumnObject
+ allow_copy : bool, default: True
+ Whether to allow copying the memory to perform the conversion
+ (if false then zero-copy approach is requested).
+
+ Returns
+ -------
+ pa.Array
+ """
+ if not allow_copy:
+ raise RuntimeError(
+ "Boolean column will be casted from uint8 and a copy "
+ "is required which is forbidden by allow_copy=False"
+ )
+
+ buffers = col.get_buffers()
+ data = buffers_to_array(buffers, col.size(),
+ col.describe_null,
+ col.offset)
+ data = pc.cast(data, pa.bool_())
+
+ return data
+
+
+def categorical_column_to_dictionary(
+ col: ColumnObject,
+ allow_copy: bool = True,
+) -> pa.DictionaryArray:
+ """
+ Convert a column holding categorical data to a pa.DictionaryArray.
+
+ Parameters
+ ----------
+ col : ColumnObject
+ allow_copy : bool, default: True
+ Whether to allow copying the memory to perform the conversion
+ (if false then zero-copy approach is requested).
+
+ Returns
+ -------
+ pa.DictionaryArray
+ """
+ if not allow_copy:
+ raise RuntimeError(
+ "Categorical column will be casted from uint8 and a copy "
+ "is required which is forbidden by allow_copy=False"
+ )
+
+ categorical = col.describe_categorical
+
+ if not categorical["is_dictionary"]:
+ raise NotImplementedError(
+ "Non-dictionary categoricals not supported yet")
+
+ cat_column = categorical["categories"]
+ dictionary = column_to_array(cat_column)
+
+ buffers = col.get_buffers()
+ indices = buffers_to_array(buffers, col.size(),
+ col.describe_null,
+ col.offset)
+
+ # Constructing a pa.DictionaryArray
+ dict_array = pa.DictionaryArray.from_arrays(indices, dictionary)
+
+ return dict_array
+
+
+def parse_datetime_format_str(format_str):
+ """Parse datetime `format_str` to interpret the `data`."""
+
+ # timestamp 'ts{unit}:tz'
+ timestamp_meta = re.match(r"ts([smun]):(.*)", format_str)
+ if timestamp_meta:
+ unit, tz = timestamp_meta.group(1), timestamp_meta.group(2)
+ if unit != "s":
+ # the format string describes only the first letter of the unit, so
+ # add one extra letter to convert the unit to numpy-style:
+ # 'm' -> 'ms', 'u' -> 'us', 'n' -> 'ns'
+ unit += "s"
+
+ return unit, tz
+
+ raise NotImplementedError(f"DateTime kind is not supported: {format_str}")
+
+
+def map_date_type(data_type):
+ """Map column date type to pyarrow date type. """
+ kind, bit_width, f_string, _ = data_type
+
+ if kind == DtypeKind.DATETIME:
+ unit, tz = parse_datetime_format_str(f_string)
+ return pa.timestamp(unit, tz=tz)
+ else:
+ pa_dtype = _PYARROW_DTYPES.get(kind, {}).get(bit_width, None)
+
+ # Error if dtype is not supported
+ if pa_dtype:
+ return pa_dtype
+ else:
+ raise NotImplementedError(
+ f"Conversion for {data_type} is not yet supported.")
+
+
+def buffers_to_array(
+ buffers: ColumnBuffers,
+ length: int,
+ describe_null: ColumnNullType,
+ offset: int = 0,
+ allow_copy: bool = True,
+) -> pa.Array:
+ """
+ Build a PyArrow array from the passed buffers.
+
+ Parameters
+ ----------
+ buffers : ColumnBuffers
+ Dictionary containing tuples of underlying buffers and
+ their associated dtypes.
+ length : int
+ The number of values in the array.
+ describe_null : ColumnNullType
+ Null representation the column dtype uses,
+ as a tuple ``(kind, value)``.
+ offset : int, default: 0
+ Number of elements to offset from the start of the buffer.
+ allow_copy : bool, default: True
+ Whether to allow copying the memory to perform the conversion
+ (if false then zero-copy approach is requested).
+
+ Returns
+ -------
+ pa.Array
+
+ Notes
+ -----
+ The returned array doesn't own the memory. The caller of this function
+ is responsible for keeping the memory owner object alive as long as
+ the returned PyArrow array is being used.
+ """
+ data_buff, data_type = buffers["data"]
+ try:
+ validity_buff, validity_dtype = buffers["validity"]
+ except TypeError:
+ validity_buff = None
+ try:
+ offset_buff, offset_dtype = buffers["offsets"]
+ except TypeError:
+ offset_buff = None
+
+ # Construct a pyarrow Buffer
+ data_pa_buffer = pa.foreign_buffer(data_buff.ptr, data_buff.bufsize,
+ base=data_buff)
+
+ # Construct a validity pyarrow Buffer, if applicable
+ if validity_buff:
+ validity_pa_buff = validity_buffer_from_mask(validity_buff,
+ validity_dtype,
+ describe_null,
+ length,
+ offset,
+ allow_copy)
+ else:
+ validity_pa_buff = validity_buffer_nan_sentinel(data_pa_buffer,
+ data_type,
+ describe_null,
+ length,
+ offset,
+ allow_copy)
+
+ # Construct a pyarrow Array from buffers
+ data_dtype = map_date_type(data_type)
+
+ if offset_buff:
+ _, offset_bit_width, _, _ = offset_dtype
+ # If an offset buffer exists, construct an offset pyarrow Buffer
+ # and add it to the construction of an array
+ offset_pa_buffer = pa.foreign_buffer(offset_buff.ptr,
+ offset_buff.bufsize,
+ base=offset_buff)
+
+ if data_type[2] == 'U':
+ string_type = pa.large_string()
+ else:
+ if offset_bit_width == 64:
+ string_type = pa.large_string()
+ else:
+ string_type = pa.string()
+ array = pa.Array.from_buffers(
+ string_type,
+ length,
+ [validity_pa_buff, offset_pa_buffer, data_pa_buffer],
+ offset=offset,
+ )
+ else:
+ array = pa.Array.from_buffers(
+ data_dtype,
+ length,
+ [validity_pa_buff, data_pa_buffer],
+ offset=offset,
+ )
+
+ return array
+
+
+def validity_buffer_from_mask(
+ validity_buff: BufferObject,
+ validity_dtype: Dtype,
+ describe_null: ColumnNullType,
+ length: int,
+ offset: int = 0,
+ allow_copy: bool = True,
+) -> pa.Buffer:
+ """
+ Build a PyArrow buffer from the passed mask buffer.
+
+ Parameters
+ ----------
+ validity_buff : BufferObject
+ The underlying validity (mask) buffer.
+ validity_dtype : Dtype
+ Dtype description as a tuple ``(kind, bit-width, format string,
+ endianness)``.
+ describe_null : ColumnNullType
+ Null representation the column dtype uses,
+ as a tuple ``(kind, value)``
+ length : int
+ The number of values in the array.
+ offset : int, default: 0
+ Number of elements to offset from the start of the buffer.
+ allow_copy : bool, default: True
+ Whether to allow copying the memory to perform the conversion
+ (if false then zero-copy approach is requested).
+
+ Returns
+ -------
+ pa.Buffer
+ """
+ null_kind, sentinel_val = describe_null
+ validity_kind, _, _, _ = validity_dtype
+ assert validity_kind == DtypeKind.BOOL
+
+ if null_kind == ColumnNullType.NON_NULLABLE:
+ # A sliced array can have a NON_NULLABLE ColumnNullType because
+ # there are no missing values in that slice even though the bitmask
+ # exists; the validity buffer must be set to None in this case
+ return None
+
+ elif null_kind == ColumnNullType.USE_BYTEMASK or (
+ null_kind == ColumnNullType.USE_BITMASK and sentinel_val == 1
+ ):
+ buff = pa.foreign_buffer(validity_buff.ptr,
+ validity_buff.bufsize,
+ base=validity_buff)
+
+ if null_kind == ColumnNullType.USE_BYTEMASK:
+ if not allow_copy:
+ raise RuntimeError(
+ "To create a bitmask a copy of the data is "
+ "required which is forbidden by allow_copy=False"
+ )
+ mask = pa.Array.from_buffers(pa.int8(), length,
+ [None, buff],
+ offset=offset)
+ mask_bool = pc.cast(mask, pa.bool_())
+ else:
+ mask_bool = pa.Array.from_buffers(pa.bool_(), length,
+ [None, buff],
+ offset=offset)
+
+ if sentinel_val == 1:
+ mask_bool = pc.invert(mask_bool)
+
+ return mask_bool.buffers()[1]
+
+ elif null_kind == ColumnNullType.USE_BITMASK and sentinel_val == 0:
+ return pa.foreign_buffer(validity_buff.ptr,
+ validity_buff.bufsize,
+ base=validity_buff)
+ else:
+ raise NotImplementedError(
+ f"{describe_null} null representation is not yet supported.")
+
+
+def validity_buffer_nan_sentinel(
+ data_pa_buffer: BufferObject,
+ data_type: Dtype,
+ describe_null: ColumnNullType,
+ length: int,
+ offset: int = 0,
+ allow_copy: bool = True,
+) -> pa.Buffer:
+ """
+ Build a PyArrow buffer from NaN or sentinel values.
+
+ Parameters
+ ----------
+ data_pa_buffer : pa.Buffer
+ PyArrow buffer for the column data.
+ data_type : Dtype
+ Dtype description as a tuple ``(kind, bit-width, format string,
+ endianness)``.
+ describe_null : ColumnNullType
+ Null representation the column dtype uses,
+ as a tuple ``(kind, value)``
+ length : int
+ The number of values in the array.
+ offset : int, default: 0
+ Number of elements to offset from the start of the buffer.
+ allow_copy : bool, default: True
+ Whether to allow copying the memory to perform the conversion
+ (if false then zero-copy approach is requested).
+
+ Returns
+ -------
+ pa.Buffer
+ """
+ kind, bit_width, _, _ = data_type
+ data_dtype = map_date_type(data_type)
+ null_kind, sentinel_val = describe_null
+
+ # Check for float NaN values
+ if null_kind == ColumnNullType.USE_NAN:
+ if not allow_copy:
+ raise RuntimeError(
+ "To create a bitmask a copy of the data is "
+ "required which is forbidden by allow_copy=False"
+ )
+
+ if kind == DtypeKind.FLOAT and bit_width == 16:
+ # 'pyarrow.compute.is_nan' kernel not yet implemented
+ # for float16
+ raise NotImplementedError(
+ f"{data_type} with {null_kind} is not yet supported.")
+ else:
+ pyarrow_data = pa.Array.from_buffers(
+ data_dtype,
+ length,
+ [None, data_pa_buffer],
+ offset=offset,
+ )
+ mask = pc.is_nan(pyarrow_data)
+ mask = pc.invert(mask)
+ return mask.buffers()[1]
+
+ # Check for sentinel values
+ elif null_kind == ColumnNullType.USE_SENTINEL:
+ if not allow_copy:
+ raise RuntimeError(
+ "To create a bitmask a copy of the data is "
+ "required which is forbidden by allow_copy=False"
+ )
+
+ if kind == DtypeKind.DATETIME:
+ sentinel_dtype = pa.int64()
+ else:
+ sentinel_dtype = data_dtype
+ pyarrow_data = pa.Array.from_buffers(sentinel_dtype,
+ length,
+ [None, data_pa_buffer],
+ offset=offset)
+ sentinel_arr = pc.equal(pyarrow_data, sentinel_val)
+ mask_bool = pc.invert(sentinel_arr)
+ return mask_bool.buffers()[1]
+
+ elif null_kind == ColumnNullType.NON_NULLABLE:
+ pass
+ else:
+ raise NotImplementedError(
+ f"{describe_null} null representation is not yet supported.")
diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi
index c97ad027dd..35492d4f64 100644
--- a/python/pyarrow/table.pxi
+++ b/python/pyarrow/table.pxi
@@ -2861,6 +2861,36 @@ cdef class Table(_PandasConvertible):
return self.column(key)
+ # ----------------------------------------------------------------------
+ def __dataframe__(self, nan_as_null: bool = False, allow_copy: bool = True):
+ """
+ Return the dataframe interchange object implementing the interchange protocol.
+ Parameters
+ ----------
+ nan_as_null : bool, default False
+ Whether to tell the DataFrame to overwrite null values in the data
+ with ``NaN`` (or ``NaT``).
+ allow_copy : bool, default True
+ Whether to allow memory copying when exporting. If set to False
+ it would cause non-zero-copy exports to fail.
+ Returns
+ -------
+ DataFrame interchange object
+ The object which consuming library can use to ingress the dataframe.
+ Notes
+ -----
+ Details on the interchange protocol:
+ https://data-apis.org/dataframe-protocol/latest/index.html
+ `nan_as_null` currently has no effect; once support for nullable extension
+ dtypes is added, this value should be propagated to columns.
+ """
+
+ from pyarrow.interchange.dataframe import _PyArrowDataFrame
+
+ return _PyArrowDataFrame(self, nan_as_null, allow_copy)
+
+ # ----------------------------------------------------------------------
+
def slice(self, offset=0, length=None):
"""
Compute zero-copy slice of this Table.
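One behavior worth noting for the new method: with allow_copy=False, exporting a column stored as multiple chunks fails, because combining chunks requires a copy (see _PyArrowColumn.__init__ above). A sketch with hypothetical data:

    import pyarrow as pa

    table = pa.table({"a": pa.chunked_array([[1, 2], [3, 4]])})
    exchange = table.__dataframe__(allow_copy=False)
    try:
        exchange.get_column_by_name("a")  # combining chunks would copy
    except RuntimeError as exc:
        print(exc)  # a copy is required but forbidden by allow_copy=False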
diff --git a/python/pyarrow/tests/interchange/__init__.py b/python/pyarrow/tests/interchange/__init__.py
new file mode 100644
index 0000000000..13a83393a9
--- /dev/null
+++ b/python/pyarrow/tests/interchange/__init__.py
@@ -0,0 +1,16 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
diff --git a/python/pyarrow/tests/interchange/test_conversion.py b/python/pyarrow/tests/interchange/test_conversion.py
new file mode 100644
index 0000000000..0680d9c4ec
--- /dev/null
+++ b/python/pyarrow/tests/interchange/test_conversion.py
@@ -0,0 +1,524 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from datetime import datetime as dt
+import numpy as np
+import pyarrow as pa
+from pyarrow.vendored.version import Version
+import pytest
+
+import pyarrow.interchange as pi
+from pyarrow.interchange.column import (
+ _PyArrowColumn,
+ ColumnNullType,
+ DtypeKind,
+)
+from pyarrow.interchange.from_dataframe import _from_dataframe
+
+try:
+ import pandas as pd
+ # import pandas.testing as tm
+except ImportError:
+ pass
+
+
+@pytest.mark.parametrize("unit", ['s', 'ms', 'us', 'ns'])
+@pytest.mark.parametrize("tz", ['', 'America/New_York', '+07:30', '-04:30'])
+def test_datetime(unit, tz):
+ dt_arr = [dt(2007, 7, 13), dt(2007, 7, 14), None]
+ table = pa.table({"A": pa.array(dt_arr, type=pa.timestamp(unit, tz=tz))})
+ col = table.__dataframe__().get_column_by_name("A")
+
+ assert col.size() == 3
+ assert col.offset == 0
+ assert col.null_count == 1
+ assert col.dtype[0] == DtypeKind.DATETIME
+ assert col.describe_null == (ColumnNullType.USE_BITMASK, 0)
+
+
+@pytest.mark.parametrize(
+ ["test_data", "kind"],
+ [
+ (["foo", "bar"], 21),
+ ([1.5, 2.5, 3.5], 2),
+ ([1, 2, 3, 4], 0),
+ ],
+)
+def test_array_to_pyarrowcolumn(test_data, kind):
+ arr = pa.array(test_data)
+ arr_column = _PyArrowColumn(arr)
+
+ assert arr_column._col == arr
+ assert arr_column.size() == len(test_data)
+ assert arr_column.dtype[0] == kind
+ assert arr_column.num_chunks() == 1
+ assert arr_column.null_count == 0
+ assert arr_column.get_buffers()["validity"] is None
+ assert len(list(arr_column.get_chunks())) == 1
+
+ for chunk in arr_column.get_chunks():
+ assert chunk == arr_column
+
+
+def test_offset_of_sliced_array():
+ arr = pa.array([1, 2, 3, 4])
+ arr_sliced = arr.slice(2, 2)
+
+ table = pa.table([arr], names=["arr"])
+ table_sliced = pa.table([arr_sliced], names=["arr_sliced"])
+
+ col = table_sliced.__dataframe__().get_column(0)
+ assert col.offset == 2
+
+ result = _from_dataframe(table_sliced.__dataframe__())
+ assert table_sliced.equals(result)
+ assert not table.equals(result)
+
+ # pandas hardcodes offset to 0:
+ # https://github.com/pandas-dev/pandas/blob/5c66e65d7b9fef47ccb585ce2fd0b3ea18dc82ea/pandas/core/interchange/from_dataframe.py#L247
+ # so conversion to pandas can't be tested currently
+
+ # df = pandas_from_dataframe(table)
+ # df_sliced = pandas_from_dataframe(table_sliced)
+
+ # tm.assert_series_equal(df["arr"][2:4], df_sliced["arr_sliced"],
+ # check_index=False, check_names=False)
+
+
+# Currently errors during string conversion
+# as col.size is called as a property, not a method, in pandas
+# see L255-L257 in pandas/core/interchange/from_dataframe.py
+@pytest.mark.pandas
+def test_categorical_roundtrip():
+ pytest.skip("Bug in pandas implementation")
+
+ if Version(pd.__version__) < Version("1.5.0"):
+ pytest.skip("__dataframe__ added to pandas in 1.5.0")
+ arr = ["Mon", "Tue", "Mon", "Wed", "Mon", "Thu", "Fri", "Sat", "Sun"]
+ table = pa.table(
+ {"weekday": pa.array(arr).dictionary_encode()}
+ )
+
+ pandas_df = table.to_pandas()
+ result = pi.from_dataframe(pandas_df)
+
+ # Checking equality for the values
+ # As the dtype of the indices is changed from int32 in pa.Table
+ # to int64 in pandas interchange protocol implementation
+ assert result[0].chunk(0).dictionary == table[0].chunk(0).dictionary
+
+ table_protocol = table.__dataframe__()
+ result_protocol = result.__dataframe__()
+
+ assert table_protocol.num_columns() == result_protocol.num_columns()
+ assert table_protocol.num_rows() == result_protocol.num_rows()
+ assert table_protocol.num_chunks() == result_protocol.num_chunks()
+ assert table_protocol.column_names() == result_protocol.column_names()
+
+ col_table = table_protocol.get_column(0)
+ col_result = result_protocol.get_column(0)
+
+ assert col_result.dtype[0] == DtypeKind.CATEGORICAL
+ assert col_result.dtype[0] == col_table.dtype[0]
+ assert col_result.size() == col_table.size()
+ assert col_result.offset == col_table.offset
+
+ desc_cat_table = col_table.describe_categorical
+ desc_cat_result = col_result.describe_categorical
+
+ assert desc_cat_table["is_ordered"] == desc_cat_result["is_ordered"]
+ assert desc_cat_table["is_dictionary"] == desc_cat_result["is_dictionary"]
+ assert isinstance(desc_cat_result["categories"]._col, pa.Array)
+
+
+@pytest.mark.pandas
+@pytest.mark.parametrize(
+ "uint", [pa.uint8(), pa.uint16(), pa.uint32()]
+)
+@pytest.mark.parametrize(
+ "int", [pa.int8(), pa.int16(), pa.int32(), pa.int64()]
+)
+@pytest.mark.parametrize(
+ "float, np_float", [
+ # (pa.float16(), np.float16), #not supported by pandas
+ (pa.float32(), np.float32),
+ (pa.float64(), np.float64)
+ ]
+)
+def test_pandas_roundtrip(uint, int, float, np_float):
+ if Version(pd.__version__) < Version("1.5.0"):
+ pytest.skip("__dataframe__ added to pandas in 1.5.0")
+
+ arr = [1, 2, 3]
+ table = pa.table(
+ {
+ "a": pa.array(arr, type=uint),
+ "b": pa.array(arr, type=int),
+ "c": pa.array(np.array(arr, dtype=np_float), type=float),
+ }
+ )
+ from pandas.api.interchange import (
+ from_dataframe as pandas_from_dataframe
+ )
+ pandas_df = pandas_from_dataframe(table)
+ result = pi.from_dataframe(pandas_df)
+ assert table.equals(result)
+
+ table_protocol = table.__dataframe__()
+ result_protocol = result.__dataframe__()
+
+ assert table_protocol.num_columns() == result_protocol.num_columns()
+ assert table_protocol.num_rows() == result_protocol.num_rows()
+ assert table_protocol.num_chunks() == result_protocol.num_chunks()
+ assert table_protocol.column_names() == result_protocol.column_names()
+
+
+@pytest.mark.pandas
+def test_roundtrip_pandas_string():
+ # See https://github.com/pandas-dev/pandas/issues/50554
+ if Version(pd.__version__) < Version("1.6"):
+ pytest.skip(" Column.size() called as a method in pandas 2.0.0")
+
+ # large string is not supported by pandas implementation
+ table = pa.table({"a": pa.array(["a", "", "c"])})
+
+ from pandas.api.interchange import (
+ from_dataframe as pandas_from_dataframe
+ )
+ pandas_df = pandas_from_dataframe(table)
+ result = pi.from_dataframe(pandas_df)
+
+ assert result[0].to_pylist() == table[0].to_pylist()
+ assert pa.types.is_string(table[0].type)
+ assert pa.types.is_large_string(result[0].type)
+
+ table_protocol = table.__dataframe__()
+ result_protocol = result.__dataframe__()
+
+ assert table_protocol.num_columns() == result_protocol.num_columns()
+ assert table_protocol.num_rows() == result_protocol.num_rows()
+ assert table_protocol.num_chunks() == result_protocol.num_chunks()
+ assert table_protocol.column_names() == result_protocol.column_names()
+
+
+@pytest.mark.pandas
+def test_roundtrip_pandas_boolean():
+ if Version(pd.__version__) < Version("1.5.0"):
+ pytest.skip("__dataframe__ added to pandas in 1.5.0")
+
+ table = pa.table({"a": [True, False, True]})
+
+ from pandas.api.interchange import (
+ from_dataframe as pandas_from_dataframe
+ )
+ pandas_df = pandas_from_dataframe(table)
+ result = pi.from_dataframe(pandas_df)
+
+ assert table.equals(result)
+
+ table_protocol = table.__dataframe__()
+ result_protocol = result.__dataframe__()
+
+ assert table_protocol.num_columns() == result_protocol.num_columns()
+ assert table_protocol.num_rows() == result_protocol.num_rows()
+ assert table_protocol.num_chunks() == result_protocol.num_chunks()
+ assert table_protocol.column_names() == result_protocol.column_names()
+
+
+@pytest.mark.pandas
+@pytest.mark.parametrize("unit", ['s', 'ms', 'us', 'ns'])
+def test_roundtrip_pandas_datetime(unit):
+ if Version(pd.__version__) < Version("1.5.0"):
+ pytest.skip("__dataframe__ added to pandas in 1.5.0")
+ from datetime import datetime as dt
+
+ # timezones not included as they are not yet supported in
+ # the pandas implementation
+ dt_arr = [dt(2007, 7, 13), dt(2007, 7, 14), dt(2007, 7, 15)]
+ table = pa.table({"a": pa.array(dt_arr, type=pa.timestamp(unit))})
+
+ if Version(pd.__version__) < Version("1.6"):
+ # pandas < 2.0 always creates datetime64 in "ns"
+ # resolution
+ expected = pa.table({"a": pa.array(dt_arr, type=pa.timestamp('ns'))})
+ else:
+ expected = table
+
+ from pandas.api.interchange import (
+ from_dataframe as pandas_from_dataframe
+ )
+ pandas_df = pandas_from_dataframe(table)
+ result = pi.from_dataframe(pandas_df)
+
+ assert expected.equals(result)
+
+ expected_protocol = expected.__dataframe__()
+ result_protocol = result.__dataframe__()
+
+ assert expected_protocol.num_columns() == result_protocol.num_columns()
+ assert expected_protocol.num_rows() == result_protocol.num_rows()
+ assert expected_protocol.num_chunks() == result_protocol.num_chunks()
+ assert expected_protocol.column_names() == result_protocol.column_names()
+
+
+@pytest.mark.large_memory
+@pytest.mark.pandas
+def test_pandas_assertion_error_large_string():
+    # Test that an AssertionError is raised, as pandas does not support
+    # "U" type strings
+ if Version(pd.__version__) < Version("1.5.0"):
+ pytest.skip("__dataframe__ added to pandas in 1.5.0")
+
+    data = np.array([b'x'*1024]*(3*1024**2), dtype='object')  # 3 GB of data
+ arr = pa.array(data, type=pa.large_string())
+ table = pa.table([arr], names=["large_string"])
+
+ from pandas.api.interchange import (
+ from_dataframe as pandas_from_dataframe
+ )
+
+ with pytest.raises(AssertionError):
+ pandas_from_dataframe(table)
+
+
+@pytest.mark.pandas
+@pytest.mark.parametrize(
+ "np_float", [np.float32, np.float64]
+)
+def test_pandas_to_pyarrow_with_missing(np_float):
+ if Version(pd.__version__) < Version("1.5.0"):
+ pytest.skip("__dataframe__ added to pandas in 1.5.0")
+
+ np_array = np.array([0, np.nan, 2], dtype=np_float)
+ datetime_array = [None, dt(2007, 7, 14), dt(2007, 7, 15)]
+ df = pd.DataFrame({
+ "a": np_array, # float, ColumnNullType.USE_NAN
+ "dt": datetime_array # ColumnNullType.USE_SENTINEL
+ })
+ expected = pa.table({
+ "a": pa.array(np_array, from_pandas=True),
+ "dt": pa.array(datetime_array, type=pa.timestamp("ns"))
+ })
+ result = pi.from_dataframe(df)
+
+ assert result.equals(expected)
+
+
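+# An illustrative sketch (not part of the test suite): how a column encodes
+# missing values can be queried through ``describe_null``, which is what
+# distinguishes the USE_NAN and USE_SENTINEL cases noted above.
+def _describe_null_sketch():
+    df = pd.DataFrame({"a": np.array([0.0, np.nan, 2.0])})
+    col = df.__dataframe__().get_column_by_name("a")
+    # For a pandas float column this is expected to be
+    # (ColumnNullType.USE_NAN, None)
+    return col.describe_null
+
+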
+@pytest.mark.pandas
+def test_pandas_to_pyarrow_float16_with_missing():
+ if Version(pd.__version__) < Version("1.5.0"):
+ pytest.skip("__dataframe__ added to pandas in 1.5.0")
+
+    # np.float16 errors if pc.is_nan (pyarrow.compute.is_nan) is used:
+    # pyarrow.lib.ArrowNotImplementedError: Function 'is_nan' has no kernel
+    # matching input types (halffloat)
+ np_array = np.array([0, np.nan, 2], dtype=np.float16)
+ df = pd.DataFrame({"a": np_array})
+
+ with pytest.raises(NotImplementedError):
+ pi.from_dataframe(df)
+
+
+@pytest.mark.pandas
+def test_pandas_to_pyarrow_string_with_missing():
+ if Version(pd.__version__) < Version("1.5.0"):
+ pytest.skip("__dataframe__ added to pandas in 1.5.0")
+
+    # pandas uses int64 offsets for its string dtype, so the constructed
+    # pyarrow string column will always be of large_string data type
+    arr = {
+        "Y": ["a", "b", None],  # string, ColumnNullType.USE_BYTEMASK
+    }
+ df = pd.DataFrame(arr)
+ expected = pa.table(arr)
+ result = pi.from_dataframe(df)
+
+ assert result[0].to_pylist() == expected[0].to_pylist()
+ assert pa.types.is_string(expected[0].type)
+ assert pa.types.is_large_string(result[0].type)
+
+
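+# An illustrative sketch (not part of the test suite): a consumer that
+# needs 32-bit string offsets can cast the resulting large_string column
+# back to string, at the cost of an extra copy.
+def _downcast_large_string_sketch(result):
+    return result.set_column(0, "Y", result["Y"].cast(pa.string()))
+
+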
+@pytest.mark.pandas
+def test_pandas_to_pyarrow_categorical_with_missing():
+ if Version(pd.__version__) < Version("1.5.0"):
+ pytest.skip("__dataframe__ added to pandas in 1.5.0")
+
+ arr = ["Mon", "Tue", "Mon", "Wed", "Mon", "Thu", "Fri", "Sat", None]
+ df = pd.DataFrame(
+ {"weekday": arr}
+ )
+ df = df.astype("category")
+ result = pi.from_dataframe(df)
+
+ expected_dictionary = ["Fri", "Mon", "Sat", "Thu", "Tue", "Wed"]
+ expected_indices = pa.array([1, 4, 1, 5, 1, 3, 0, 2, None], type=pa.int8())
+
+ assert result[0].to_pylist() == arr
+ assert result[0].chunk(0).dictionary.to_pylist() == expected_dictionary
+ assert result[0].chunk(0).indices.equals(expected_indices)
+
+
+@pytest.mark.parametrize(
+ "uint", [pa.uint8(), pa.uint16(), pa.uint32()]
+)
+@pytest.mark.parametrize(
+ "int", [pa.int8(), pa.int16(), pa.int32(), pa.int64()]
+)
+@pytest.mark.parametrize(
+ "float, np_float", [
+ (pa.float16(), np.float16),
+ (pa.float32(), np.float32),
+ (pa.float64(), np.float64)
+ ]
+)
+@pytest.mark.parametrize("unit", ['s', 'ms', 'us', 'ns'])
+@pytest.mark.parametrize("tz", ['America/New_York', '+07:30', '-04:30'])
+@pytest.mark.parametrize("offset, length", [(0, 3), (0, 2), (1, 2), (2, 1)])
+def test_pyarrow_roundtrip(uint, int, float, np_float,
+ unit, tz, offset, length):
+
+ from datetime import datetime as dt
+ arr = [1, 2, None]
+ dt_arr = [dt(2007, 7, 13), None, dt(2007, 7, 15)]
+
+ table = pa.table(
+ {
+ "a": pa.array(arr, type=uint),
+ "b": pa.array(arr, type=int),
+ "c": pa.array(np.array(arr, dtype=np_float),
+ type=float, from_pandas=True),
+ "d": [True, False, True],
+ "e": [True, False, None],
+ "f": ["a", None, "c"],
+ "g": pa.array(dt_arr, type=pa.timestamp(unit, tz=tz))
+ }
+ )
+ table = table.slice(offset, length)
+ result = _from_dataframe(table.__dataframe__())
+
+ assert table.equals(result)
+
+ table_protocol = table.__dataframe__()
+ result_protocol = result.__dataframe__()
+
+ assert table_protocol.num_columns() == result_protocol.num_columns()
+ assert table_protocol.num_rows() == result_protocol.num_rows()
+ assert table_protocol.num_chunks() == result_protocol.num_chunks()
+ assert table_protocol.column_names() == result_protocol.column_names()
+
+
+@pytest.mark.parametrize("offset, length", [(0, 10), (0, 2), (7, 3), (2, 1)])
+def test_pyarrow_roundtrip_categorical(offset, length):
+ arr = ["Mon", "Tue", "Mon", "Wed", "Mon", "Thu", "Fri", None, "Sun"]
+ table = pa.table(
+ {"weekday": pa.array(arr).dictionary_encode()}
+ )
+ table = table.slice(offset, length)
+ result = _from_dataframe(table.__dataframe__())
+
+ assert table.equals(result)
+
+ table_protocol = table.__dataframe__()
+ result_protocol = result.__dataframe__()
+
+ assert table_protocol.num_columns() == result_protocol.num_columns()
+ assert table_protocol.num_rows() == result_protocol.num_rows()
+ assert table_protocol.num_chunks() == result_protocol.num_chunks()
+ assert table_protocol.column_names() == result_protocol.column_names()
+
+ col_table = table_protocol.get_column(0)
+ col_result = result_protocol.get_column(0)
+
+ assert col_result.dtype[0] == DtypeKind.CATEGORICAL
+ assert col_result.dtype[0] == col_table.dtype[0]
+ assert col_result.size() == col_table.size()
+ assert col_result.offset == col_table.offset
+
+    desc_cat_table = col_table.describe_categorical
+    desc_cat_result = col_result.describe_categorical
+
+ assert desc_cat_table["is_ordered"] == desc_cat_result["is_ordered"]
+ assert desc_cat_table["is_dictionary"] == desc_cat_result["is_dictionary"]
+ assert isinstance(desc_cat_result["categories"]._col, pa.Array)
+
+
+@pytest.mark.large_memory
+def test_pyarrow_roundtrip_large_string():
+
+    data = np.array([b'x'*1024]*(3*1024**2), dtype='object')  # 3 GB of data
+ arr = pa.array(data, type=pa.large_string())
+ table = pa.table([arr], names=["large_string"])
+
+ result = _from_dataframe(table.__dataframe__())
+ col = result.__dataframe__().get_column(0)
+
+ assert col.size() == 3*1024**2
+ assert pa.types.is_large_string(table[0].type)
+ assert pa.types.is_large_string(result[0].type)
+
+ assert table.equals(result)
+
+
+def test_nan_as_null():
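+    # pyarrow encodes missing values with validity bitmaps and does not
+    # support converting NaNs to nulls on export, so nan_as_null=True
+    # is expected to raise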
+ table = pa.table({"a": [1, 2, 3, 4]})
+ with pytest.raises(RuntimeError):
+ table.__dataframe__(nan_as_null=True)
+
+
+@pytest.mark.pandas
+def test_allow_copy_false():
+ if Version(pd.__version__) < Version("1.5.0"):
+ pytest.skip("__dataframe__ added to pandas in 1.5.0")
+
+ # Test that an error is raised when a copy is needed
+ # to create a bitmask
+
+ df = pd.DataFrame({"a": [0, 1.0, 2.0]})
+ with pytest.raises(RuntimeError):
+ pi.from_dataframe(df, allow_copy=False)
+
+ df = pd.DataFrame({
+ "dt": [None, dt(2007, 7, 14), dt(2007, 7, 15)]
+ })
+ with pytest.raises(RuntimeError):
+ pi.from_dataframe(df, allow_copy=False)
+
+
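+# An illustrative sketch (not part of the test suite): with the default
+# allow_copy=True the same conversions succeed, because a validity bitmap
+# may then be materialized from the NaN / sentinel representation.
+def _allow_copy_sketch():
+    df = pd.DataFrame({"a": [0, 1.0, 2.0]})
+    return pi.from_dataframe(df, allow_copy=True)
+
+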
+@pytest.mark.pandas
+def test_allow_copy_false_bool_categorical():
+ if Version(pd.__version__) < Version("1.5.0"):
+ pytest.skip("__dataframe__ added to pandas in 1.5.0")
+
+    # Test that an error is raised for boolean and categorical
+    # dtypes (a copy is always made)
+
+ df = pd.DataFrame({"a": [None, False, True]})
+ with pytest.raises(RuntimeError):
+ pi.from_dataframe(df, allow_copy=False)
+
+ df = pd.DataFrame({"a": [True, False, True]})
+ with pytest.raises(RuntimeError):
+ pi.from_dataframe(df, allow_copy=False)
+
+ df = pd.DataFrame({"weekday": ["a", "b", None]})
+ df = df.astype("category")
+ with pytest.raises(RuntimeError):
+ pi.from_dataframe(df, allow_copy=False)
+
+ df = pd.DataFrame({"weekday": ["a", "b", "c"]})
+ df = df.astype("category")
+ with pytest.raises(RuntimeError):
+ pi.from_dataframe(df, allow_copy=False)
diff --git a/python/pyarrow/tests/interchange/test_interchange_spec.py b/python/pyarrow/tests/interchange/test_interchange_spec.py
new file mode 100644
index 0000000000..42ec805359
--- /dev/null
+++ b/python/pyarrow/tests/interchange/test_interchange_spec.py
@@ -0,0 +1,243 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import ctypes
+import hypothesis as h
+import hypothesis.strategies as st
+
+import numpy as np
+import pyarrow as pa
+import pyarrow.tests.strategies as past
+import pytest
+
+
+all_types = st.deferred(
+ lambda: (
+ past.signed_integer_types |
+ past.unsigned_integer_types |
+ past.floating_types |
+ past.bool_type |
+ past.string_type |
+ past.large_string_type
+ )
+)
+
+
+# datetime is tested in test_extra.py
+# dictionary is tested in test_categorical()
+@h.given(past.arrays(all_types, size=3))
+def test_dtypes(arr):
+ table = pa.table([arr], names=["a"])
+ df = table.__dataframe__()
+
+ null_count = df.get_column(0).null_count
+ assert null_count == arr.null_count
+ assert isinstance(null_count, int)
+ assert df.get_column(0).size() == 3
+ assert df.get_column(0).offset == 0
+
+
+@pytest.mark.parametrize(
+ "uint, uint_bw",
+ [
+ (pa.uint8(), 8),
+ (pa.uint16(), 16),
+ (pa.uint32(), 32)
+ ]
+)
+@pytest.mark.parametrize(
+ "int, int_bw", [
+ (pa.int8(), 8),
+ (pa.int16(), 16),
+ (pa.int32(), 32),
+ (pa.int64(), 64)
+ ]
+)
+@pytest.mark.parametrize(
+ "float, float_bw, np_float", [
+ (pa.float16(), 16, np.float16),
+ (pa.float32(), 32, np.float32),
+ (pa.float64(), 64, np.float64)
+ ]
+)
+@pytest.mark.parametrize("unit", ['s', 'ms', 'us', 'ns'])
+@pytest.mark.parametrize("tz", ['', 'America/New_York', '+07:30', '-04:30'])
+def test_mixed_dtypes(uint, uint_bw, int, int_bw,
+ float, float_bw, np_float, unit, tz):
+ from datetime import datetime as dt
+ arr = [1, 2, 3]
+ dt_arr = [dt(2007, 7, 13), dt(2007, 7, 14), dt(2007, 7, 15)]
+ table = pa.table(
+ {
+ "a": pa.array(arr, type=uint),
+ "b": pa.array(arr, type=int),
+ "c": pa.array(np.array(arr, dtype=np_float), type=float),
+ "d": [True, False, True],
+ "e": ["a", "", "c"],
+ "f": pa.array(dt_arr, type=pa.timestamp(unit, tz=tz))
+ }
+ )
+ df = table.__dataframe__()
+ # 0 = DtypeKind.INT, 1 = DtypeKind.UINT, 2 = DtypeKind.FLOAT,
+ # 20 = DtypeKind.BOOL, 21 = DtypeKind.STRING, 22 = DtypeKind.DATETIME
+ # see DtypeKind class in column.py
+ columns = {"a": 1, "b": 0, "c": 2, "d": 20, "e": 21, "f": 22}
+
+ for column, kind in columns.items():
+ col = df.get_column_by_name(column)
+
+ assert col.null_count == 0
+ assert col.size() == 3
+ assert col.offset == 0
+ assert col.dtype[0] == kind
+
+ assert df.get_column_by_name("a").dtype[1] == uint_bw
+ assert df.get_column_by_name("b").dtype[1] == int_bw
+ assert df.get_column_by_name("c").dtype[1] == float_bw
+
+
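+# An illustrative sketch (not part of the test suite): the integer codes
+# used above come from the DtypeKind enum in pyarrow/interchange/column.py,
+# so the same checks can be written symbolically instead of with magic
+# numbers.
+def _dtype_kind_sketch():
+    from pyarrow.interchange.column import DtypeKind
+    df = pa.table({"a": pa.array([1, 2, 3], type=pa.uint8())}).__dataframe__()
+    assert df.get_column_by_name("a").dtype[0] == DtypeKind.UINT
+
+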
+def test_na_float():
+ table = pa.table({"a": [1.0, None, 2.0]})
+ df = table.__dataframe__()
+ col = df.get_column_by_name("a")
+ assert col.null_count == 1
+ assert isinstance(col.null_count, int)
+
+
+def test_noncategorical():
+ table = pa.table({"a": [1, 2, 3]})
+ df = table.__dataframe__()
+ col = df.get_column_by_name("a")
+ with pytest.raises(TypeError, match=".*categorical.*"):
+ col.describe_categorical
+
+
+def test_categorical():
+ arr = ["Mon", "Tue", "Mon", "Wed", "Mon", "Thu", "Fri", "Sat", None]
+ table = pa.table(
+ {"weekday": pa.array(arr).dictionary_encode()}
+ )
+
+ col = table.__dataframe__().get_column_by_name("weekday")
+ categorical = col.describe_categorical
+ assert isinstance(categorical["is_ordered"], bool)
+ assert isinstance(categorical["is_dictionary"], bool)
+
+
+def test_dataframe():
+ n = pa.chunked_array([[2, 2, 4], [4, 5, 100]])
+ a = pa.chunked_array([["Flamingo", "Parrot", "Cow"],
+ ["Horse", "Brittle stars", "Centipede"]])
+ table = pa.table([n, a], names=['n_legs', 'animals'])
+ df = table.__dataframe__()
+
+ assert df.num_columns() == 2
+ assert df.num_rows() == 6
+ assert df.num_chunks() == 2
+ assert list(df.column_names()) == ['n_legs', 'animals']
+ assert list(df.select_columns((1,)).column_names()) == list(
+ df.select_columns_by_name(("animals",)).column_names()
+ )
+
+
+@pytest.mark.parametrize(["size", "n_chunks"], [(10, 3), (12, 3), (12, 5)])
+def test_df_get_chunks(size, n_chunks):
+ table = pa.table({"x": list(range(size))})
+ df = table.__dataframe__()
+ chunks = list(df.get_chunks(n_chunks))
+ assert len(chunks) == n_chunks
+ assert sum(chunk.num_rows() for chunk in chunks) == size
+
+
+@pytest.mark.parametrize(["size", "n_chunks"], [(10, 3), (12, 3), (12, 5)])
+def test_column_get_chunks(size, n_chunks):
+ table = pa.table({"x": list(range(size))})
+ df = table.__dataframe__()
+ chunks = list(df.get_column(0).get_chunks(n_chunks))
+ assert len(chunks) == n_chunks
+ assert sum(chunk.size() for chunk in chunks) == size
+
+
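+# An illustrative sketch (not part of the test suite): requesting more
+# chunks than the data naturally has makes the producer subdivide its
+# chunks, so a consumer can always stream a column in its preferred
+# number of pieces.
+def _get_chunks_sketch():
+    df = pa.table({"x": list(range(10))}).__dataframe__()
+    # Three pieces that together cover all ten rows
+    return [chunk.num_rows() for chunk in df.get_chunks(3)]
+
+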
+@pytest.mark.pandas
+@pytest.mark.parametrize(
+ "uint", [pa.uint8(), pa.uint16(), pa.uint32()]
+)
+@pytest.mark.parametrize(
+ "int", [pa.int8(), pa.int16(), pa.int32(), pa.int64()]
+)
+@pytest.mark.parametrize(
+ "float, np_float", [
+ (pa.float16(), np.float16),
+ (pa.float32(), np.float32),
+ (pa.float64(), np.float64)
+ ]
+)
+def test_get_columns(uint, int, float, np_float):
+ arr = [[1, 2, 3], [4, 5]]
+ arr_float = np.array([1, 2, 3, 4, 5], dtype=np_float)
+ table = pa.table(
+ {
+ "a": pa.chunked_array(arr, type=uint),
+ "b": pa.chunked_array(arr, type=int),
+ "c": pa.array(arr_float, type=float)
+ }
+ )
+ df = table.__dataframe__()
+ for col in df.get_columns():
+ assert col.size() == 5
+ assert col.num_chunks() == 1
+
+ # 0 = DtypeKind.INT, 1 = DtypeKind.UINT, 2 = DtypeKind.FLOAT,
+ # see DtypeKind class in column.py
+ assert df.get_column(0).dtype[0] == 1 # UINT
+ assert df.get_column(1).dtype[0] == 0 # INT
+ assert df.get_column(2).dtype[0] == 2 # FLOAT
+
+
+@pytest.mark.parametrize(
+ "int", [pa.int8(), pa.int16(), pa.int32(), pa.int64()]
+)
+def test_buffer(int):
+    arr = [0, 1, -1]
+    table = pa.table({"a": pa.array(arr, type=int)})
+    df = table.__dataframe__()
+    col = df.get_column(0)
+    buf = col.get_buffers()
+
+    data_buf, data_dtype = buf["data"]
+
+    assert data_buf.bufsize > 0
+    assert data_buf.ptr != 0
+    device, _ = data_buf.__dlpack_device__()
+
+    # 0 = DtypeKind.INT
+    # see DtypeKind class in column.py
+    assert data_dtype[0] == 0
+
+    if device == 1:  # CPU-only as we're going to directly read memory here
+        bitwidth = data_dtype[1]
+        ctype = {
+            8: ctypes.c_int8,
+            16: ctypes.c_int16,
+            32: ctypes.c_int32,
+            64: ctypes.c_int64,
+        }[bitwidth]
+
+        for idx, truth in enumerate(arr):
+            val = ctype.from_address(
+                data_buf.ptr + idx * (bitwidth // 8)
+            ).value
+            assert val == truth, f"Buffer at index {idx} mismatch"
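+
+
+# An illustrative sketch (not part of the test suite): the exported pointer
+# can also be rewrapped zero-copy as a pyarrow Buffer, with the ``base``
+# argument keeping the producer's buffer alive.
+def _rewrap_buffer_sketch():
+    table = pa.table({"a": pa.array([0, 1, -1], type=pa.int64())})
+    data_buf, _ = table.__dataframe__().get_column(0).get_buffers()["data"]
+    buf = pa.foreign_buffer(data_buf.ptr, data_buf.bufsize, base=data_buf)
+    # Rebuild an int64 array over the same memory; the validity buffer is
+    # None because there are no nulls
+    return pa.Array.from_buffers(pa.int64(), 3, [None, buf])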